### Depuración de los datos extraidos de la API

In [2]:
import numpy as np
import pandas as pd

from datetime import datetime, timedelta
import os
from time import sleep
import json
import re

import warnings
warnings.filterwarnings("ignore")

import pickle as pkl

from pprint import pprint

from libraries.preprocess import preprocess

In [3]:
# Ruta de la carpeta contenedora de los .json y listado de la misma
os.chdir(r"C:\Users\germa\Desktop\HACK A BOSS\proyecto_job-salary\data\data_1")
os.listdir()

['64e341dd7d171a5eba5f3c42.json',
 '64f4ca7ad5b1c59dfa1e7732.json',
 '64f4ca7c2f542e4224aab4be.json',
 '64f4ca7f47ccf1c95f8d12fa.json',
 '64f4ca819bddf7898350eaed.json',
 '64f4ca84a4be105f74c831b3.json',
 '64f4ca883944b072edaf6675.json',
 '64f4ca89654a8cab87d4f9d8.json',
 '64f4ca91ce87f8713d381bc5.json',
 '64f4ca93d3aba2853359280b.json',
 '64f4ca948cd878d2e0f68b1f.json',
 '64f4ca9637b24a4d8d864be5.json',
 '64f4ca9b58762b793afe6c9c.json',
 '64f4ca9edefa1370d22665eb.json',
 '64f4ca9f66d81f806a8cc83c.json',
 '64f4caa11d64ea5d0f8d39d2.json',
 '64f4caa4ea144277cc6714c1.json',
 '64f4caa6c640d29204d4f305.json',
 '64f4caa84b91eb344a98f1e8.json',
 '64f4caaab5961ef25aa8da1f.json',
 '64f4caac8ccee041d362e327.json',
 '64f4caaf95bf92a0b5aef06d.json',
 '64f4cab08e02e477926dec14.json',
 '64f4cab1969c0887bef341d7.json',
 '64f4cab45b54ef90f18052aa.json',
 '64f4cab52fe3022dda438416.json',
 '64f4cab811c8816a1f348346.json',
 '64f4cab9edb03482b22720fa.json',
 '64f4cabb1988e567debe4aa8.json',
 '64f4cabec56d

In [5]:
%%time

# Creamos un DataFrame con los datos contenidos en cada .json
dates = list()

df = pd.DataFrame()

for json_ in os.listdir():
    
    with open(json_, "br") as file:
        data = json.load(file) # Usar pickle arrojaba un error que json soluciona
        
    date = datetime.strptime(data["search_metadata"]["created_at"], "%Y-%m-%d %H:%M:%S UTC").date()

    try:

        df_ = pd.json_normalize(data["jobs_results"])

        for i in range(df_.shape[0]):

            dates.append(date)

        df = pd.concat([df, df_], ignore_index = True)

    except:
        pass


print(df.shape, len(dates))

os.chdir("../..")

df.to_csv("data/raw_data.csv", index = False, sep = ",") # Guardamos el DataFrame con todos los datos en bruto, (213303, 15)

with open("data/dates.pkl", "bw") as file: # Guardamos las fechas de la publicación de las búsquedas
    pkl.dump(dates, file)

In [3]:
# Cargamos el DataFrame con todos los datos en bruto y las fechas de publicación de los mismos
df = pd.read_csv("data/raw_data.csv")

with open("data/dates.pkl", "br") as file:
    dates = pkl.load(file)
    
df.shape

(213303, 15)

In [29]:
# Creamos "date_posted"
df["date_posted"] = dates

# Eliminamos duplicados
df = df.drop_duplicates("job_id").reset_index(drop = True) 

# Limpiamos "via"
for i in range(len(df["via"])):
    
    # Debido a valores NoneType, usamos try/except
    
    try:
        df.loc[i, "via"] = clean.clean_source(df.loc[i, "via"]) # Cada fila
        
    except:
        df.loc[i, "via"] = np.nan
    
# Limpiamos "location"
df["location"] = df["location"].apply(lambda x : clean.clean_location(x))

# Limpiamos "contract_type"
df["detected_extensions.schedule_type"] = df["detected_extensions.schedule_type"].apply(lambda x : clean.clean_contract_type(x))

# Limpiamos "created_date"
df["detected_extensions.posted_at"] = df["detected_extensions.posted_at"].apply(lambda x : clean.transform_date(x))

df["date_posted"] = [clean.get_date(x, y) for x, y in df[["date_posted", "detected_extensions.posted_at"]].values]

# Creamos "tech_skills"
for i in range(len(df["description"])):
    
    # Debido a valores NoneType, usamos try/except
    
    try:
        df.loc[i, "tech_skills"] = clean.get_skills(df.loc[i, "description"]) # Cada fila
        
    except:
        df.loc[i, "tech_skills"] = np.nan

### ESPAÑA

In [445]:
# Filtramos el DataFrame por los trabajos cuyo país fuese España
lista_str_ = ["Spain", "España"]

for str_ in lista_str_:
    
    if str_ == "Spain":
        df_1 = df[df['location'].str.contains(str_, na = False)]
        
    elif str_ == "España":
        df_2 = df[df['location'].str.contains(str_, na = False)]
        
df_spain = pd.concat([df_1, df_2])

del df_1, df_2

# Guardamos dataset en bruto
df_spain.to_csv("data/spain_raw_data.csv", index = False, sep = ",")

In [446]:
df_spain

Unnamed: 0,title,company_name,location,via,description,job_highlights,related_links,thumbnail,extensions,job_id,detected_extensions.posted_at,detected_extensions.schedule_type,detected_extensions.work_from_home,detected_extensions.salary,detected_extensions.commute_time,date_posted,tech_skills
1,Software Developer (English - Functional Progr...,Stack Builders,"Madrid, Spain",Jobs By Workable,Welcome to Stack Builders! We’re an empathetic...,"[{'items': [""Welcome to Stack Builders! We’re ...",[{'link': 'https://www.google.com/search?sca_e...,,"['28 days ago', 'Full-time']",eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBEZXZlbG9wZXIgKE...,28 days,Full-time,,,,2023-08-06,Ruby
2,Senior Software Developer,"_external, Lumen","Madrid, Spain",Lumen Jobs,About Lumen\nLumen is guided by our belief tha...,[{'items': ['About Lumen\nLumen is guided by o...,[{'link': 'https://www.google.com/search?sca_e...,https://encrypted-tbn0.gstatic.com/images?q=tb...,"['Full-time', 'No degree mentioned']",eyJqb2JfdGl0bGUiOiJTZW5pb3IgU29mdHdhcmUgRGV2ZW...,NaT,Full-time,,,,2023-09-03,
4,SAP CPQ Software Developer (F/M),Sartorius,"Madrid, Spain",Sartorius,We are looking for an experienced Software Dev...,"[{'items': [""We are looking for an experienced...","[{'link': 'http://www.sartorius.com/', 'text':...",,['Full-time'],eyJqb2JfdGl0bGUiOiJTQVAgQ1BRIFNvZnR3YXJlIERldm...,NaT,Full-time,,,,2023-09-03,
5,"Software Engineer, Developer Productivity",Affirm,"Madrid, Spain",Greenhouse,Developer Productivity engineering’s purpose i...,"[{'items': [""Developer Productivity engineerin...","[{'link': 'http://www.affirm.com/', 'text': 'a...",https://encrypted-tbn0.gstatic.com/images?q=tb...,"['10 days ago', 'Full-time']",eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlciwgRG...,10 days,Full-time,,,,2023-08-24,
6,"Senior Software Engineer, Competitive Intellig...","Okta, Inc.",Spain,Okta,Get to know Okta\nOkta is The World’s Identity...,[{'items': ['Get to know Okta\nOkta is The Wor...,"[{'link': 'http://www.okta.com/', 'text': 'okt...",,"['7 days ago', 'Full-time', 'No degree mention...",eyJqb2JfdGl0bGUiOiJTZW5pb3IgU29mdHdhcmUgRW5naW...,7 days,Full-time,,,,2023-08-27,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76688,Arquitecto Azure Cloud,TD SYNNEX,"Palma, España",Jobrapido.com,¡Es muy gratificante trabajar en una empresa e...,[{'items': ['¡Es muy gratificante trabajar en ...,"[{'link': 'https://www.synnex.com/', 'text': '...",https://encrypted-tbn0.gstatic.com/images?q=tb...,['Tiempo completo'],eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIEF6dXJlIENsb3...,NaT,Tiempo completo,,,,2023-09-03,[Azure]
76689,Arquitecto Cloud,Azertium IT Global Services SL,"Madrid, España",BeBee,Desde Azertium IT nos gustaría contar con los ...,[{'items': ['Desde Azertium IT nos gustaría co...,[{'link': 'https://www.google.com/search?sca_e...,https://encrypted-tbn0.gstatic.com/images?q=tb...,"['hace 3 días', 'Tiempo completo']",eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,3 days,Tiempo completo,,,,2023-08-31,
76690,Arquitecto Cloud,UST,"Madrid, España",BeBee,Seguimos buscando talento...y nos encantaría q...,[{'items': ['Seguimos buscando talento...y nos...,[{'link': 'https://www.google.com/search?sca_e...,https://encrypted-tbn0.gstatic.com/images?q=tb...,"['hace 3 días', 'Tiempo completo']",eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,3 days,Tiempo completo,,,,2023-08-31,
76691,Arquitecto Cloud,Decide Soluciones,"Madrid, España",BeBee,¿A quién buscamos?\n\nActualmente buscamos un ...,[{'items': ['¿A quién buscamos?\n\nActualmente...,"[{'link': 'http://www.decidesoluciones.es/en',...",,"['hace 8 días', 'Tiempo completo']",eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,8 days,Tiempo completo,,,,2023-08-26,


#### Limpieza "df_spain"

In [447]:
# Renombramos los nombres de las columnas
df_spain = clean.clean_column_names(df_spain)

In [448]:
# Nos quedamos con las columnas que nos interesan
df_spain = df_spain[["job_id", "title", "company_name", "location", "source", 
                     "description", "job_highlights", "date_posted", "contract_type",
                     "tech_skills"]]

In [449]:
df_spain

Unnamed: 0,job_id,title,company_name,location,source,description,job_highlights,date_posted,contract_type,tech_skills
1,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBEZXZlbG9wZXIgKE...,Software Developer (English - Functional Progr...,Stack Builders,"Madrid, Spain",Jobs By Workable,Welcome to Stack Builders! We’re an empathetic...,"[{'items': [""Welcome to Stack Builders! We’re ...",2023-08-06,Full-time,Ruby
2,eyJqb2JfdGl0bGUiOiJTZW5pb3IgU29mdHdhcmUgRGV2ZW...,Senior Software Developer,"_external, Lumen","Madrid, Spain",Lumen Jobs,About Lumen\nLumen is guided by our belief tha...,[{'items': ['About Lumen\nLumen is guided by o...,2023-09-03,Full-time,
4,eyJqb2JfdGl0bGUiOiJTQVAgQ1BRIFNvZnR3YXJlIERldm...,SAP CPQ Software Developer (F/M),Sartorius,"Madrid, Spain",Sartorius,We are looking for an experienced Software Dev...,"[{'items': [""We are looking for an experienced...",2023-09-03,Full-time,
5,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlciwgRG...,"Software Engineer, Developer Productivity",Affirm,"Madrid, Spain",Greenhouse,Developer Productivity engineering’s purpose i...,"[{'items': [""Developer Productivity engineerin...",2023-08-24,Full-time,
6,eyJqb2JfdGl0bGUiOiJTZW5pb3IgU29mdHdhcmUgRW5naW...,"Senior Software Engineer, Competitive Intellig...","Okta, Inc.",Spain,Okta,Get to know Okta\nOkta is The World’s Identity...,[{'items': ['Get to know Okta\nOkta is The Wor...,2023-08-27,Full-time,
...,...,...,...,...,...,...,...,...,...,...
76688,eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIEF6dXJlIENsb3...,Arquitecto Azure Cloud,TD SYNNEX,"Palma, España",Jobrapido.com,¡Es muy gratificante trabajar en una empresa e...,[{'items': ['¡Es muy gratificante trabajar en ...,2023-09-03,Tiempo completo,[Azure]
76689,eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,Arquitecto Cloud,Azertium IT Global Services SL,"Madrid, España",BeBee,Desde Azertium IT nos gustaría contar con los ...,[{'items': ['Desde Azertium IT nos gustaría co...,2023-08-31,Tiempo completo,
76690,eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,Arquitecto Cloud,UST,"Madrid, España",BeBee,Seguimos buscando talento...y nos encantaría q...,[{'items': ['Seguimos buscando talento...y nos...,2023-08-31,Tiempo completo,
76691,eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,Arquitecto Cloud,Decide Soluciones,"Madrid, España",BeBee,¿A quién buscamos?\n\nActualmente buscamos un ...,[{'items': ['¿A quién buscamos?\n\nActualmente...,2023-08-26,Tiempo completo,


In [450]:
# Perfil y especialidades de los trabajos
df_perfil_especialidad = pd.read_csv("data/StackMinimo - StackTec.csv", skiprows = 2).drop("Unnamed: 0", axis = 1)

df_perfil_especialidad

Unnamed: 0,Codigo,N1 Perfil,N2 Especialidad,N3 Tecnología predominante
0,DESBACK,Desarrollador,Backend,.Net
1,,,,Java
2,,,,NodeJs
3,,,,Phyton
4,,,,QuickBase
5,,,,"Lanbda, Azure Functions, GCP Functions"
6,DESBD,Desarrollador,Base de datos,Oracle Forms
7,,,,SQL
8,,,,SAP
9,DESAPI,Desarrollador,APIs,All


In [451]:
list_especialidad = df_perfil_especialidad["N2 Especialidad"].dropna().to_list()

In [452]:
list_especialidad = ["Backend",
                     "Base de datos",
                     "Bases de datos",
                     "APIs",
                     "Mobile",
                     "Frontend Movil",
                     "Frontend Web",
                     "Full Stack",
                     "Integración",
                     "Liferay",
                     "Power BI",
                     "Software Release Engineer",
                     "SRE",
                     "Diseñador Gráfico",
                     "Graphic Designer",
                     "Infraestructura cloud",
                     "Estructuras cloud",
                     "Sistemas Operativos",
                     "Seguridad",
                     "Mantenimiento y Soporte",
                     "Administrador de Base de datos",
                     "Servidores y aplicaciones",
                     "Tester funcional",
                     "Tester automatizada",
                     "SCRUM Master",
                     "Product Owner",
                     "Customer Success",
                     "Gerente de Proyecto",
                     "Analisis de Datos",
                     "Analista de Datos"]

In [453]:
dict_perfiles = {v : k for k, v in df_perfil_especialidad[["N1 Perfil ", "N2 Especialidad"]].dropna().values}

In [454]:
# Corrección de algunas keys
dict_perfiles = {'Backend': 'Desarrollador',
                 'Base de datos': 'Desarrollador',
                 'APIs': 'Desarrollador',
                 'Mobile': 'Desarrollador',
                 'Frontend Movil': 'Desarrollador',
                 'Frontend Web': 'Desarrollador',
                 'Full Stack': 'Desarrollador',
                 'Integración': 'Desarrollador',
                 'Liferay': 'Desarrollador',
                 'Power BI': 'Desarrollador',
                 'Software Release Engineer': 'Devops',
                 'Diseñador Gráfico': 'Diseñador',
                 'Infraestructura cloud': 'Infraestructura',
                 'Sistemas Operativos': 'Infraestructura',
                 'Seguridad': 'Infraestructura',
                 'Mantenimiento y Soporte': 'Infraestructura',
                 'Tester funcional': 'Quality Assurance',
                 'Tester automatizada': 'Quality Assurance',
                 'SCRUM Master': 'Gestión Operativa',
                 'Product Owner': 'Gestión Operativa',
                 'Customer Success': 'Gestión Operativa',
                 'Gerente de Proyecto': 'Gestión Operativa',
                 'Analisis de Datos': 'Especialista'}

In [455]:
# Funciones para generar las columnas de perfil y especialidad
def get_especialidad(string, list_especialidad):
    
    especialidades = list({especialidad for especialidad in list_especialidad if especialidad.lower() in string.lower()})
    
    return especialidades if especialidades else np.nan

def get_perfil(lista, dict_perfiles):
    
    perfiles = list({v for k, v in dict_perfiles.items() if k in lista})
    
    return perfiles if perfiles else np.nan

In [456]:
# Funciones para generar las columnas de años de experiencia y nivel de experiencia
def find_years_of_experience(string: str):
    
    list_strings = ["años de", "years of", "years experience", "años experiencia"]
    
    string = string.lower()
    
    years = [string[string.find(s) - 5 : string.find(s) + len(s) + 1] for s in list_strings if string.find(s) != -1]
    
    numeros = [re.findall(r"\d+", y) for y in years]

    numeros = [[int(n) for n in num if 0 < int(n) < 13] for num in numeros]
    
    numeros = [max(num) if num else np.nan for num in numeros]

    return max(numeros) if numeros else np.nan


def experience_level(num):
    
    if not pd.isna(num):
    
        if num < 2:
            return "Junior"
        elif num <= 4:
            return "Semi-Senior"
        elif num < 8:
            return "Senior"
        else:
            return "Leader"
        
    else:
        return np.nan

In [457]:
df_spain["job_specialization"] = df_spain["description"].apply(lambda x : get_especialidad(x, list_especialidad = list_especialidad) if not pd.isna(x) else None)
df_spain["job_profile"] = df_spain["description"].apply(lambda x : get_perfil(x, dict_perfiles = dict_perfiles) if not pd.isna(x) else None)

In [458]:
df_spain["experience"] = df_spain["description"].apply(lambda x : find_years_of_experience(x) if not pd.isna(x) else x)
df_spain["experience_level"] = df_spain["experience"].apply(lambda x : experience_level(x))

In [459]:
# Actualizamos "contract_type"
df_spain["contract_type"] = df_spain["contract_type"]\
                            .apply(lambda x : "Full-time" if x == "Tiempo completo" else x)

# Creamos "remote_work"
def get_remote_work(string):
    
    resultados = list()
    
    if "remoto" in string or "remote work" in string or "remote" in string:
        
        resultados.append("Remoto")
        
    elif "hibrido" in string or "hybrid" in string or "híbrido" in string:
        
        resultados.append("Hibrido")
        
    elif "presencial" in string or "in-office" in string:
        
        resultados.append("Presencial")
        
    else:
        return np.nan
        
    return resultados

df_spain["remote_work"] = df_spain["description"].apply(lambda x : get_remote_work(x))

# Actualizamos "location"
# ¡PENDIENTE, CONSULTAR CON DANI!
# ACTUALIZADO en la librería de preprocesamiento para el pipeline

In [460]:
df_spain = df_spain[["job_id", "experience", "experience_level", "description", "job_specialization",
                     "job_profile", "remote_work", "tech_skills", "title", "company_name", "location", "source",
                     "date_posted", "contract_type"]]

# Guardamos los datos limpios (falta alguna que otra corrección)
df_spain.to_csv("data/spain_cleaned_data.csv", index = False, sep = ",")

In [461]:
df_spain

Unnamed: 0,job_id,country,experience,experience_level,description,job_specialization,job_profile,remote_work,tech_skills,title,company_name,location,source,date_posted,contract_type
1,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBEZXZlbG9wZXIgKE...,Spain,,,Welcome to Stack Builders! We’re an empathetic...,,,[Remoto],Ruby,Software Developer (English - Functional Progr...,Stack Builders,"Madrid, Spain",Jobs By Workable,2023-08-06,Full-time
2,eyJqb2JfdGl0bGUiOiJTZW5pb3IgU29mdHdhcmUgRGV2ZW...,Spain,,,About Lumen\nLumen is guided by our belief tha...,,,,,Senior Software Developer,"_external, Lumen","Madrid, Spain",Lumen Jobs,2023-09-03,Full-time
4,eyJqb2JfdGl0bGUiOiJTQVAgQ1BRIFNvZnR3YXJlIERldm...,Spain,,,We are looking for an experienced Software Dev...,"[Product Owner, APIs]","[Desarrollador, Gestión Operativa]",[Hibrido],,SAP CPQ Software Developer (F/M),Sartorius,"Madrid, Spain",Sartorius,2023-09-03,Full-time
5,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlciwgRG...,Spain,,,Developer Productivity engineering’s purpose i...,,,,,"Software Engineer, Developer Productivity",Affirm,"Madrid, Spain",Greenhouse,2023-08-24,Full-time
6,eyJqb2JfdGl0bGUiOiJTZW5pb3IgU29mdHdhcmUgRW5naW...,Spain,,,Get to know Okta\nOkta is The World’s Identity...,,,,,"Senior Software Engineer, Competitive Intellig...","Okta, Inc.",Spain,Okta,2023-08-27,Full-time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76688,eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIEF6dXJlIENsb3...,España,2.0,Semi-Senior,¡Es muy gratificante trabajar en una empresa e...,[Seguridad],,[Hibrido],[Azure],Arquitecto Azure Cloud,TD SYNNEX,"Palma, España",Jobrapido.com,2023-09-03,Full-time
76689,eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,España,,,Desde Azertium IT nos gustaría contar con los ...,[Bases de datos],,,,Arquitecto Cloud,Azertium IT Global Services SL,"Madrid, España",BeBee,2023-08-31,Full-time
76690,eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,España,,,Seguimos buscando talento...y nos encantaría q...,,,,,Arquitecto Cloud,UST,"Madrid, España",BeBee,2023-08-31,Full-time
76691,eyJqb2JfdGl0bGUiOiJBcnF1aXRlY3RvIENsb3VkIiwiaH...,España,,,¿A quién buscamos?\n\nActualmente buscamos un ...,"[Integración, Seguridad]",[Desarrollador],,,Arquitecto Cloud,Decide Soluciones,"Madrid, España",BeBee,2023-08-26,Full-time
