# DATAFRAME SOCIAL - Limpieza

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sidetable as stb

### Una vez explorado nuestro DataFrame, con las columnas ya seleccionadas, hacemos la limpieza de datos oportuna.

In [22]:
df_social = pd.read_csv('../Datos/df-social-sucio.csv', index_col=0)

In [23]:
df_social.head()

Unnamed: 0,genero,edad,pais,nivel de estudios,puesto de trabajo,sector,tamaño de la compañia
0,Man,50-54,Indonesia,Master’s degree,Program/Project Manager,Manufacturing/Fabrication,"1000-9,999 employees"
1,Man,22-24,Pakistan,Master’s degree,Software Engineer,Academics/Education,"1000-9,999 employees"
2,Man,45-49,Mexico,Doctoral degree,Research Scientist,Academics/Education,"1000-9,999 employees"
3,Man,45-49,India,Doctoral degree,Other,Academics/Education,50-249 employees
4,Woman,25-29,India,I prefer not to answer,Currently not employed,,


### Creamos una nueva columna con los continentes usando "apply()".

In [24]:
continente1 = ['India', 'Turkey','Viet Nam', 'Bangladesh', 'Indonesia', 
        'Singapore', 'Taiwan', 'Hong Kong (S.A.R.)', 'Israel', 'Pakistan', 'Philippines', 'United Arab Emirates', 'Thailand', 'Japan', 'China', 'South Korea', 
       'Saudi Arabia', 'Malaysia', 'Kazakhstan', 'Nepal']

continente2 = ['Iran, Islamic Republic of...', 'Egypt','Morocco','Tunisia','Nigeria','Kenya','South Africa',
       'Algeria','Ghana','Uganda', 'Ethiopia', 'Iraq']

continente3 = ['Poland','Russia', 'Italy', 'Sweden',
       'Ireland', 'United Kingdom of Great Britain and Northern Ireland','Spain',
       'Germany', 'Belgium', 'Austria',
       'France', 'Romania', 'Ukraine', 
       'Netherlands', 'Portugal', 'Norway',
       'Czech Republic', 'Greece', 'Switzerland',
       'Denmark','Belarus']

continente4 = ['United States of America','Canada']

continente5 = ['Brazil', 'Argentina','Colombia', 'Mexico', 'Peru', 'Ecuador','Chile']

continente6 = ['Australia']

In [25]:
def continente(pais):
    if pais in continente1:
        return "Asia"
    elif pais in continente2:
        return "África"
    elif pais in continente3:
        return "Europa"
    elif pais in continente4:
        return "América del Norte"
    elif pais in continente5:
        return "América del Sur"
    elif pais in continente6:
        return "Oceanía"

In [26]:
df_social["continente"] = df_social["pais"].apply(continente)

In [27]:
df_social.head()

Unnamed: 0,genero,edad,pais,nivel de estudios,puesto de trabajo,sector,tamaño de la compañia,continente
0,Man,50-54,Indonesia,Master’s degree,Program/Project Manager,Manufacturing/Fabrication,"1000-9,999 employees",Asia
1,Man,22-24,Pakistan,Master’s degree,Software Engineer,Academics/Education,"1000-9,999 employees",Asia
2,Man,45-49,Mexico,Doctoral degree,Research Scientist,Academics/Education,"1000-9,999 employees",América del Sur
3,Man,45-49,India,Doctoral degree,Other,Academics/Education,50-249 employees,Asia
4,Woman,25-29,India,I prefer not to answer,Currently not employed,,,Asia


### Creamos otra columna, para identificar la rama del sector tecnológico en la que se encuentran:

In [28]:
df_social['puesto de trabajo'].unique()

array(['Program/Project Manager', 'Software Engineer',
       'Research Scientist', 'Other', 'Currently not employed', 'Student',
       'Data Scientist', 'Data Analyst', 'Machine Learning Engineer',
       'Business Analyst', 'Data Engineer', 'Product Manager',
       'Statistician', 'Developer Relations/Advocacy',
       'DBA/Database Engineer'], dtype=object)

In [29]:
data = ['Research Scientist','Data Scientist', 'Data Analyst', 'Machine Learning Engineer',
       'Business Analyst', 'Data Engineer','DBA/Database Engineer']

developer =  ['Software Engineer']

other = ['Program/Project Manager', 'Other', 'Currently not employed', 'Student',
       'Statistician', 'Developer Relations/Advocacy']

In [30]:
def puesto_sector(puesto):
    if puesto in data:
        return "data"
    elif puesto in developer:
        return "developer"
    elif puesto in other:
        return "other"

In [31]:
df_social["rama"] = df_social["puesto de trabajo"].apply(puesto_sector)

In [32]:
def nulos(x):
    if x == "":
        return np.nan
    else:
        return x

In [33]:
df_social['sector'] = df_social['sector'].apply(nulos)

In [34]:
df_social.stb.missing()

Unnamed: 0,missing,total,percent
tamaño de la compañia,9722,25972,37.43262
sector,9648,25972,37.147698
continente,1445,25972,5.563684
rama,319,25972,1.228246
genero,0,25972,0.0
edad,0,25972,0.0
pais,0,25972,0.0
nivel de estudios,0,25972,0.0
puesto de trabajo,0,25972,0.0


### Filtramos los datos para no incluir a los estudiantes ni desempleados en nuestro dataframe.

In [35]:
df_social = df_social[(df_social['puesto de trabajo'] != "Student") & (df_social['puesto de trabajo'] != "Other") & (df_social['puesto de trabajo'] != "Currently not employed")]
df_social.head()

Unnamed: 0,genero,edad,pais,nivel de estudios,puesto de trabajo,sector,tamaño de la compañia,continente,rama
0,Man,50-54,Indonesia,Master’s degree,Program/Project Manager,Manufacturing/Fabrication,"1000-9,999 employees",Asia,other
1,Man,22-24,Pakistan,Master’s degree,Software Engineer,Academics/Education,"1000-9,999 employees",Asia,developer
2,Man,45-49,Mexico,Doctoral degree,Research Scientist,Academics/Education,"1000-9,999 employees",América del Sur,data
6,Man,30-34,India,Bachelor’s degree,Data Scientist,Computers/Technology,"10,000 or more employees",Asia,data
12,Man,45-49,Nigeria,Master’s degree,Program/Project Manager,Shipping/Transportation,"1000-9,999 employees",África,other


In [36]:
df_social['puesto de trabajo'].unique()

array(['Program/Project Manager', 'Software Engineer',
       'Research Scientist', 'Data Scientist', 'Data Analyst',
       'Machine Learning Engineer', 'Business Analyst', 'Data Engineer',
       'Product Manager', 'Statistician', 'Developer Relations/Advocacy',
       'DBA/Database Engineer'], dtype=object)

### Filtramos los datos para no incluir cualquier sexo que no sea hombre o mujer.

In [37]:
df_social = df_social[(df_social["genero"]== "Man") | (df_social["genero"]=="Woman")]

In [38]:
df_social['genero'].unique()

array(['Man', 'Woman'], dtype=object)

In [39]:
df_social.head(10)

Unnamed: 0,genero,edad,pais,nivel de estudios,puesto de trabajo,sector,tamaño de la compañia,continente,rama
0,Man,50-54,Indonesia,Master’s degree,Program/Project Manager,Manufacturing/Fabrication,"1000-9,999 employees",Asia,other
1,Man,22-24,Pakistan,Master’s degree,Software Engineer,Academics/Education,"1000-9,999 employees",Asia,developer
2,Man,45-49,Mexico,Doctoral degree,Research Scientist,Academics/Education,"1000-9,999 employees",América del Sur,data
6,Man,30-34,India,Bachelor’s degree,Data Scientist,Computers/Technology,"10,000 or more employees",Asia,data
12,Man,45-49,Nigeria,Master’s degree,Program/Project Manager,Shipping/Transportation,"1000-9,999 employees",África,other
14,Man,35-39,Greece,Doctoral degree,Research Scientist,Academics/Education,50-249 employees,Europa,data
15,Man,50-54,Belgium,Bachelor’s degree,Data Analyst,Energy/Mining,"1000-9,999 employees",Europa,data
16,Man,18-21,Pakistan,Bachelor’s degree,Data Scientist,Academics/Education,0-49 employees,Asia,data
17,Man,22-24,Japan,Master’s degree,Software Engineer,,,Asia,developer
20,Woman,25-29,Turkey,Bachelor’s degree,Data Scientist,Computers/Technology,"10,000 or more employees",Asia,data


In [40]:
df_social.to_csv('../Datos/df-social-limpio.csv')