# DATAFRAME TÉCNICO - Limpieza

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Una vez explorado nuestro DataFrame, con las columnas ya seleccionadas, hacemos la limpieza de datos oportuna.

In [2]:
df_tecnico = pd.read_csv('../Datos/df-tecnico-sucio.csv', index_col=0)

In [3]:
df_tecnico.head()

Unnamed: 0,edad,pais,nivel de estudios,sector,algoritmos ML,años de experiencia,años de experiencia en ML,lenguaje mas usado
0,25-29,Poland,Master’s degree,Computers/Technology,"NULL, Decision Trees or Random Forests, Gradie...",3-5 years,Under 1 year,"Python, C++"
1,30-34,Viet Nam,Master’s degree,Academics/Education,"NULL, Dense Neural Networks (MLPs, etc), Convo...",1-3 years,1-2 years,Python
2,22-24,Morocco,Master’s degree,Computers/Technology,"NULL, None",1-3 years,Under 1 year,"Python, SQL, MATLAB, Other"
3,18-21,Bangladesh,Bachelor’s degree,Non-profit/Service,"Linear or Logistic Regression, Decision Trees ...",under 1 years,Under 1 year,Python
4,18-21,India,Bachelor’s degree,Academics/Education,"Linear or Logistic Regression, Decision Trees ...",1-3 years,Under 1 year,"Python, SQL, C, C++"


### Creamos una nueva columna con los continentes usando "apply()".

In [4]:
continente1 = ['India', 'Turkey','Viet Nam', 'Bangladesh', 'Indonesia', 
        'Singapore', 'Taiwan', 'Hong Kong (S.A.R.)', 'Israel', 'Pakistan', 'Philippines', 'United Arab Emirates', 'Thailand', 'Japan', 'China', 'South Korea', 
       'Saudi Arabia', 'Malaysia', 'Kazakhstan', 'Nepal']

continente2 = ['Iran, Islamic Republic of...', 'Egypt','Morocco','Tunisia','Nigeria','Kenya','South Africa',
       'Algeria','Ghana','Uganda', 'Ethiopia', 'Iraq']

continente3 = ['Poland','Russia', 'Italy', 'Sweden',
       'Ireland', 'United Kingdom of Great Britain and Northern Ireland','Spain',
       'Germany', 'Belgium', 'Austria',
       'France', 'Romania', 'Ukraine', 
       'Netherlands', 'Portugal', 'Norway',
       'Czech Republic', 'Greece', 'Switzerland',
       'Denmark','Belarus']

continente4 = ['United States of America','Canada']

continente5 = ['Brazil', 'Argentina','Colombia', 'Mexico', 'Peru', 'Ecuador','Chile']

continente6 = ['Australia']

In [5]:
def continente(pais):
    if pais in continente1:
        return "Asia"
    elif pais in continente2:
        return "África"
    elif pais in continente3:
        return "Europa"
    elif pais in continente4:
        return "América del Norte"
    elif pais in continente5:
        return "América del Sur"
    elif pais in continente6:
        return "Oceanía"

In [6]:
df_tecnico["continente"] = df_tecnico["pais"].apply(continente)

In [7]:
df_tecnico.head()

Unnamed: 0,edad,pais,nivel de estudios,sector,algoritmos ML,años de experiencia,años de experiencia en ML,lenguaje mas usado,continente
0,25-29,Poland,Master’s degree,Computers/Technology,"NULL, Decision Trees or Random Forests, Gradie...",3-5 years,Under 1 year,"Python, C++",Europa
1,30-34,Viet Nam,Master’s degree,Academics/Education,"NULL, Dense Neural Networks (MLPs, etc), Convo...",1-3 years,1-2 years,Python,Asia
2,22-24,Morocco,Master’s degree,Computers/Technology,"NULL, None",1-3 years,Under 1 year,"Python, SQL, MATLAB, Other",África
3,18-21,Bangladesh,Bachelor’s degree,Non-profit/Service,"Linear or Logistic Regression, Decision Trees ...",under 1 years,Under 1 year,Python,Asia
4,18-21,India,Bachelor’s degree,Academics/Education,"Linear or Logistic Regression, Decision Trees ...",1-3 years,Under 1 year,"Python, SQL, C, C++",Asia


### Eliminamos los nulos en la columna del lenguaje más usado.

In [8]:
df_tecnico = df_tecnico[df_tecnico['lenguaje mas usado'] != ""]
df_tecnico.head()

Unnamed: 0,edad,pais,nivel de estudios,sector,algoritmos ML,años de experiencia,años de experiencia en ML,lenguaje mas usado,continente
0,25-29,Poland,Master’s degree,Computers/Technology,"NULL, Decision Trees or Random Forests, Gradie...",3-5 years,Under 1 year,"Python, C++",Europa
1,30-34,Viet Nam,Master’s degree,Academics/Education,"NULL, Dense Neural Networks (MLPs, etc), Convo...",1-3 years,1-2 years,Python,Asia
2,22-24,Morocco,Master’s degree,Computers/Technology,"NULL, None",1-3 years,Under 1 year,"Python, SQL, MATLAB, Other",África
3,18-21,Bangladesh,Bachelor’s degree,Non-profit/Service,"Linear or Logistic Regression, Decision Trees ...",under 1 years,Under 1 year,Python,Asia
4,18-21,India,Bachelor’s degree,Academics/Education,"Linear or Logistic Regression, Decision Trees ...",1-3 years,Under 1 year,"Python, SQL, C, C++",Asia


## PRUEBAS

In [9]:
df_lenguaje = df_tecnico["lenguaje mas usado"].str.split(",", expand= True).reset_index()
df_lenguaje

Unnamed: 0,index,0,1,2,3,4,5,6,7,8
0,0,Python,C++,,,,,,,
1,1,Python,,,,,,,,
2,2,Python,SQL,MATLAB,Other,,,,,
3,3,Python,,,,,,,,
4,4,Python,SQL,C,C++,,,,,
...,...,...,...,...,...,...,...,...,...,...
174,174,Python,C,Java,Javascript,MATLAB,,,,
175,175,Python,R,C,C++,,,,,
176,176,,C,MATLAB,,,,,,
177,177,Python,C,Java,,,,,,


In [10]:
df_tecnico["lenguaje"] = df_tecnico["lenguaje mas usado"].str.split(",")
df_tecnico.head(2)

Unnamed: 0,edad,pais,nivel de estudios,sector,algoritmos ML,años de experiencia,años de experiencia en ML,lenguaje mas usado,continente,lenguaje
0,25-29,Poland,Master’s degree,Computers/Technology,"NULL, Decision Trees or Random Forests, Gradie...",3-5 years,Under 1 year,"Python, C++",Europa,"[Python, C++]"
1,30-34,Viet Nam,Master’s degree,Academics/Education,"NULL, Dense Neural Networks (MLPs, etc), Convo...",1-3 years,1-2 years,Python,Asia,[Python]


### Creamos una función para separar en columnas los lenguajes mas usados.

In [11]:
for indice, row in df_tecnico.iterrows():
    try:
        for i in row["lenguaje"]:
            if i not in df_tecnico.columns:
                df_tecnico[i] = "N"
        df_tecnico.loc[indice,i] = "Y"
    except:
        pass
    

In [12]:
df_tecnico.head()

Unnamed: 0,edad,pais,nivel de estudios,sector,algoritmos ML,años de experiencia,años de experiencia en ML,lenguaje mas usado,continente,lenguaje,...,MATLAB,Other,C,Java,Javascript,R,Swift,Bash,NULL,Julia
0,25-29,Poland,Master’s degree,Computers/Technology,"NULL, Decision Trees or Random Forests, Gradie...",3-5 years,Under 1 year,"Python, C++",Europa,"[Python, C++]",...,N,N,N,N,N,N,N,N,N,N
1,30-34,Viet Nam,Master’s degree,Academics/Education,"NULL, Dense Neural Networks (MLPs, etc), Convo...",1-3 years,1-2 years,Python,Asia,[Python],...,N,N,N,N,N,N,N,N,N,N
2,22-24,Morocco,Master’s degree,Computers/Technology,"NULL, None",1-3 years,Under 1 year,"Python, SQL, MATLAB, Other",África,"[Python, SQL, MATLAB, Other]",...,N,Y,N,N,N,N,N,N,N,N
3,18-21,Bangladesh,Bachelor’s degree,Non-profit/Service,"Linear or Logistic Regression, Decision Trees ...",under 1 years,Under 1 year,Python,Asia,[Python],...,N,N,N,N,N,N,N,N,N,N
4,18-21,India,Bachelor’s degree,Academics/Education,"Linear or Logistic Regression, Decision Trees ...",1-3 years,Under 1 year,"Python, SQL, C, C++",Asia,"[Python, SQL, C, C++]",...,N,N,N,N,N,N,N,N,N,N


In [13]:
df_lenguajes = df_tecnico["lenguaje mas usado"].str.split(",", n=3, expand=True)

In [14]:
df_tecnico2 = df_tecnico.copy()

In [15]:
df_tecnico2 = df_tecnico2.join(df_lenguajes, how = 'inner')

In [16]:
def lenguajes2():
   for i in lista_l:
      if i == "Python":
         return "Python"
      elif i == "SQL":
         return "SQL"

In [17]:
def lenguajes(x):
    for i in df_lenguajes.columns:
        if np.any(df_lenguajes[i] == "Python"):
            return "Python"
        elif np.any(df_lenguajes[i] == "SQL"):
            return "SQL"
        else:
            return "Others"

In [18]:
lenguajes("Python")

'Python'

In [19]:
df_tecnico["lenguajes"] = df_tecnico["lenguaje mas usado"].apply(lenguajes2())

NameError: name 'lista_l' is not defined

In [None]:
# Investigar acumulativos

In [None]:
print(df_lenguaje[0].value_counts())
print ("---------------------------------")
print(df_lenguaje[1].value_counts())
print ("---------------------------------")
print(df_lenguaje[2].value_counts())
print ("---------------------------------")
print(df_lenguaje[3].value_counts())
print ("---------------------------------")
print(df_lenguaje[4].value_counts())
print ("---------------------------------")
print(df_lenguaje[5].value_counts())
print ("---------------------------------")
print(df_lenguaje[6].value_counts())
print ("---------------------------------")
print(df_lenguaje[7].value_counts())
print ("---------------------------------")
print(df_lenguaje[8].value_counts())

Python    161
NULL        8
Name: 0, dtype: int64
---------------------------------
 SQL           48
 C++           19
 C             16
 R             16
 Java          10
 Javascript     5
 MATLAB         5
 Other          2
 Bash           1
Name: 1, dtype: int64
---------------------------------
 C             15
 Java          14
 C++           11
 MATLAB        10
 Javascript     6
 SQL            6
 Bash           5
 Swift          1
 Other          1
 Julia          1
Name: 2, dtype: int64
---------------------------------
 C++           10
 Java          10
 MATLAB         6
 Javascript     3
 Bash           3
 Other          2
 C              1
Name: 3, dtype: int64
---------------------------------
 MATLAB        6
 Java          4
 Javascript    3
 Other         1
 C++           1
Name: 4, dtype: int64
---------------------------------
 MATLAB        3
 Javascript    2
 Java          1
 Other         1
Name: 5, dtype: int64
---------------------------------
 MATLAB        

In [None]:
df_tecnico.to_csv('../Datos/df-tecnico-limpio.csv')