<a href="https://colab.research.google.com/github/practigol/trabajo_final/blob/main/crear_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Predicción de la pobreza en Argentina**

El objetivo de este trabajo es diseñar un modelo de aprendizaje automático que pueda predecir la pobreza en Argentina, a partir de los datos disponibles en la Encuesta Permanente de Hogares (https://www.indec.gob.ar/indec/web/Institucional-Indec-BasesDeDatos). 

Las bases de datos que se toman en este caso son las del último trimestre del año 2021.




In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings

warnings.filterwarnings('ignore')

# **Creamos el dataset**

In [None]:
# Upload the individual-level file "usu_individual_T421.csv" through the Colab file picker.

from google.colab import files
uploaded  = files.upload() 

Saving usu_individual_T421.csv to usu_individual_T421.csv


In [None]:
df_individual = pd.read_csv("usu_individual_T421.csv")

In [None]:
df_individual.shape

(50154, 177)

In [None]:
# Assign every person an adult-equivalent coefficient (later stored in 'AE_ctrol_ind')
# according to sex (CH04) and age (CH06).
#
# np.select keeps the choice of the FIRST condition that is True, so the conditions
# must not overlap. Ages below 10 share one coefficient for both sex codes; from
# age 10 onwards each sex code has its own rows. The age 10-12 rows for sex code 2
# previously used `CH04 >= 1`, which also captured sex code 1 and made the
# dedicated `CH04 == 1` rows for ages 10-12 unreachable.
# NOTE(review): CH04 == 1 is assumed to be male and 2 female — confirm against
# "EPH_registro_4T2021.pdf".

sexo = df_individual['CH04']
edad = df_individual['CH06']

condiciones = [
    # both sex codes, under 1 year up to 9 years
    (sexo >= 1) & (edad < 1),
    (sexo >= 1) & (edad == 1),
    (sexo >= 1) & (edad == 2),
    (sexo >= 1) & (edad == 3),
    (sexo >= 1) & (edad == 4),
    (sexo >= 1) & (edad == 5),
    (sexo >= 1) & (edad == 6),
    (sexo >= 1) & (edad == 7),
    (sexo >= 1) & (edad == 8),
    (sexo >= 1) & (edad == 9),
    # sex code 2, age 10 and older
    (sexo == 2) & (edad == 10),
    (sexo == 2) & (edad == 11),
    (sexo == 2) & (edad == 12),
    (sexo == 2) & (edad >= 13) & (edad <= 14),
    (sexo == 2) & (edad >= 15) & (edad <= 17),
    (sexo == 2) & (edad >= 18) & (edad <= 29),
    (sexo == 2) & (edad >= 30) & (edad <= 45),
    (sexo == 2) & (edad >= 46) & (edad <= 60),
    (sexo == 2) & (edad >= 61) & (edad <= 75),
    (sexo == 2) & (edad > 75),
    # sex code 1, age 10 and older
    (sexo == 1) & (edad == 10),
    (sexo == 1) & (edad == 11),
    (sexo == 1) & (edad == 12),
    (sexo == 1) & (edad == 13),
    (sexo == 1) & (edad == 14),
    (sexo == 1) & (edad == 15),
    (sexo == 1) & (edad == 16),
    (sexo == 1) & (edad == 17),
    (sexo == 1) & (edad >= 18) & (edad <= 29),
    (sexo == 1) & (edad >= 30) & (edad <= 45),
    (sexo == 1) & (edad >= 46) & (edad <= 60),
    (sexo == 1) & (edad >= 61) & (edad <= 75),
    (sexo == 1) & (edad > 75),
]

# Coefficients, in the same order as `condiciones`.
opciones = [
    # both sex codes, under 1 year up to 9 years
    0.35, 0.37, 0.46, 0.51, 0.55, 0.60, 0.64, 0.66, 0.68, 0.69,
    # sex code 2, age 10 and older
    0.70, 0.72, 0.74, 0.76, 0.77, 0.76, 0.77, 0.76, 0.67, 0.63,
    # sex code 1, age 10 and older
    0.79, 0.82, 0.85, 0.90, 0.96, 1.00, 1.03, 1.04, 1.02, 1.00, 1.00, 0.83, 0.74,
]

# Each condition needs exactly one coefficient.
assert len(condiciones) == len(opciones)



In [None]:
# np.select returns, for each row, the choice paired with the first condition that is True;
# rows that match no condition receive np.select's default value of 0.
df_individual['AE_ctrol_ind']=np.select(condiciones,opciones)

In [None]:
df_individual.head()

Unnamed: 0,CODUSU,ANO4,TRIMESTRE,NRO_HOGAR,COMPONENTE,H15,REGION,MAS_500,AGLOMERADO,PONDERA,...,ADECIFR,IPCF,DECCFR,IDECCFR,RDECCFR,GDECCFR,PDECCFR,ADECCFR,PONDIH,AE_ctrol_ind
0,TQRMNOPPQHMOLPCDEGNFJ00714305,2021,4,1,4,1,44,N,17,309,...,7,33500.0,6,7.0,5,,7.0,5,411,0.9
1,TQRMNOQPYHKMPUCDEIJAH00663642,2021,4,1,1,1,1,S,33,1573,...,4,32000.0,6,,6,6.0,,6,2285,0.74
2,TQRMNOQPYHKMPUCDEIJAH00663642,2021,4,1,2,1,1,S,33,1573,...,4,32000.0,6,,6,6.0,,6,2285,0.63
3,TQRMNOQYRHMOQRCDEIJAH00718661,2021,4,1,1,1,1,S,33,2635,...,12,0.0,12,,12,12.0,,12,0,1.0
4,TQRMNOPSTHMMLPCDEFMDB00713515,2021,4,1,2,1,43,N,6,87,...,7,43000.0,7,8.0,8,,8.0,8,129,0.67


In [None]:
# Keep only the columns relevant to this study (see the file "EPH_registro_4T2021.pdf").

columnas_de_interes = [
    'CODUSU', 'NRO_HOGAR',
    'CH03', 'CH04', 'CH06', 'CH08',
    'NIVEL_ED', 'CAT_OCUP', 'CAT_INAC',
    'CH11', 'AE_ctrol_ind',
]
df_personas = df_individual[columnas_de_interes]

In [None]:
df_personas.head()

Unnamed: 0,CODUSU,NRO_HOGAR,CH03,CH04,CH06,CH08,NIVEL_ED,CAT_OCUP,CAT_INAC,CH11,AE_ctrol_ind
0,TQRMNOPPQHMOLPCDEGNFJ00714305,1,3,1,13,1,3,0,3,1,0.9
1,TQRMNOQPYHKMPUCDEIJAH00663642,1,1,1,79,1,2,0,1,0,0.74
2,TQRMNOQPYHKMPUCDEIJAH00663642,1,2,2,79,1,3,0,1,0,0.63
3,TQRMNOQYRHMOQRCDEIJAH00718661,1,1,1,48,1,4,3,0,0,1.0
4,TQRMNOPSTHMMLPCDEFMDB00713515,1,2,2,65,1,4,0,1,0,0.67


In [None]:
df_personas.to_csv('personas.csv')

In [None]:
# Add up the coefficients of all the people in each household
# (CODUSU and NRO_HOGAR together identify a household).

claves_hogar = ['CODUSU', 'NRO_HOGAR']
df_suma = df_personas.groupby(claves_hogar)['AE_ctrol_ind'].sum()

In [None]:
df_suma.head()

CODUSU                         NRO_HOGAR
TQRMNOPPQHJLKTCDEGNFJ00699079  1            1.40
TQRMNOPPQHJMKMCDEFKID00667054  1            1.77
TQRMNOPPQHJMKMCDEOJAH00697751  1            0.63
TQRMNOPPQHJMKTCDEHNHB00686535  1            2.19
TQRMNOPPQHJMLPCDEFNFF00670213  1            1.52
Name: AE_ctrol_ind, dtype: float64

In [None]:
df_suma.shape

(17037,)

In [None]:
# Attach the per-household sum to every person row, joining on CODUSU and NRO_HOGAR.
# Both sides carry a column named 'AE_ctrol_ind', so pandas suffixes them with _x / _y.

df_personas_suma = df_personas.merge(df_suma, on=['CODUSU', 'NRO_HOGAR'])

In [None]:
df_personas_suma.head()

Unnamed: 0,CODUSU,NRO_HOGAR,CH03,CH04,CH06,CH08,NIVEL_ED,CAT_OCUP,CAT_INAC,CH11,AE_ctrol_ind_x,AE_ctrol_ind_y
0,TQRMNOPPQHMOLPCDEGNFJ00714305,1,3,1,13,1,3,0,3,1,0.9,3.35
1,TQRMNOPPQHMOLPCDEGNFJ00714305,1,1,2,37,1,6,3,0,0,0.77,3.35
2,TQRMNOPPQHMOLPCDEGNFJ00714305,1,2,1,37,1,4,3,0,0,1.0,3.35
3,TQRMNOPPQHMOLPCDEGNFJ00714305,1,3,2,8,1,1,0,3,1,0.68,3.35
4,TQRMNOQPYHKMPUCDEIJAH00663642,1,1,1,79,1,2,0,1,0,0.74,1.37


In [None]:
# Give the selected columns readable names; the _x / _y pair produced by the merge
# becomes the individual coefficient and its household sum.

nuevos_nombres = {
    'CH03': 'parentesco',
    'CH04': 'genero',
    'CH06': 'edad',
    'CH08': 'cob_medica',
    'CH11': 'est_pu_pri',
    'AE_ctrol_ind_x': 'AE_ctrol_ind',
    'AE_ctrol_ind_y': 'AE_ctrol_ind_sum',
}
df_personas_suma = df_personas_suma.rename(columns=nuevos_nombres)

In [None]:
df_personas_suma.head(2)

Unnamed: 0,CODUSU,NRO_HOGAR,parentesco,genero,edad,cob_medica,NIVEL_ED,CAT_OCUP,CAT_INAC,est_pu_pri,AE_ctrol_ind,AE_ctrol_ind_sum
0,TQRMNOPPQHMOLPCDEGNFJ00714305,1,3,1,13,1,3,0,3,1,0.9,3.35
1,TQRMNOPPQHMOLPCDEGNFJ00714305,1,1,2,37,1,6,3,0,0,0.77,3.35


In [None]:
df_personas_suma.shape

(50154, 12)

In [None]:
# Write the frame without its positional index: the file is read straight back in the
# next cell, and a saved index would come back as a spurious 'Unnamed: 0' column.
df_personas_suma.to_csv('personas_suma_rename.csv', index=False)

In [None]:
df_personas_suma = pd.read_csv("personas_suma_rename.csv")

In [None]:
# Keep only the rows that meet the household-head condition (parentesco == 1).

es_jefe = df_personas_suma['parentesco'] == 1
df_personas_jefes = df_personas_suma.loc[es_jefe]

In [None]:
df_personas_jefes.head()

Unnamed: 0,CODUSU,NRO_HOGAR,parentesco,genero,edad,cob_medica,NIVEL_ED,CAT_OCUP,CAT_INAC,est_pu_pri,AE_ctrol_ind,AE_ctrol_ind_sum
1,TQRMNOPPQHMOLPCDEGNFJ00714305,1,1,2,37,1,6,3,0,0,0.77,3.35
4,TQRMNOQPYHKMPUCDEIJAH00663642,1,1,1,79,1,2,0,1,0,0.74,1.37
6,TQRMNOQYRHMOQRCDEIJAH00718661,1,1,1,48,1,4,3,0,0,1.0,4.24
12,TQRMNOPSTHMMLPCDEFMDB00713515,1,1,1,67,1,5,0,1,0,0.83,1.5
13,TQRMNOQQSHJMKUCDEHJGH00702130,1,1,2,76,1,6,0,1,0,0.63,1.63


In [None]:
#comprobamos la cantidad de filas

df_personas_jefes.shape

(17037, 12)

In [None]:
df_personas_jefes.to_csv('personas_jefes.csv')

In [None]:
#df_personas_jefes = df_personas_jefes.sort_values('CODUSU',ascending=False)

In [None]:
#df_personas_jefes.head(30)

In [None]:
#df_personas_jefes.tail(30)

In [None]:
# Upload the household-level file "usu_hogar_T421.csv" through the Colab file picker.

from google.colab import files
uploaded  = files.upload() 

Saving usu_hogar_T421.csv to usu_hogar_T421.csv


In [None]:
df_hogares = pd.read_csv("usu_hogar_T421.csv")

In [None]:
#comprobamos la cantidad de filas

df_hogares.shape

(17037, 88)

In [None]:
#df_hogares = df_hogares.sort_values('CODUSU',ascending=False)

In [None]:
#df_hogares.head(30)

In [None]:
#df_hogares.tail(30)

In [None]:
# Join the household frame with the household-head frame on CODUSU and NRO_HOGAR.
# Both frames are expected to hold exactly one row per household; validate='one_to_one'
# makes pandas raise a MergeError if either side has duplicated keys, instead of
# silently duplicating rows.

df_dataset = pd.merge(
    df_hogares,
    df_personas_jefes,
    on=['CODUSU', 'NRO_HOGAR'],
    validate='one_to_one',
)

In [None]:
#comprobamos la cantidad de filas

df_dataset.shape

(17037, 98)

In [None]:
df_dataset.head()

Unnamed: 0,CODUSU,ANO4,TRIMESTRE,NRO_HOGAR,REALIZADA,REGION,MAS_500,AGLOMERADO,PONDERA,IV1,...,parentesco,genero,edad,cob_medica,NIVEL_ED,CAT_OCUP,CAT_INAC,est_pu_pri,AE_ctrol_ind,AE_ctrol_ind_sum
0,TQRMNOQRPHMOKOCDEOJAH00714382,2021,4,1,1,44,N,93,116,1,...,1,2,38,1,6,3,0,0,0.77,2.1
1,TQRMNOQYUHMMKPCDEOJAH00714383,2021,4,1,1,44,N,93,46,1,...,1,2,64,1,6,0,1,0,0.67,1.44
2,TQRMNOPSWHMMKRCDEOJAH00714384,2021,4,1,1,44,N,93,43,1,...,1,1,72,1,2,0,1,0,0.83,1.5
3,TQRMNOQYSHMNKTCDEOJAH00714386,2021,4,1,1,44,N,93,64,2,...,1,1,38,1,4,3,0,0,1.0,3.13
4,TQRMNOQXVHLLPQCDEIJAH00718674,2021,4,1,1,1,S,33,2822,1,...,1,1,67,1,4,3,0,0,0.83,3.33


In [None]:
# ITF is the household's total family income.
# Dividing it by the sum of the coefficients of the people living in the household
# gives the income per adult equivalent.
# The division is done column-wise in one vectorized step instead of calling a
# Python lambda once per row with apply(axis=1).

df_dataset['adulto_equivalente'] = df_dataset['ITF'] / df_dataset['AE_ctrol_ind_sum']

In [None]:
# Round the income per adult equivalent to whole units.
df_dataset['adulto_equivalente'] = df_dataset['adulto_equivalente'].round(0)

In [None]:
df_dataset.head()

Unnamed: 0,CODUSU,ANO4,TRIMESTRE,NRO_HOGAR,REALIZADA,REGION,MAS_500,AGLOMERADO,PONDERA,IV1,...,genero,edad,cob_medica,NIVEL_ED,CAT_OCUP,CAT_INAC,est_pu_pri,AE_ctrol_ind,AE_ctrol_ind_sum,adulto_equivalente
0,TQRMNOQRPHMOKOCDEOJAH00714382,2021,4,1,1,44,N,93,116,1,...,2,38,1,6,3,0,0,0.77,2.1,0.0
1,TQRMNOQYUHMMKPCDEOJAH00714383,2021,4,1,1,44,N,93,46,1,...,2,64,1,6,0,1,0,0.67,1.44,0.0
2,TQRMNOPSWHMMKRCDEOJAH00714384,2021,4,1,1,44,N,93,43,1,...,1,72,1,2,0,1,0,0.83,1.5,51667.0
3,TQRMNOQYSHMNKTCDEOJAH00714386,2021,4,1,1,44,N,93,64,2,...,1,38,1,4,3,0,0,1.0,3.13,27157.0
4,TQRMNOQXVHLLPQCDEIJAH00718674,2021,4,1,1,1,S,33,2822,1,...,1,67,1,4,3,0,0,0.83,3.33,29090.0


In [None]:
# Create the target variable, pobreza.
#
# According to the file "canasta_01_22DEF7D4AE32.pdf", the total basic basket for one
# adult equivalent (the poverty line) is $24,643.
# Every household whose income per adult equivalent is below that value is labelled
# poor (1); all other households are labelled 0.

LINEA_POBREZA_AE = 24643  # poverty line per adult equivalent

df_dataset['pobreza'] = np.where(df_dataset['adulto_equivalente'] < LINEA_POBREZA_AE, 1, 0)


In [None]:
df_dataset['pobreza'].value_counts()

0    9197
1    7840
Name: pobreza, dtype: int64

In [None]:
df_dataset.head()

Unnamed: 0,CODUSU,ANO4,TRIMESTRE,NRO_HOGAR,REALIZADA,REGION,MAS_500,AGLOMERADO,PONDERA,IV1,...,edad,cob_medica,NIVEL_ED,CAT_OCUP,CAT_INAC,est_pu_pri,AE_ctrol_ind,AE_ctrol_ind_sum,adulto_equivalente,pobreza
0,TQRMNOQRPHMOKOCDEOJAH00714382,2021,4,1,1,44,N,93,116,1,...,38,1,6,3,0,0,0.77,2.1,0.0,1
1,TQRMNOQYUHMMKPCDEOJAH00714383,2021,4,1,1,44,N,93,46,1,...,64,1,6,0,1,0,0.67,1.44,0.0,1
2,TQRMNOPSWHMMKRCDEOJAH00714384,2021,4,1,1,44,N,93,43,1,...,72,1,2,0,1,0,0.83,1.5,51667.0,0
3,TQRMNOQYSHMNKTCDEOJAH00714386,2021,4,1,1,44,N,93,64,2,...,38,1,4,3,0,0,1.0,3.13,27157.0,0
4,TQRMNOQXVHLLPQCDEIJAH00718674,2021,4,1,1,1,S,33,2822,1,...,67,1,4,3,0,0,0.83,3.33,29090.0,0


In [None]:
df_dataset.to_csv('dataset.csv')

# **Cleaning**

In [None]:
# Upload a file through the Colab file picker.
# NOTE(review): presumably "dataset.csv", which the next cell reads — confirm.
from google.colab import files
uploaded  = files.upload() 

In [None]:
df_dataset = pd.read_csv("dataset.csv")

In [None]:
# Remove the saved index that read_csv brings back as the 'Unnamed: 0' column.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df_dataset = df_dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# vemos los nombres de las columnas
col_names = df_dataset.columns

col_names

Index(['CODUSU', 'ANO4', 'TRIMESTRE', 'NRO_HOGAR', 'REALIZADA', 'REGION',
       'MAS_500', 'AGLOMERADO', 'PONDERA', 'IV1', 'IV1_ESP', 'IV2', 'IV3',
       'IV3_ESP', 'IV4', 'IV5', 'IV6', 'IV7', 'IV7_ESP', 'IV8', 'IV9', 'IV10',
       'IV11', 'IV12_1', 'IV12_2', 'IV12_3', 'II1', 'II2', 'II3', 'II3_1',
       'II4_1', 'II4_2', 'II4_3', 'II5', 'II5_1', 'II6', 'II6_1', 'II7',
       'II7_ESP', 'II8', 'II8_ESP', 'II9', 'V1', 'V2', 'V21', 'V22', 'V3',
       'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14',
       'V15', 'V16', 'V17', 'V18', 'V19_A', 'V19_B', 'IX_TOT', 'IX_MEN10',
       'IX_MAYEQ10', 'ITF', 'DECIFR', 'IDECIFR', 'RDECIFR', 'GDECIFR',
       'PDECIFR', 'ADECIFR', 'IPCF', 'DECCFR', 'IDECCFR', 'RDECCFR', 'GDECCFR',
       'PDECCFR', 'ADECCFR', 'PONDIH', 'VII1_1', 'VII1_2', 'VII2_1', 'VII2_2',
       'VII2_3', 'VII2_4', 'parentesco', 'genero', 'edad', 'cob_medica',
       'NIVEL_ED', 'CAT_OCUP', 'CAT_INAC', 'est_pu_pri', 'AE_ctrol_ind',
       'AE_ctrol_ind

In [None]:
# ITF is the household's total family income; inspect its summary statistics.
# The precision is applied to the statistics themselves. Previously the 2 sat outside
# round(), so print() received it as a second argument and emitted a stray "2".

print(df_dataset['ITF'].describe().round(2))

count      17037.0
mean       68734.0
std        77324.0
min            0.0
25%        22000.0
50%        56000.0
75%        95600.0
max      4035000.0
Name: ITF, dtype: float64 2


In [None]:
# Collect the ITF values that are exactly zero and count them.
ITF_cero = df_dataset.loc[df_dataset['ITF'] == 0, 'ITF'].tolist()
len(ITF_cero)

0

In [None]:
# 3,582 households declared an income of $0.
# They are left out of this study.

indices_sin_ingreso = df_dataset.index[df_dataset['ITF'] == 0]
df_dataset = df_dataset.drop(index=indices_sin_ingreso)

In [None]:
# Re-count the zero-income entries to confirm none remain after the drop.
ITF_cero = df_dataset.loc[df_dataset['ITF'] == 0, 'ITF'].tolist()
len(ITF_cero)

0

In [None]:
df_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13455 entries, 2 to 17036
Data columns (total 93 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CODUSU              13455 non-null  object 
 1   NRO_HOGAR           13455 non-null  int64  
 2   AGLOMERADO          13455 non-null  int64  
 3   IV1                 13455 non-null  int64  
 4   IV1_ESP             29 non-null     object 
 5   IV2                 13455 non-null  int64  
 6   IV3                 13455 non-null  int64  
 7   IV3_ESP             27 non-null     object 
 8   IV4                 13455 non-null  int64  
 9   IV5                 13455 non-null  int64  
 10  IV6                 13455 non-null  int64  
 11  IV7                 13455 non-null  int64  
 12  IV7_ESP             17 non-null     object 
 13  IV8                 13455 non-null  int64  
 14  IV9                 13455 non-null  int64  
 15  IV10                13455 non-null  int64  
 16  IV11

In [None]:
# Columns removed from the dataset:
#  - of all the location columns, only AGLOMERADO is kept
#  - year, quarter and REALIZADA
#  - the weighting variables
#  - all the decile columns
#  - the individual coefficient of the household head
#
# errors='ignore' skips names that are no longer present. Without it, one missing
# name raises KeyError and none of the listed columns is dropped.

columnas_descartadas = [
    'ANO4', 'TRIMESTRE', 'REALIZADA',
    'REGION', 'MAS_500',
    'PONDERA', 'PONDIH',
    'DECIFR', 'IDECIFR', 'RDECIFR', 'GDECIFR', 'PDECIFR', 'ADECIFR',
    'DECCFR', 'IDECCFR', 'RDECCFR', 'GDECCFR', 'PDECCFR', 'ADECCFR',
    'AE_ctrol_ind',
]
df_dataset = df_dataset.drop(columns=columnas_descartadas, errors='ignore')

KeyError: ignored

In [None]:
df_dataset.shape

(13455, 93)

In [None]:
# Widen the pandas display limits so that all 93 columns/rows can be shown.
# Fully qualified option names are used: the short patterns 'max_rows' and 'max_columns'
# match more than one option in newer pandas releases, which makes set_option raise.
pd.set_option('display.max_rows', 93)
pd.set_option('display.max_columns', 93)
pd.set_option('display.width', 93)
pd.set_option('display.max_colwidth', 93)

In [None]:
nulos = df_dataset.isnull().sum()

In [None]:
nulos

CODUSU                    0
NRO_HOGAR                 0
AGLOMERADO                0
IV1                       0
IV1_ESP               13426
IV2                       0
IV3                       0
IV3_ESP               13428
IV4                       0
IV5                       0
IV6                       0
IV7                       0
IV7_ESP               13438
IV8                       0
IV9                       0
IV10                      0
IV11                      0
IV12_1                    0
IV12_2                    0
IV12_3                    0
II1                       0
II2                       0
II3                       0
II3_1                     0
II4_1                     0
II4_2                     0
II4_3                     0
II5                       0
II5_1                     0
II6                       0
II6_1                     0
II7                       0
II7_ESP               13436
II8                       0
II8_ESP               13348
II9                 

In [None]:
# vemos que las variables que tienen valores nulos son 
#IV1_ESP, IV3_ESP, IV7_ESP, II7_ESP, II8_ESP, IDECIFR, GDECIFR, PDECIFR, IDECCFR, GDECCFR, PDECCFR  

In [None]:
df_dataset['IV1_ESP'].notnull().sum()

29

In [None]:
# Collect the actual non-null IV1_ESP entries. Iterating over .notnull() only produced
# the True/False mask (one boolean per row), not the values themselves.
IV1_ESP_no_nulos = df_dataset['IV1_ESP'].dropna().tolist()

In [None]:
# List the columns stored with object dtype.
# NOTE(review): the original read from `df`, which is not defined in the visible notebook;
# `df_dataset` is the frame being cleaned in this section — confirm.
no_nulos = [var for var in df_dataset.columns if df_dataset[var].dtype == 'O']