In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer

In [2]:
ruta =  os.path.dirname(os.getcwd())
print(ruta)

/Users/macbookair/Documents/GitHub/predicting_poverty_bdmc


In [4]:
data = pd.read_csv(ruta+'/stores/base_completa.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,id,Clase,Dominio,cuartos,dormitorio,tipo_vivienda,nper,nper_ugasto,l_indigencia,...,trabajo.formal,pension,subsidio.alimenticio,subsidio.transporte,subsidio.familiar,subsidio.educativo,numero.menores.edad,individuos.hogar,proporcion.menores.edad,numero.tercera.edad
0,1,2a7ddc2779480d7f19834953,1,SANTA MARTA,4,3,1,5,5,121449.452925,...,0,0,0,0,0,0,0,5,0.0,1
1,2,a0c2e751e582fd49d564f308,1,SANTA MARTA,4,3,4,6,6,121449.452925,...,1,0,0,1,1,0,2,6,0.333333,1
2,3,57273d19e8464a5ff66a582b,2,RURAL,3,1,1,2,2,100763.337626,...,1,0,0,0,0,0,0,2,0.0,0
3,4,418d052ff7878940ab938601,1,MEDELLIN,4,3,1,5,5,122251.781628,...,1,0,0,1,1,0,0,5,0.0,1
4,5,212a37fc17016a3c78f76852,1,MEDELLIN,5,2,2,2,2,123664.359813,...,1,0,0,0,0,0,0,2,0.0,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231128 entries, 0 to 231127
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Unnamed: 0               231128 non-null  int64  
 1   id                       231128 non-null  object 
 2   Clase                    231128 non-null  int64  
 3   Dominio                  231128 non-null  object 
 4   cuartos                  231128 non-null  int64  
 5   dormitorio               231128 non-null  int64  
 6   tipo_vivienda            231128 non-null  int64  
 7   nper                     231128 non-null  int64  
 8   nper_ugasto              231128 non-null  int64  
 9   l_indigencia             231128 non-null  float64
 10  l_pobreza                231128 non-null  float64
 11  pobre                    164960 non-null  float64
 12  ingtotug                 164960 non-null  float64
 13  sample                   231128 non-null  object 
 14  nive

La linea de pobreza para el 2022 es de $396.864 pesos por persona según el DANE. [Enlace](https://www.dane.gov.co/index.php/estadisticas-por-tema/pobreza-y-condiciones-de-vida/pobreza-monetaria)tr

In [22]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66168 entries, 0 to 66167
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               66168 non-null  int64  
 1   id                       66168 non-null  object 
 2   Clase                    66168 non-null  int64  
 3   Dominio                  66168 non-null  object 
 4   cuartos                  66168 non-null  int64  
 5   dormitorio               66168 non-null  int64  
 6   tipo_vivienda            66168 non-null  int64  
 7   nper                     66168 non-null  int64  
 8   nper_ugasto              66168 non-null  int64  
 9   l_indigencia             66168 non-null  float64
 10  l_pobreza                66168 non-null  float64
 11  pobre                    0 non-null      float64
 12  ingtotug                 0 non-null      float64
 13  sample                   66168 non-null  object 
 14  nivel.educacion          66

In [30]:
# Crear las bases train y test
train_data = data[data['sample'] == 'train']
test_data = data[data['sample'] == 'test']

# Seleccionar variables
vars = ['Clase', 'nper', 'l_indigencia', 'l_pobreza', 'pension',
        'nivel.educacion', 'trabajo.formal', 'proporcion.menores.edad']

# Definir el modelo
adaboost_reg = AdaBoostRegressor(random_state = 123)

# Crear flujo de trabajo con preprocesamiento y clasificador
adaboost_workflow = Pipeline([
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('classifier', adaboost_reg)
])

# Definir la grilla de hiperparámetros
param_grid = {
    'classifier__n_estimators': [50, 80],
    'classifier__learning_rate': [0.01, 0.1, 0.5]
}

# F1 es la función de error que usamos en la grilla
f1_scorer = make_scorer(f1_score)

# Grilla
grid_search = GridSearchCV(adaboost_workflow, param_grid, cv = 5, scoring = 'neg_mean_squared_error')
grid_search.fit(train[vars], train['ingtotug'])

# Obtener los mejores parámetros
best_params = grid_search.best_params_

# Extraer los parámetros específicos de AdaBoostRegressor
adaboost_params = {k.split('__')[1]: v for k, v in best_params.items()}

# Añadir el parámetro random_state
adaboost_params['random_state'] = 123

# Instanciar AdaBoostRegressor con los mejores parámetros
adaboost_reg_best = AdaBoostRegressor(**adaboost_params)

# Entrenar el modelo con los mejores parámetros
adaboost_reg_best.fit(train[vars], train['ingtotug'])

In [31]:
threshold = test_data['l_pobreza']

# Predecir ingresos en datos de prueba
test_preds = adaboost_reg_best.predict(test_data[vars])

# Añadir las predicciones al conjunto de datos de prueba
test_data['Ingtot_Pred'] = test_preds
test_data['pobre'] = np.where(test_preds <= threshold, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Ingtot_Pred'] = test_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['pobre'] = np.where(test_preds <= threshold, 1, 0)


In [32]:
adaboost = test_data[['id', 'pobre']].copy()
adaboost.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66168 entries, 0 to 66167
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      66168 non-null  object
 1   pobre   66168 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [33]:
adaboost.to_csv(ruta + '/templates/adaboost_train.csv', index = False)