# TFM

##1. Configuacion

In [1]:
# Instalación de librerías necesarias
!pip install pyeph > /dev/null 2>&1
!pip install scikit-learn openpyxl > /dev/null 2>&1

In [2]:
# Importación de librerías
import pyeph
import pandas as pd
import numpy as np

from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC

##2. Extraer bases de datos
Se utiliza la libreria [pyeph](https://pypi.org/project/pyeph/)

In [3]:
# ----------------------------------------------------------------------
# Descarga y carga de datos EPH
# ----------------------------------------------------------------------

bases_individuales = {}
bases_hogar = {}

for ano in range(2023, 2024):
    for trimestre in range(1, 5):
        clave = f"{ano}T{trimestre}"
        try:
            bases_individuales[clave] = pyeph.obtener(data="eph", ano=ano, periodo=trimestre, tipo_base="individual")
            print(f"✓ Base individual {clave} descargada")
        except Exception as e:
            print(f"✗ Error en base individual {clave}: {e}")

        try:
            bases_hogar[clave] = pyeph.obtener(data="eph", ano=ano, periodo=trimestre, tipo_base="hogar")
            print(f"✓ Base hogar {clave} descargada")
        except Exception as e:
            print(f"✗ Error en base hogar {clave}: {e}")

Obtenido con exito: base_individual_2023T1.zip 
✓ Base individual 2023T1 descargada
Obtenido con exito: base_hogar_2023T1.zip 
✓ Base hogar 2023T1 descargada
Obtenido con exito: base_individual_2023T2.zip 
✓ Base individual 2023T2 descargada
Obtenido con exito: base_hogar_2023T2.zip 
✓ Base hogar 2023T2 descargada
Obtenido con exito: base_individual_2023T3.zip 
✓ Base individual 2023T3 descargada
Obtenido con exito: base_hogar_2023T3.zip 
✓ Base hogar 2023T3 descargada
Obtenido con exito: base_individual_2023T4.zip 
✓ Base individual 2023T4 descargada
Obtenido con exito: base_hogar_2023T4.zip 
✓ Base hogar 2023T4 descargada


In [4]:
# ----------------------------------------------------------------------
# Exportar a Excel las bases del 4º trimestre 2023
# ----------------------------------------------------------------------

clave_exportar = "2023T4"

# Exportar base individual
df_individual_export = bases_individuales[clave_exportar].copy()
df_individual_export.to_excel(f"base_individual_{clave_exportar}.xlsx", index=False)

# Exportar base hogar
df_hogar_export = bases_hogar[clave_exportar].copy()
df_hogar_export.to_excel(f"base_hogar_{clave_exportar}.xlsx", index=False)

# Vista previa de la base hogar exportada
df_hogar_export.head()

Unnamed: 0,CODUSU,ANO4,TRIMESTRE,NRO_HOGAR,REALIZADA,REGION,MAS_500,AGLOMERADO,PONDERA,IV1,...,GDECCFR,PDECCFR,ADECCFR,PONDIH,VII1_1,VII1_2,VII2_1,VII2_2,VII2_3,VII2_4
0,TQRMNORSQHKMKUCDEIGED00791045,2023,4,2,1,43,N,30,107,1,...,,9.0,7,128,1,0,98,0,0,0
1,TQRMNOPVSHJMLNCDEIGED00801502,2023,4,1,1,43,N,30,137,1,...,,12.0,12,0,1,0,2,0,0,0
2,TQRMNORTYHMLKPCDEHJGH00810920,2023,4,1,1,40,S,23,356,2,...,7.0,,8,329,2,0,1,0,0,0
3,TQRMNOQTWHKLKQCDEHJGH00790284,2023,4,2,1,40,S,23,299,1,...,1.0,,1,320,1,0,2,0,0,0
4,TQRMNORQVHLLKRCDEHJGH00853836,2023,4,1,1,40,S,23,221,1,...,6.0,,8,193,1,0,96,0,0,0


##3. Procesar datos

###3.1 Crear bases

In [5]:
# ----------------------------------------------------------------------
# Procesamiento base HOGAR (4° trimestre 2023)
# ----------------------------------------------------------------------

clave = "2023T4"

variables_hogar_interes = [
    'CODUSU', 'NRO_HOGAR', 'ANO4', 'TRIMESTRE', 'REGION', 'MAS_500', 'AGLOMERADO',
    'PONDERA', 'PONDIH', 'IX_TOT', 'IPCF',
    'IV1', 'IV2', 'IV3', 'IV4', 'IV5', 'IV6', 'IV7', 'IV8', 'IV9', 'IV10', 'IV11',
    'IV12_1', 'IV12_2', 'IV12_3',
    'II1', 'II2', 'II3', 'II4_1', 'II4_2', 'II4_3', 'II7', 'II8', 'II9',
    'V1', 'V2', 'V7', 'V8', 'V9', 'V10', 'V19_A', 'V19_B',
    'VII1_1', 'VII1_2', 'VII2_1', 'VII2_2', 'VII2_3', 'VII2_4'
]

variables_filtradas = [
    'CODUSU', 'PONDERA', 'IV1', 'IV2', 'IV3', 'IV4', 'IV5', 'IV6', 'IV7',
    'IV8', 'IV9', 'IV10', 'IV11', 'IV12_1', 'IV12_2', 'IV12_3',
    'II1', 'II2', 'II3', 'II4_1', 'II4_2', 'II4_3', 'II7', 'II8', 'II9',
    'V1', 'V2', 'V7', 'V8', 'V9', 'V10', 'V19_A', 'V19_B',
    'IX_TOT', 'IPCF', 'PONDIH',
    'VII1_1', 'VII1_2', 'VII2_1', 'VII2_2', 'VII2_3', 'VII2_4'
]

df_hogar = bases_hogar[clave][variables_hogar_interes].copy()
df_hogar = df_hogar[variables_filtradas].copy()

# Corrección decimal en IPCF
df_hogar['IPCF'] = pd.to_numeric(df_hogar['IPCF'].astype(str).str.replace(",", "."), errors='coerce')

# Reemplazo de valores especiales en VII*
variables_vii = ['VII1_1', 'VII1_2', 'VII2_1', 'VII2_2', 'VII2_3', 'VII2_4']
df_hogar[variables_vii] = df_hogar[variables_vii].replace(0, 98)

# Conversión general a numérico (menos CODUSU)
for col in df_hogar.columns:
    if col != 'CODUSU':
        df_hogar[col] = pd.to_numeric(df_hogar[col], errors='coerce')

# Rangos válidos (valores permitidos)
rangos_validos = {
    'IV1': (1, 5), 'IV2': (1, 10), 'IV3': (1, 3), 'IV4': (1, 9), 'IV5': (1, 2),
    'IV6': (1, 3), 'IV7': (1, 3), 'IV8': (1, 2), 'IV9': (1, 3), 'IV10': (1, 3), 'IV11': (1, 4),
    'IV12_1': (1, 2), 'IV12_2': (1, 2), 'IV12_3': (1, 2),
    'II1': (1, 10), 'II2': (1, 10), 'II3': (1, 2), 'II4_1': (1, 2), 'II4_2': (1, 2), 'II4_3': (1, 2),
    'II7': (1, 8), 'II8': (1, 3), 'II9': (1, 4),
    'V1': (1, 2), 'V2': (1, 2), 'V7': (1, 2), 'V8': (1, 2), 'V9': (1, 2), 'V10': (1, 2),
    'V19_A': (1, 2), 'V19_B': (1, 2), 'IX_TOT': (1, 20), 'IPCF': (0, 5000001),
    'VII1_1': (1, 50), 'VII1_2': (1, 50), 'VII2_1': (1, 50), 'VII2_2': (1, 50),
    'VII2_3': (1, 50), 'VII2_4': (1, 50)
}

valores_extra_vii = [96, 97, 98]

# Filtro por rangos válidos
for var, (min_val, max_val) in rangos_validos.items():
    if var in variables_vii:
        df_hogar = df_hogar[df_hogar[var].isin(list(range(min_val, max_val + 1)) + valores_extra_vii)]
    else:
        df_hogar = df_hogar[(df_hogar[var] >= min_val) & (df_hogar[var] <= max_val)]

# Exportar base hogar
df_hogar.to_excel("df_hogar_2023T4.xlsx", index=False)

df_hogar

Unnamed: 0,CODUSU,PONDERA,IV1,IV2,IV3,IV4,IV5,IV6,IV7,IV8,...,V19_B,IX_TOT,IPCF,PONDIH,VII1_1,VII1_2,VII2_1,VII2_2,VII2_3,VII2_4
0,TQRMNORSQHKMKUCDEIGED00791045,107,1,4,1,1,1,1,1,1,...,2,3,283333.33,128,1,98,98,98,98,98
1,TQRMNOPVSHJMLNCDEIGED00801502,137,1,3,1,1,2,1,1,1,...,2,2,0.00,0,1,98,2,98,98,98
2,TQRMNORTYHMLKPCDEHJGH00810920,356,2,4,1,9,1,1,1,1,...,2,4,192500.00,329,2,98,1,98,98,98
3,TQRMNOQTWHKLKQCDEHJGH00790284,299,1,6,1,3,1,1,1,1,...,2,3,33333.33,320,1,98,2,98,98,98
4,TQRMNORQVHLLKRCDEHJGH00853836,221,1,5,1,2,1,1,1,1,...,2,2,150000.00,193,1,98,96,98,98,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16456,TQRMNOSXVHKMLMCDEFMDB00787973,180,1,4,1,1,1,1,1,1,...,2,4,30000.00,322,1,98,2,3,4,98
16457,TQRMNOQQYHLOLMCDEFMDB00852214,170,2,3,1,1,1,1,1,1,...,2,2,0.00,0,2,98,98,98,98,98
16458,TQRMNOUQYHLOLNCDEFMDB00852215,249,1,2,1,4,2,1,1,1,...,2,3,0.00,0,1,98,98,98,98,98
16460,TQRMNOSXQHJMKQCDEGKDB00794419,92,1,5,1,4,1,1,1,1,...,2,12,0.00,0,1,7,8,3,98,98


In [6]:
# ----------------------------------------------------------------------
# Procesamiento base INDIVIDUAL (4° trimestre 2023)
# ----------------------------------------------------------------------

variables_individuo = [
    'CODUSU', 'COMPONENTE', 'PONDERA',
    'CH04', 'CH06', 'CH07', 'CH08', 'CH09', 'CH10',
    'CH11', 'CH12', 'CH13', 'NIVEL_ED', 'ESTADO', 'CAT_OCUP', 'CAT_INAC'
]

df_ind = bases_individuales[clave][variables_individuo].copy()
df_ind['CH11'] = df_ind['CH11'].replace(0, 9)
df_ind['CH12'] = df_ind['CH12'].replace(0, 1)

rangos_validos_ind = {
    'COMPONENTE': (1, 20), 'CH04': (1, 2), 'CH06': (0, 105),
    'CH07': (1, 5), 'CH08': [1, 2, 3, 4, 9, 12, 13, 23, 123],
    'CH09': (1, 3), 'CH10': (1, 3), 'CH11': [1, 2, 9],
    'CH12': (1, 9), 'CH13': (0, 2), 'NIVEL_ED': (1, 7),
    'ESTADO': (1, 4), 'CAT_OCUP': (0, 4), 'CAT_INAC': (0, 7)
}

for var, criterio in rangos_validos_ind.items():
    if isinstance(criterio, tuple):
        df_ind = df_ind[(df_ind[var] >= criterio[0]) & (df_ind[var] <= criterio[1])]
    elif isinstance(criterio, list):
        df_ind = df_ind[df_ind[var].isin(criterio)]

# ----------------------------------------------------------------------
# Cálculo de variables educativas: EDA_ESC, EDA_ESP, IEAE
# ----------------------------------------------------------------------

# Años de escolaridad alcanzados según NIVEL_ED
mapa_eda_esc = {
    7: 5,
    1: 8.5,
    2: 12,
    3: 14.5,
    4: 17,
    5: 19,
    6: 21
}
df_ind['EDA_ESC'] = df_ind['NIVEL_ED'].map(mapa_eda_esc)

# Años de escolaridad esperados según edad (CH06)
df_ind['EDA_ESP'] = np.select(
    [
        df_ind['CH06'] <= 5,
        df_ind['CH06'] >= 22
    ],
    [
        5,
        22
    ],
    default=df_ind['CH06']
)

# Índice Educativo Ajustado por Edad
df_ind['IEAE'] = df_ind['EDA_ESC'] / df_ind['EDA_ESP']
df_ind['IEAE'] = df_ind['IEAE'].clip(upper=1)

# Exportar base hogar

df_ind.to_excel("df_individual_2023T4.xlsx", index=False)

df_ind

Unnamed: 0,CODUSU,COMPONENTE,PONDERA,CH04,CH06,CH07,CH08,CH09,CH10,CH11,CH12,CH13,NIVEL_ED,ESTADO,CAT_OCUP,CAT_INAC,EDA_ESC,EDA_ESP,IEAE
0,TQRMNOSUWHLMKQCDEFNFF00852219,1,370,1,60,2,1,1,2,9,7,1,6,1,3,0,21.0,22,0.954545
1,TQRMNOSUWHLMKQCDEFNFF00852219,2,370,2,52,2,1,1,2,9,4,1,4,3,0,4,17.0,22,0.772727
2,TQRMNOTXQHKOLMCDEGNFJ00794590,1,281,1,34,1,1,1,2,9,4,1,4,1,3,0,17.0,22,0.772727
3,TQRMNOTXQHKOLMCDEGNFJ00794590,2,281,2,32,1,1,1,2,9,4,1,4,1,3,0,17.0,22,0.772727
4,TQRMNOTXQHKOLMCDEGNFJ00794590,3,281,2,6,5,1,1,1,1,2,2,1,4,0,3,8.5,6,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47332,TQRMNORXTHLOLSCDEGJBF00853031,1,634,1,80,1,1,2,2,9,2,2,1,3,0,7,8.5,22,0.386364
47333,TQRMNORXTHLOLSCDEGJBF00853031,2,634,2,47,5,1,1,2,9,4,1,4,1,3,0,17.0,22,0.772727
47334,TQRMNORXTHLOLSCDEGJBF00853031,3,634,1,42,5,4,1,2,9,2,2,1,1,2,0,8.5,22,0.386364
47335,TQRMNORXTHLOLSCDEGJBF00853031,4,634,2,58,1,4,1,2,9,2,2,1,3,0,7,8.5,22,0.386364


###3.2 Conformacion base de interes

In [7]:
# ----------------------------------------------------------------------
# Construcción de base final para estimaciones (hogar + indicadores)
# ----------------------------------------------------------------------

# 1. Hacinamiento: IX_TOT / II2
df_hogar['HAC'] = df_hogar['IX_TOT'] / df_hogar['II2']

# 2-7. Indicadores a nivel hogar desde base individual
df_ind_agg = df_ind.groupby('CODUSU').agg(
    EDA_sum=('CH06', 'sum'),
    FEM_sum=('CH04', lambda x: (x == 2).sum()),
    ANALFABET_sum=('CH09', lambda x: (x == 2).sum()),
    DESOCUP_sum=('ESTADO', lambda x: (x == 2).sum()),
    INACTIVOS_sum=('ESTADO', lambda x: (x == 3).sum()),
    IEAE_sum=('IEAE', 'sum')
).reset_index()

# Unir con base hogar
df_modelo = df_hogar.merge(df_ind_agg, on='CODUSU', how='inner')

# Calcular proporciones normalizadas por IX_TOT
df_modelo['EDA'] = df_modelo['EDA_sum'] / df_modelo['IX_TOT']
df_modelo['FEM'] = df_modelo['FEM_sum'] / df_modelo['IX_TOT']
df_modelo['ANALFABET'] = df_modelo['ANALFABET_sum'] / df_modelo['IX_TOT']
df_modelo['DESOCUP'] = df_modelo['DESOCUP_sum'] / df_modelo['IX_TOT']
df_modelo['INACTIVOS'] = df_modelo['INACTIVOS_sum'] / df_modelo['IX_TOT']
df_modelo['NE'] = df_modelo['IEAE_sum'] / df_modelo['IX_TOT']

# Exportar base final
df_modelo.to_excel("df_modelo_estimaciones_2023T4.xlsx", index=False)

# Vista previa
df_modelo

Unnamed: 0,CODUSU,PONDERA,IV1,IV2,IV3,IV4,IV5,IV6,IV7,IV8,...,ANALFABET_sum,DESOCUP_sum,INACTIVOS_sum,IEAE_sum,EDA,FEM,ANALFABET,DESOCUP,INACTIVOS,NE
0,TQRMNORSQHKMKUCDEIGED00791045,107,1,4,1,1,1,1,1,1,...,0,0,1,2.954545,18.666667,1.000000,0.000000,0.000000,0.333333,0.984848
1,TQRMNOPVSHJMLNCDEIGED00801502,137,1,3,1,1,2,1,1,1,...,0,0,2,1.727273,67.500000,0.500000,0.000000,0.000000,1.000000,0.863636
2,TQRMNORTYHMLKPCDEHJGH00810920,356,2,4,1,9,1,1,1,1,...,0,0,0,2.762626,21.250000,0.500000,0.000000,0.000000,0.000000,0.690657
3,TQRMNOQTWHKLKQCDEHJGH00790284,299,1,6,1,3,1,1,1,1,...,0,1,0,1.818182,26.666667,0.333333,0.000000,0.333333,0.000000,0.606061
4,TQRMNORQVHLLKRCDEHJGH00853836,221,1,5,1,2,1,1,1,1,...,0,0,1,1.860795,31.500000,1.000000,0.000000,0.000000,0.500000,0.930398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16033,TQRMNOSXVHKMLMCDEFMDB00787973,180,1,4,1,1,1,1,1,1,...,0,2,2,3.386364,31.750000,0.750000,0.000000,0.500000,0.500000,0.846591
16034,TQRMNOQQYHLOLMCDEFMDB00852214,170,2,3,1,1,1,1,1,1,...,0,0,1,1.090909,73.500000,0.500000,0.000000,0.000000,0.500000,0.545455
16035,TQRMNOUQYHLOLNCDEFMDB00852215,249,1,2,1,4,2,1,1,1,...,0,0,0,2.213203,43.666667,0.666667,0.000000,0.000000,0.000000,0.737734
16036,TQRMNOSXQHJMKQCDEGKDB00794419,92,1,5,1,4,1,1,1,1,...,1,0,4,8.805240,27.916667,0.500000,0.083333,0.000000,0.333333,0.733770


##4. Regresion
$${NE}_i = f(X_i) + e_i$$

###4.1 MCO con seleccion de variables

###4.2 Bosques aleatorios

###4.3 Support Vector Regressor (SVR)

##5. Clasificacion
$$PE_i = \mathbb{I}({NE}_i<\tau)$$

$\tau$: Mediana

###5.1 Bosques aleatorios

###5.2 Support Vector Machine (SVM)