# TFM

##1. Configuacion

In [1]:
# Instalación de librerías necesarias
!pip install pyeph > /dev/null 2>&1
!pip install scikit-learn openpyxl > /dev/null 2>&1

In [2]:
# Importación de librerías
import pyeph
import pandas as pd
import numpy as np

from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC

##2. Extraer bases de datos
Se utiliza la libreria [pyeph](https://pypi.org/project/pyeph/)

In [3]:
# ----------------------------------------------------------------------
# Descarga y carga de datos EPH
# ----------------------------------------------------------------------

bases_individuales = {}
bases_hogar = {}

for ano in range(2023, 2024):
    for trimestre in range(1, 5):
        clave = f"{ano}T{trimestre}"
        try:
            bases_individuales[clave] = pyeph.obtener(data="eph", ano=ano, periodo=trimestre, tipo_base="individual")
            print(f"✓ Base individual {clave} descargada")
        except Exception as e:
            print(f"✗ Error en base individual {clave}: {e}")

        try:
            bases_hogar[clave] = pyeph.obtener(data="eph", ano=ano, periodo=trimestre, tipo_base="hogar")
            print(f"✓ Base hogar {clave} descargada")
        except Exception as e:
            print(f"✗ Error en base hogar {clave}: {e}")

Obtenido con exito: base_individual_2023T1.zip 
✓ Base individual 2023T1 descargada
Obtenido con exito: base_hogar_2023T1.zip 
✓ Base hogar 2023T1 descargada
Obtenido con exito: base_individual_2023T2.zip 
✓ Base individual 2023T2 descargada
Obtenido con exito: base_hogar_2023T2.zip 
✓ Base hogar 2023T2 descargada
Obtenido con exito: base_individual_2023T3.zip 
✓ Base individual 2023T3 descargada
Obtenido con exito: base_hogar_2023T3.zip 
✓ Base hogar 2023T3 descargada
Obtenido con exito: base_individual_2023T4.zip 
✓ Base individual 2023T4 descargada
Obtenido con exito: base_hogar_2023T4.zip 
✓ Base hogar 2023T4 descargada


In [4]:
# ----------------------------------------------------------------------
# Exportar a Excel las bases del 4º trimestre 2023
# ----------------------------------------------------------------------

clave_exportar = "2023T4"

# Exportar base individual
df_individual_export = bases_individuales[clave_exportar].copy()
df_individual_export.to_excel(f"base_individual_{clave_exportar}.xlsx", index=False)

# Exportar base hogar
df_hogar_export = bases_hogar[clave_exportar].copy()
df_hogar_export.to_excel(f"base_hogar_{clave_exportar}.xlsx", index=False)

# Vista previa de la base hogar exportada
df_hogar_export.head()

##3. Procesar datos

###3.1 Crear bases

In [4]:
# ----------------------------------------------------------------------
# Procesamiento base HOGAR (4° trimestre 2023)
# ----------------------------------------------------------------------

clave = "2023T4"

variables_hogar_interes = [
    'CODUSU', 'NRO_HOGAR', 'ANO4', 'TRIMESTRE', 'REGION', 'MAS_500', 'AGLOMERADO',
    'PONDERA', 'PONDIH', 'IX_TOT', 'IPCF',
    'IV1', 'IV2', 'IV3', 'IV4', 'IV5', 'IV6', 'IV7', 'IV8', 'IV9', 'IV10', 'IV11',
    'IV12_1', 'IV12_2', 'IV12_3',
    'II1', 'II2', 'II3', 'II4_1', 'II4_2', 'II4_3', 'II7', 'II8', 'II9',
    'V1', 'V2', 'V7', 'V8', 'V9', 'V10', 'V19_A', 'V19_B',
    'VII1_1', 'VII1_2', 'VII2_1', 'VII2_2', 'VII2_3', 'VII2_4'
]

variables_filtradas = [
    'CODUSU', 'PONDERA', 'IV1', 'IV2', 'IV3', 'IV4', 'IV5', 'IV6', 'IV7',
    'IV8', 'IV9', 'IV10', 'IV11', 'IV12_1', 'IV12_2', 'IV12_3',
    'II1', 'II2', 'II3', 'II4_1', 'II4_2', 'II4_3', 'II7', 'II8', 'II9',
    'V1', 'V2', 'V7', 'V8', 'V9', 'V10', 'V19_A', 'V19_B',
    'IX_TOT', 'IPCF', 'PONDIH',
    'VII1_1', 'VII1_2', 'VII2_1', 'VII2_2', 'VII2_3', 'VII2_4'
]

df_hogar = bases_hogar[clave][variables_hogar_interes].copy()
df_hogar = df_hogar[variables_filtradas].copy()

# Corrección decimal en IPCF
df_hogar['IPCF'] = pd.to_numeric(df_hogar['IPCF'].astype(str).str.replace(",", "."), errors='coerce')

# Reemplazo de valores especiales en VII*
variables_vii = ['VII1_1', 'VII1_2', 'VII2_1', 'VII2_2', 'VII2_3', 'VII2_4']
df_hogar[variables_vii] = df_hogar[variables_vii].replace(0, 98)

# Conversión general a numérico (menos CODUSU)
for col in df_hogar.columns:
    if col != 'CODUSU':
        df_hogar[col] = pd.to_numeric(df_hogar[col], errors='coerce')

# Rangos válidos (valores permitidos)
rangos_validos = {
    'IV1': (1, 5), 'IV2': (1, 10), 'IV3': (1, 3), 'IV4': (1, 9), 'IV5': (1, 2),
    'IV6': (1, 3), 'IV7': (1, 3), 'IV8': (1, 2), 'IV9': (1, 3), 'IV10': (1, 3), 'IV11': (1, 4),
    'IV12_1': (1, 2), 'IV12_2': (1, 2), 'IV12_3': (1, 2),
    'II1': (1, 10), 'II2': (1, 10), 'II3': (1, 2), 'II4_1': (1, 2), 'II4_2': (1, 2), 'II4_3': (1, 2),
    'II7': (1, 8), 'II8': (1, 3), 'II9': (1, 4),
    'V1': (1, 2), 'V2': (1, 2), 'V7': (1, 2), 'V8': (1, 2), 'V9': (1, 2), 'V10': (1, 2),
    'V19_A': (1, 2), 'V19_B': (1, 2), 'IX_TOT': (1, 20), 'IPCF': (0, 5000001),
    'VII1_1': (1, 50), 'VII1_2': (1, 50), 'VII2_1': (1, 50), 'VII2_2': (1, 50),
    'VII2_3': (1, 50), 'VII2_4': (1, 50)
}

valores_extra_vii = [96, 97, 98]

# Filtro por rangos válidos
for var, (min_val, max_val) in rangos_validos.items():
    if var in variables_vii:
        df_hogar = df_hogar[df_hogar[var].isin(list(range(min_val, max_val + 1)) + valores_extra_vii)]
    else:
        df_hogar = df_hogar[(df_hogar[var] >= min_val) & (df_hogar[var] <= max_val)]

df_hogar

Unnamed: 0,CODUSU,PONDERA,IV1,IV2,IV3,IV4,IV5,IV6,IV7,IV8,...,V19_B,IX_TOT,IPCF,PONDIH,VII1_1,VII1_2,VII2_1,VII2_2,VII2_3,VII2_4
0,TQRMNORSQHKMKUCDEIGED00791045,107,1,4,1,1,1,1,1,1,...,2,3,283333.33,128,1,98,98,98,98,98
1,TQRMNOPVSHJMLNCDEIGED00801502,137,1,3,1,1,2,1,1,1,...,2,2,0.00,0,1,98,2,98,98,98
2,TQRMNORTYHMLKPCDEHJGH00810920,356,2,4,1,9,1,1,1,1,...,2,4,192500.00,329,2,98,1,98,98,98
3,TQRMNOQTWHKLKQCDEHJGH00790284,299,1,6,1,3,1,1,1,1,...,2,3,33333.33,320,1,98,2,98,98,98
4,TQRMNORQVHLLKRCDEHJGH00853836,221,1,5,1,2,1,1,1,1,...,2,2,150000.00,193,1,98,96,98,98,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16456,TQRMNOSXVHKMLMCDEFMDB00787973,180,1,4,1,1,1,1,1,1,...,2,4,30000.00,322,1,98,2,3,4,98
16457,TQRMNOQQYHLOLMCDEFMDB00852214,170,2,3,1,1,1,1,1,1,...,2,2,0.00,0,2,98,98,98,98,98
16458,TQRMNOUQYHLOLNCDEFMDB00852215,249,1,2,1,4,2,1,1,1,...,2,3,0.00,0,1,98,98,98,98,98
16460,TQRMNOSXQHJMKQCDEGKDB00794419,92,1,5,1,4,1,1,1,1,...,2,12,0.00,0,1,7,8,3,98,98


In [5]:
# ----------------------------------------------------------------------
# Procesamiento base INDIVIDUAL (4° trimestre 2023)
# ----------------------------------------------------------------------

variables_individuo = [
    'CODUSU', 'COMPONENTE', 'PONDERA',
    'CH04', 'CH06', 'CH07', 'CH08', 'CH09', 'CH10',
    'CH11', 'CH12', 'CH13', 'NIVEL_ED', 'ESTADO', 'CAT_OCUP', 'CAT_INAC'
]

df_ind = bases_individuales[clave][variables_individuo].copy()
df_ind['CH11'] = df_ind['CH11'].replace(0, 9)
df_ind['CH12'] = df_ind['CH12'].replace(0, 1)

rangos_validos_ind = {
    'COMPONENTE': (1, 20), 'CH04': (1, 2), 'CH06': (0, 105),
    'CH07': (1, 5), 'CH08': [1, 2, 3, 4, 9, 12, 13, 23, 123],
    'CH09': (1, 3), 'CH10': (1, 3), 'CH11': [1, 2, 9],
    'CH12': (1, 9), 'CH13': (0, 2), 'NIVEL_ED': (1, 7),
    'ESTADO': (1, 4), 'CAT_OCUP': (0, 4), 'CAT_INAC': (0, 7)
}

for var, criterio in rangos_validos_ind.items():
    if isinstance(criterio, tuple):
        df_ind = df_ind[(df_ind[var] >= criterio[0]) & (df_ind[var] <= criterio[1])]
    elif isinstance(criterio, list):
        df_ind = df_ind[df_ind[var].isin(criterio)]

df_ind

Unnamed: 0,CODUSU,COMPONENTE,PONDERA,CH04,CH06,CH07,CH08,CH09,CH10,CH11,CH12,CH13,NIVEL_ED,ESTADO,CAT_OCUP,CAT_INAC
0,TQRMNOSUWHLMKQCDEFNFF00852219,1,370,1,60,2,1,1,2,9,7,1,6,1,3,0
1,TQRMNOSUWHLMKQCDEFNFF00852219,2,370,2,52,2,1,1,2,9,4,1,4,3,0,4
2,TQRMNOTXQHKOLMCDEGNFJ00794590,1,281,1,34,1,1,1,2,9,4,1,4,1,3,0
3,TQRMNOTXQHKOLMCDEGNFJ00794590,2,281,2,32,1,1,1,2,9,4,1,4,1,3,0
4,TQRMNOTXQHKOLMCDEGNFJ00794590,3,281,2,6,5,1,1,1,1,2,2,1,4,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47332,TQRMNORXTHLOLSCDEGJBF00853031,1,634,1,80,1,1,2,2,9,2,2,1,3,0,7
47333,TQRMNORXTHLOLSCDEGJBF00853031,2,634,2,47,5,1,1,2,9,4,1,4,1,3,0
47334,TQRMNORXTHLOLSCDEGJBF00853031,3,634,1,42,5,4,1,2,9,2,2,1,1,2,0
47335,TQRMNORXTHLOLSCDEGJBF00853031,4,634,2,58,1,4,1,2,9,2,2,1,3,0,7


###3.2 Conformacion base de interes

In [12]:
# Unir las bases por CODUSU
df_merged = pd.merge(df_ind, df_hogar[['CODUSU', 'IX_TOT', 'II2']], on='CODUSU', how='inner')

# Filtrar miembros mayores de 6 años
df_educativos = df_merged[df_merged['CH06'] > 6].copy()

def calcular_escolaridad(row):
    nivel = row['CH12']
    finalizo = row['CH13']
    ultimo_ano = row.get('CH14', np.nan)

    if finalizo == 1:  # Finalizó el nivel
        if nivel == 1:  # jardín
            return 5
        elif nivel == 2:  # primario
            return 12
        elif nivel == 4 or nivel == 5:  # secundario / polimodal
            return 17
        elif nivel == 6 or nivel == 7:  # terciario / universitario
            return 21
        elif nivel == 8:  # posgrado
            return 23
        else:
            return np.nan
    elif finalizo == 2:  # No finalizó
        if nivel in [4, 5] and pd.notnull(ultimo_ano) and ultimo_ano < 98:
            return 7 + int(ultimo_ano)  # asumo primaria (6) + año cursado
        elif nivel == 3 and pd.notnull(ultimo_ano) and ultimo_ano < 98:
            return 6 + int(ultimo_ano)
    return np.nan

df_educativos['ESCOL_REAL'] = df_educativos.apply(calcular_escolaridad, axis=1)

df_educativos['ESCOL_ESP'] = df_educativos['CH06'] - 5
df_educativos['ESCOL_ESP'] = df_educativos['ESCOL_ESP'].clip(lower=0, upper=15)

df_educativos['IEAE'] = df_educativos['ESCOL_REAL'] / df_educativos['ESCOL_ESP']
df_educativos['IEAE'] = df_educativos['IEAE'].clip(upper=1)  # IEAE no puede ser mayor a 1

# NE: promedio de IEAE por hogar
ne_por_hogar = df_educativos.groupby('CODUSU')['IEAE'].mean().reset_index().rename(columns={'IEAE': 'NE'})

# EDA: edad promedio por hogar
edad_por_hogar = df_merged.groupby('CODUSU')['CH06'].mean().reset_index().rename(columns={'CH06': 'EDA'})

# FEM: proporción de mujeres por hogar
fem_por_hogar = df_merged.groupby('CODUSU')['CH04'].apply(lambda x: (x == 2).sum() / len(x)).reset_index().rename(columns={'CH04': 'FEM'})

# ANALFABAT: proporción de analfabetos por hogar
analfa_por_hogar = df_merged.groupby('CODUSU')['CH09'].apply(lambda x: (x == 2).sum() / len(x)).reset_index().rename(columns={'CH09': 'ANALFABAT'})

# Merge todas las variables
df_vars = df_hogar.copy()
df_vars = df_vars.merge(ne_por_hogar, on='CODUSU', how='left')
df_vars = df_vars.merge(edad_por_hogar, on='CODUSU', how='left')
df_vars = df_vars.merge(fem_por_hogar, on='CODUSU', how='left')
df_vars = df_vars.merge(analfa_por_hogar, on='CODUSU', how='left')

# HA: índice de hacinamiento
df_vars['HA'] = df_vars['IX_TOT'] / df_vars['II2']

# Exportar df_vars a Excel
df_vars.to_excel("df_vars.xlsx", index=False)

df_vars

Unnamed: 0,CODUSU,PONDERA,IV1,IV2,IV3,IV4,IV5,IV6,IV7,IV8,...,VII1_2,VII2_1,VII2_2,VII2_3,VII2_4,NE,EDA,FEM,ANALFABAT,HA
0,TQRMNORSQHKMKUCDEIGED00791045,107,1,4,1,1,1,1,1,1,...,98,98,98,98,98,1.000000,18.666667,1.000000,0.000000,1.500000
1,TQRMNOPVSHJMLNCDEIGED00801502,137,1,3,1,1,2,1,1,1,...,98,2,98,98,98,1.000000,67.500000,0.500000,0.000000,2.000000
2,TQRMNORTYHMLKPCDEHJGH00810920,356,2,4,1,9,1,1,1,1,...,98,1,98,98,98,1.000000,28.333333,0.666667,0.000000,2.000000
3,TQRMNOQTWHKLKQCDEHJGH00790284,299,1,6,1,3,1,1,1,1,...,98,2,98,98,98,1.000000,40.000000,0.500000,0.000000,3.000000
4,TQRMNORQVHLLKRCDEHJGH00853836,221,1,5,1,2,1,1,1,1,...,98,96,98,98,98,1.000000,31.500000,1.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16037,TQRMNOSXVHKMLMCDEFMDB00787973,180,1,4,1,1,1,1,1,1,...,98,2,3,4,98,,31.750000,0.750000,0.000000,1.333333
16038,TQRMNOQQYHLOLMCDEFMDB00852214,170,2,3,1,1,1,1,1,1,...,98,98,98,98,98,0.800000,73.500000,0.500000,0.000000,2.000000
16039,TQRMNOUQYHLOLNCDEFMDB00852215,249,1,2,1,4,2,1,1,1,...,98,98,98,98,98,,43.666667,0.666667,0.000000,1.500000
16040,TQRMNOSXQHJMKQCDEGKDB00794419,92,1,5,1,4,1,1,1,1,...,7,8,3,98,98,0.914286,30.454545,0.545455,0.090909,3.000000


##4. Regresion
$${NE}_i = f(X_i) + e_i$$

###4.1 MCO con seleccion de variables

In [None]:
# Preparar datos para la regresión

# Filtrar hogares con NE definido
df_modelo = df_hogar_interes[df_hogar_interes['NE'].notnull()].copy()

# Variables explicativas (X)
X = df_modelo[[
    'hacinamiento',
    'ITF', 'IPCF',
    'proporcion_mujeres',
    'promedio_edad',
    'proporcion_analfabetismo',
    'proporcion_desocupados'
]].copy()

# Limpiar comas decimales y convertir a float
for col in X.columns:
    X[col] = X[col].astype(str).str.replace(",", ".", regex=False)
    X[col] = pd.to_numeric(X[col], errors="coerce")

# Variable dependiente
y = df_modelo['NE']

# Eliminar filas con valores faltantes
df_modelo_limpio = pd.concat([X, y], axis=1).dropna()
X = df_modelo_limpio.drop(columns='NE')
y = df_modelo_limpio['NE']

In [None]:
# Crear y entrenar modelo con Elastic Net

# Separar conjunto de entrenamiento y testeo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y entrenar pipeline con estandarización + ElasticNetCV
modelo = make_pipeline(
    StandardScaler(),
    ElasticNetCV(cv=5, l1_ratio=[.1, .5, .7, .9, .95, 1], random_state=42)
)

modelo.fit(X_train, y_train)

In [None]:
# Evaluar el modelo

y_pred = modelo.predict(X_test)
print("R²:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

# Mostrar coeficientes
coefs = modelo.named_steps['elasticnetcv'].coef_
for var, coef in zip(X.columns, coefs):
    print(f"{var}: {coef:.4f}")

In [None]:
# Mostrar los coeficientes seleccionados

# Obtener coeficientes del modelo entrenado
coefs = modelo.named_steps['elasticnetcv'].coef_
variables = X.columns

print("Coeficientes del modelo (Elastic Net):")
for var, coef in zip(variables, coefs):
    print(f"{var}: {coef:.4f}")

###4.2 Bosques aleatorios

In [None]:
# Preparar datos

# Filtrar hogares con NE definido
df_modelo = df_hogar_interes[df_hogar_interes['NE'].notnull()].copy()

# Variables independientes (X)
X = df_modelo[[
    'hacinamiento',
    'ITF', 'IPCF',
    'proporcion_mujeres',
    'promedio_edad',
    'proporcion_analfabetismo',
    'proporcion_desocupados'
]].copy()

# Convertir comas a puntos y asegurar tipo float
for col in X.columns:
    X[col] = X[col].astype(str).str.replace(",", ".", regex=False)
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Variable dependiente
y = df_modelo['NE']

# Eliminar filas con valores faltantes
df_modelo_limpio = pd.concat([X, y], axis=1).dropna()
X = df_modelo_limpio.drop(columns='NE')
y = df_modelo_limpio['NE']

In [None]:
# Entrenar modelo

# Dividir datos en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y ajustar el modelo
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Evaluar modelo

# Predicciones
y_pred = rf_model.predict(X_test)

# Métricas
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R² del modelo: {r2:.4f}")
print(f"RMSE del modelo: {rmse:.4f}")

In [None]:
# Relevancia de variables

# Mostrar importancia de variables
importancia = rf_model.feature_importances_

print("\nRelevancia de variables:")
for var, imp in sorted(zip(X.columns, importancia), key=lambda x: x[1], reverse=True):
    print(f"{var}: {imp:.4f}")

###4.3 Support Vector Regressor (SVR)

In [None]:
# Preparar datos

# Filtrar hogares con NE definido
df_modelo = df_hogar_interes[df_hogar_interes['NE'].notnull()].copy()

# Variables independientes (X)
X = df_modelo[[
    'hacinamiento',
    'ITF', 'IPCF',
    'proporcion_mujeres',
    'promedio_edad',
    'proporcion_analfabetismo',
    'proporcion_desocupados'
]].copy()

# Corregir comas y convertir a float
for col in X.columns:
    X[col] = X[col].astype(str).str.replace(",", ".", regex=False)
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Variable dependiente
y = df_modelo['NE']

# Eliminar filas con valores faltantes
df_modelo_limpio = pd.concat([X, y], axis=1).dropna()
X = df_modelo_limpio.drop(columns='NE')
y = df_modelo_limpio['NE']

In [None]:
# Entrenar modelo

# Dividir en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear pipeline: escalado + SVR
svr_model = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=1.0, epsilon=0.1)
)

# Entrenar modelo
svr_model.fit(X_train, y_train)

In [None]:
# Evaluar modelo

# Predicciones
y_pred = svr_model.predict(X_test)

# Métricas
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R² del modelo SVR: {r2:.4f}")
print(f"RMSE del modelo SVR: {rmse:.4f}")

##5. Clasificacion
$$PE_i = \mathbb{I}({NE}_i<\tau)$$

$\tau$: Mediana

###5.1 Bosques aleatorios

In [None]:
# Crear variable PE_i

# Usar hogares con NE definido
df_clasificacion = df_hogar_interes[df_hogar_interes['NE'].notnull()].copy()

# Calcular mediana de NE
tau = df_clasificacion['NE'].median()

# Crear variable binaria: 1 si NE < mediana (pobreza educativa)
df_clasificacion['PE_i'] = (df_clasificacion['NE'] < tau).astype(int)

In [None]:
# Preparar datos

# Variables explicativas
X = df_clasificacion[[
    'hacinamiento',
    'ITF', 'IPCF',
    'proporcion_mujeres',
    'promedio_edad',
    'proporcion_analfabetismo',
    'proporcion_desocupados'
]].copy()

# Corregir comas y convertir a float
for col in X.columns:
    X[col] = X[col].astype(str).str.replace(",", ".", regex=False)
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Variable dependiente
y = df_clasificacion['PE_i']

# Eliminar filas con valores faltantes
df_clasificacion_limpio = pd.concat([X, y], axis=1).dropna()
X = df_clasificacion_limpio.drop(columns='PE_i')
y = df_clasificacion_limpio['PE_i']

In [None]:
# Entrenar modelo

# Separar en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y entrenar modelo
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Evaluar modelo

# Predicciones
y_pred = clf.predict(X_test)

# Métricas de evaluación
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Relevancia de variables

print("\nRelevancia de variables:")
for var, imp in sorted(zip(X.columns, clf.feature_importances_), key=lambda x: x[1], reverse=True):
    print(f"{var}: {imp:.4f}")

###5.2 Support Vector Machine (SVM)

In [None]:
# Entrenar el modelo

# Dividir datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear pipeline con estandarización y SVM (con kernel RBF)
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
)

# Entrenar
svm_model.fit(X_train, y_train)

In [None]:
# Evaluar el modelo

# Predicciones
y_pred = svm_model.predict(X_test)

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred))