In [16]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import numpy as np

In [17]:
# Read the modelling dataset and keep only complete rows; chaining the two
# steps means `data_lasso` is bound once, directly to the cleaned frame.
data_lasso = pd.read_csv(
    r"../data/final/1_panel/3_modelling/df_model_post_tratamiento_estadistico.csv"
).dropna()

# Summary of row count, columns and dtypes of the cleaned frame.
data_lasso.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24232 entries, 0 to 24231
Columns: 136 entries, status_inf to ratiodep
dtypes: bool(127), float64(4), int64(5)
memory usage: 4.6 MB


In [21]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, recall_score, f1_score, accuracy_score, precision_score
import numpy as np

# --- Lasso-based feature selection -----------------------------------------
# Target column and predictor matrix (every column except the target).
y = data_lasso.loc[:, 'status_inf']
X = data_lasso.drop(columns='status_inf')

# Standardise the predictors so the L1 penalty treats them on a common scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit a Lasso with a fixed, hand-picked penalty strength.
# NOTE(review): the fit uses every row of `data_lasso`; if the selected columns
# are later evaluated by cross-validation on the same rows, the selection step
# has already seen the validation folds.
alpha_value = 0.01  # tune as needed
lasso = Lasso(alpha=alpha_value, random_state=42).fit(X_scaled, y)

# Split the coefficients into kept (non-zero) and discarded (exactly zero).
coef = pd.Series(lasso.coef_, index=X.columns)
coef_no_cero = coef[coef.ne(0)]
coef_cero = coef[coef.eq(0)]

print("Número de variables seleccionadas:", len(coef_no_cero))

# Show the surviving coefficients ordered from most negative to most positive.
print(coef_no_cero.sort_values())

# Names of the retained predictors and the matching slice of the data.
variables_utiles = list(coef_no_cero.index)
X_lasso_selected = data_lasso.loc[:, variables_utiles]

# Re-fit the same scaler object on the reduced predictor set.
X_lasso_scaled = scaler.fit_transform(X_lasso_selected)

# --- Cross-validated evaluation of the class-balanced logit -----------------
# Logistic regression with class weights inversely proportional to frequency.
logit_bal = LogisticRegression(class_weight="balanced", max_iter=5000)

# Stratified 4-fold split with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Probability cut-off used to turn predicted probabilities into class labels.
# It is defined here so the cell runs on a fresh kernel instead of relying on
# a value left over from another cell.
# NOTE(review): 0.5 is the conventional default; replace it with the tuned
# threshold if one was chosen elsewhere in the analysis.
umbral = 0.5

# Per-fold metric accumulators (re-created on every run of the cell).
auc_cv_scores = []
accuracy_cv_scores = []
precision_class_1_cv_scores = []  # precision for class 1
precision_macro_cv_scores = []  # macro-averaged precision
recall_class_1_cv_scores = []  # recall for class 1
recall_macro_cv_scores = []  # macro-averaged recall
f1_class_1_cv_scores = []  # F1 for class 1
f1_macro_cv_scores = []  # macro-averaged F1
f1_cv_scores = []
recall_cv_scores = []

# NOTE(review): `variables_utiles` was chosen by a Lasso fitted on all rows, so
# the feature-selection step has still seen the validation folds; only the
# scaling leakage is removed below.
for train_idx, test_idx in cv.split(X_lasso_selected, y):
    # `cv.split` yields positional indices, so select rows with `.iloc`;
    # plain `y[train_idx]` is label-based and is only correct while the
    # index happens to be exactly 0..n-1.
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fit the scaler on the training fold only and apply it to the validation
    # fold, so no validation-fold statistics reach the model.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X_lasso_selected.iloc[train_idx])
    X_test = fold_scaler.transform(X_lasso_selected.iloc[test_idx])

    # Train on this fold's training partition.
    logit_bal.fit(X_train, y_train)

    # Probability of the positive class, used for AUC and for thresholding.
    y_proba_test = logit_bal.predict_proba(X_test)[:, 1]
    auc_cv_scores.append(roc_auc_score(y_test, y_proba_test))

    # Hard predictions at the chosen cut-off.
    y_pred_test = (y_proba_test >= umbral).astype(int)

    accuracy_cv_scores.append(accuracy_score(y_test, y_pred_test))

    # Precision: class 1 and macro average.
    precision_class_1 = precision_score(y_test, y_pred_test, pos_label=1)
    precision_class_1_cv_scores.append(precision_class_1)
    precision_macro = precision_score(y_test, y_pred_test, average='macro')
    precision_macro_cv_scores.append(precision_macro)

    # Recall: class 1 and macro average.
    recall_class_1 = recall_score(y_test, y_pred_test, pos_label=1)
    recall_class_1_cv_scores.append(recall_class_1)
    recall_macro = recall_score(y_test, y_pred_test, average='macro')
    recall_macro_cv_scores.append(recall_macro)

    # F1: class 1 and macro average.
    f1_class_1 = f1_score(y_test, y_pred_test, pos_label=1)
    f1_class_1_cv_scores.append(f1_class_1)
    f1_macro = f1_score(y_test, y_pred_test, average='macro')
    f1_macro_cv_scores.append(f1_macro)

    # The "general" F1 and recall use scikit-learn's defaults
    # (average='binary', pos_label=1), i.e. the same values as the class-1
    # scores above, so reuse them instead of recomputing.
    f1_cv_scores.append(f1_class_1)
    recall_cv_scores.append(recall_class_1)

    # Full per-class report for this fold.
    print("\nClassification Report para este pliegue:\n", classification_report(y_test, y_pred_test))

# --- Aggregate the per-fold scores ------------------------------------------
def _media_y_desv(puntajes):
    """Return the mean and the (population) standard deviation of the scores."""
    return np.mean(puntajes), np.std(puntajes)


mean_auc, std_auc = _media_y_desv(auc_cv_scores)
mean_accuracy, std_accuracy = _media_y_desv(accuracy_cv_scores)
mean_precision_class_1, std_precision_class_1 = _media_y_desv(precision_class_1_cv_scores)
mean_precision_macro, std_precision_macro = _media_y_desv(precision_macro_cv_scores)
mean_recall_class_1, std_recall_class_1 = _media_y_desv(recall_class_1_cv_scores)
mean_recall_macro, std_recall_macro = _media_y_desv(recall_macro_cv_scores)
mean_f1_class_1, std_f1_class_1 = _media_y_desv(f1_class_1_cv_scores)
mean_f1_macro, std_f1_macro = _media_y_desv(f1_macro_cv_scores)
mean_f1, std_f1 = _media_y_desv(f1_cv_scores)

# Final report: one line per metric, formatted as "mean (std)".
lineas_reporte = [
    f"\nAUC promedio en validación cruzada: {mean_auc:.3f} ({std_auc:.3f})",
    f"Accuracy promedio en validación cruzada: {mean_accuracy:.3f} ({std_accuracy:.3f})",
    f"Precision promedio para la clase 1 en validación cruzada: {mean_precision_class_1:.3f} ({std_precision_class_1:.3f})",
    f"Precision promedio macro en validación cruzada: {mean_precision_macro:.3f} ({std_precision_macro:.3f})",
    f"Recall promedio para la clase 1 en validación cruzada: {mean_recall_class_1:.3f} ({std_recall_class_1:.3f})",
    f"Recall promedio macro en validación cruzada: {mean_recall_macro:.3f} ({std_recall_macro:.3f})",
    f"F1-score promedio para la clase 1 en validación cruzada: {mean_f1_class_1:.3f} ({std_f1_class_1:.3f})",
    f"F1-score promedio macro en validación cruzada: {mean_f1_macro:.3f} ({std_f1_macro:.3f})",
    f"F1-score promedio en validación cruzada: {mean_f1:.3f} ({std_f1:.3f})",
]
print("\n".join(lineas_reporte))


Número de variables seleccionadas: 27
categoria_trabajador_3   -0.027241
ingtrabw                 -0.023981
edad                     -0.017875
trabajopara_2            -0.016890
estadocivil_2            -0.007356
sector_trabajador_3      -0.006025
tuvootrotrabajo_2        -0.002856
area_1                   -0.001698
internet_1               -0.001149
onp_1                    -0.000678
niveleduc_8              -0.000529
hospital_seguro_1        -0.000376
niveleduc_4               0.000014
niveleduc_5               0.001387
pobreza_2                 0.002077
puestosalud_1             0.002327
materialtechos_4          0.003159
sector_trabajador_8       0.005830
personas_ingresos         0.006604
materialpisos_6           0.007035
usointernet_2             0.007140
tipocontrato_2            0.009686
tipocontrato_7            0.010356
sis_1                     0.014623
combustible               0.015280
ciiu_6c_3                 0.019075
categoria_trabajador_2    0.028170
dtype: float64

C