In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Cargar el dataset
df = pd.read_csv('../data/Telco-Customer-Churn.csv')

# Eliminar la columna 'Churn' para no saber quién desertó
df_without_churn = df.drop(columns="Churn")

# Limpiar valores nulos
df_without_churn['TotalCharges'] = pd.to_numeric(df_without_churn['TotalCharges'], errors='coerce')
df_without_churn['TotalCharges'].fillna(df_without_churn['TotalCharges'].mean(), inplace=True)

# Codificación de variables categóricas
label_encoder = LabelEncoder()
binary_cols = [col for col in df_without_churn.select_dtypes(include='object').columns.tolist() if df_without_churn[col].nunique() == 2]

for col in binary_cols:
    df_without_churn[col] = label_encoder.fit_transform(df_without_churn[col])

df_without_churn = pd.get_dummies(df_without_churn, drop_first=True)

# Escalado de variables numéricas
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
df_without_churn[numeric_cols] = scaler.fit_transform(df_without_churn[numeric_cols])

# Dividir en conjunto de entrenamiento y prueba
X = df_without_churn
y = df['Churn']  # La columna 'Churn' original será nuestra variable objetivo

# Convertir 'Yes'/'No' a 1/0 en la columna 'Churn'
y = y.map({'Yes': 1, 'No': 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

# ==========================================
# Paso 3: Entrenar los modelos (XGBoost y Random Forest)
# ==========================================
# XGBoost
xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
xgb_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# ==========================================
# Paso 4: Realizar predicciones
# ==========================================
# Predicciones de probabilidad (predict_proba) para XGBoost
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Predicciones de probabilidad (predict_proba) para Random Forest
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Predicciones de clase
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# ==========================================
# Paso 5: Guardar los modelos entrenados
# ==========================================
joblib.dump(xgb_model, '../modelos/xgboost_model.pkl')
joblib.dump(rf_model, '../modelos/random_forest_model.pkl')

# ==========================================
# Paso 6: Guardar los resultados de las predicciones en un archivo CSV
# ==========================================
df_predicciones = pd.DataFrame({
    'customerID': df.loc[X_test.index, 'customerID'],
    'Probabilidad_Churn_XGBoost': y_prob_xgb,
    'Probabilidad_Churn_RF': y_prob_rf,
    'Riesgo_Churn_XGBoost': y_pred_xgb,
    'Riesgo_Churn_RF': y_pred_rf
})

# Exportamos los resultados a un archivo CSV
df_predicciones.to_csv('../report/predicciones_comparativas.csv', index=False)

# ==========================================
# Paso 7: Evaluación de los modelos
# ==========================================
# Evaluación de XGBoost
print("Reporte de clasificación para XGBoost:")
print(classification_report(y_test, y_pred_xgb))
print(f"AUC-ROC para XGBoost: {roc_auc_score(y_test, y_prob_xgb):.4f}")
print("Matriz de confusión para XGBoost:")
print(confusion_matrix(y_test, y_pred_xgb))

# Evaluación de Random Forest
print("\nReporte de clasificación para Random Forest:")
print(classification_report(y_test, y_pred_rf))
print(f"AUC-ROC para Random Forest: {roc_auc_score(y_test, y_prob_rf):.4f}")
print("Matriz de confusión para Random Forest:")
print(confusion_matrix(y_test, y_pred_rf))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_without_churn['TotalCharges'].fillna(df_without_churn['TotalCharges'].mean(), inplace=True)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Reporte de clasificación para XGBoost:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1552
           1       0.59      0.51      0.55       561

    accuracy                           0.78      2113
   macro avg       0.71      0.69      0.70      2113
weighted avg       0.77      0.78      0.77      2113

AUC-ROC para XGBoost: 0.8185
Matriz de confusión para XGBoost:
[[1355  197]
 [ 274  287]]

Reporte de clasificación para Random Forest:
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1552
           1       0.67      0.46      0.55       561

    accuracy                           0.80      2113
   macro avg       0.75      0.69      0.71      2113
weighted avg       0.78      0.80      0.78      2113

AUC-ROC para Random Forest: 0.8346
Matriz de confusión para Random Forest:
[[1426  126]
 [ 302  259]]
