In [11]:
from pathlib import Path
import pandas as pd

# Ruta correcta: sube un nivel y entra a data/processed
DATA_PATH = Path("../data/processed/dataset_churn_T0_2024-12-31_L180_P90.parquet")

df = pd.read_parquet(DATA_PATH)
print(df.shape)
df.head()



(2357, 23)


Unnamed: 0,client_id,registration_date,freq_orders,monetary_total,monetary_avg,monetary_max,last_order_ts,first_order_ts,avg_shipping_cost,recency_days,...,uniq_brands,cadence_avg_days,orders_last30,amt_last30,pct_express,pct_standard,pct_card,pct_cash,pct_wallet,churn
0,21e5c13d-1c9a-4d00-9164-b72302d5edef,2025-05-02,11.0,686.69,62.426364,98.22,2024-12-16 21:01:32,2024-07-08 21:30:33,5.005455,14.0,...,8.0,15.5,1.0,97.97,0.272727,0.545455,0.545455,0.181818,0.0,1
1,36e48bdd-db11-4abe-9526-cfc90e68924d,2023-02-24,30.0,1592.91,53.097,94.07,2024-12-26 10:58:03,2024-07-05 09:26:46,6.133333,4.0,...,11.0,5.586207,7.0,402.74,0.233333,0.733333,0.766667,0.033333,0.0,1
2,145c22df-3579-412e-bc12-b4fce70abaf3,2024-10-03,8.0,392.31,49.03875,94.47,2024-12-29 22:46:06,2024-07-12 21:36:47,5.1475,1.0,...,8.0,24.0,2.0,66.49,0.125,0.875,0.75,0.125,0.0,1
3,90c4a925-e51f-4dac-9193-2d9aec97a472,2025-01-13,5.0,351.12,70.224,95.89,2024-11-19 02:28:56,2024-09-10 01:53:00,3.538,41.0,...,4.0,17.0,0.0,0.0,0.2,0.4,0.6,0.2,0.0,1
4,853f711a-4c36-40b4-b5d0-6207152cd793,2025-03-09,0.0,0.0,0.0,0.0,NaT,NaT,0.0,-80.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [12]:
df.columns

Index(['client_id', 'registration_date', 'freq_orders', 'monetary_total',
       'monetary_avg', 'monetary_max', 'last_order_ts', 'first_order_ts',
       'avg_shipping_cost', 'recency_days', 'tenure_days', 'uniq_products',
       'uniq_categories', 'uniq_brands', 'cadence_avg_days', 'orders_last30',
       'amt_last30', 'pct_express', 'pct_standard', 'pct_card', 'pct_cash',
       'pct_wallet', 'churn'],
      dtype='object')

In [13]:
from datetime import timedelta

# Verificamos las columnas de fechas reales
df[['client_id', 'last_order_ts', 'registration_date']].head()

# Definimos churn como clientes inactivos en los últimos 90 días antes de la fecha de corte
# (suponemos que la fecha de referencia es la más reciente del dataset)
reference_date = df['last_order_ts'].max()
df['churn'] = (reference_date - df['last_order_ts']).dt.days > 90
df['churn'] = df['churn'].astype(int)  # 1 = churn, 0 = activo

# Revisamos la distribución
df['churn'].value_counts(normalize=True)



churn
0    0.911752
1    0.088248
Name: proportion, dtype: float64

In [14]:
# Target
y = df["churn"]

# Features numéricas: tomamos todas las numéricas y sacamos id y churn
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
for col in ["client_id", "churn"]:
    if col in numeric_cols:
        numeric_cols.remove(col)

X = df[numeric_cols]

X.head(), y.value_counts(normalize=True)


(   freq_orders  monetary_total  monetary_avg  monetary_max  avg_shipping_cost  \
 0         11.0          686.69     62.426364         98.22           5.005455   
 1         30.0         1592.91     53.097000         94.07           6.133333   
 2          8.0          392.31     49.038750         94.47           5.147500   
 3          5.0          351.12     70.224000         95.89           3.538000   
 4          0.0            0.00      0.000000          0.00           0.000000   
 
    recency_days  tenure_days  uniq_products  uniq_categories  uniq_brands  \
 0          14.0       -122.0           11.0              4.0          8.0   
 1           4.0        676.0           30.0              5.0         11.0   
 2           1.0         89.0            8.0              4.0          8.0   
 3          41.0        -13.0            5.0              3.0          4.0   
 4         -80.0        -68.0            0.0              0.0          0.0   
 
    cadence_avg_days  orders_last30 

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y  # mantiene proporciones de churn
)
X_train.shape, X_test.shape


((1885, 18), (472, 18))

In [16]:
X.isna().sum().sort_values(ascending=False).head(15)


cadence_avg_days     591
freq_orders            0
monetary_total         0
pct_cash               0
pct_card               0
pct_standard           0
pct_express            0
amt_last30             0
orders_last30          0
uniq_brands            0
uniq_categories        0
uniq_products          0
tenure_days            0
recency_days           0
avg_shipping_cost      0
dtype: int64

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

log_reg = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),  # arregla cadence_avg_days y cualquier otro NaN futuro
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ))
])

log_reg.fit(X_train, y_train)


0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [18]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Precision:", round(precision_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("F1:", round(f1_score(y_test, y_pred), 3))
print("ROC-AUC:", round(roc_auc_score(y_test, y_proba), 3))

print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred))
print("\nReporte de clasificación:\n")
print(classification_report(y_test, y_pred, digits=3))


Accuracy: 0.894
Precision: 0.456
Recall: 0.976
F1: 0.621
ROC-AUC: 0.97

Matriz de confusión:
 [[381  49]
 [  1  41]]

Reporte de clasificación:

              precision    recall  f1-score   support

           0      0.997     0.886     0.938       430
           1      0.456     0.976     0.621        42

    accuracy                          0.894       472
   macro avg      0.726     0.931     0.780       472
weighted avg      0.949     0.894     0.910       472



In [19]:
import numpy as np
threshold = 0.3
y_pred_custom = (y_proba >= threshold).astype(int)
print(classification_report(y_test, y_pred_custom, digits=3))


              precision    recall  f1-score   support

           0      1.000     0.823     0.903       430
           1      0.356     1.000     0.525        42

    accuracy                          0.839       472
   macro avg      0.678     0.912     0.714       472
weighted avg      0.943     0.839     0.869       472



In [20]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [21]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    classification_report
)

# Probabilidades de churn (clase 1)
y_proba = log_reg.predict_proba(X_test)[:, 1]

# 1) Predicción con umbral clásico 0.5
y_pred_05 = (y_proba >= 0.5).astype(int)

print("=== Umbral 0.5 ===")
print("Accuracy:", accuracy_score(y_test, y_pred_05))
print("Precision:", precision_score(y_test, y_pred_05))
print("Recall:", recall_score(y_test, y_pred_05))
print("F1:", f1_score(y_test, y_pred_05))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_05))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred_05))


=== Umbral 0.5 ===
Accuracy: 0.8940677966101694
Precision: 0.45555555555555555
Recall: 0.9761904761904762
F1: 0.6212121212121212
ROC-AUC: 0.9699889258028793

Matriz de confusión:
 [[381  49]
 [  1  41]]

Reporte de clasificación:
               precision    recall  f1-score   support

           0       1.00      0.89      0.94       430
           1       0.46      0.98      0.62        42

    accuracy                           0.89       472
   macro avg       0.73      0.93      0.78       472
weighted avg       0.95      0.89      0.91       472



In [22]:
# 2) Predicción con umbral más agresivo (ej 0.3)
threshold = 0.3
y_pred_03 = (y_proba >= threshold).astype(int)

print("=== Umbral 0.3 ===")
print("Accuracy:", accuracy_score(y_test, y_pred_03))
print("Precision:", precision_score(y_test, y_pred_03))
print("Recall:", recall_score(y_test, y_pred_03))
print("F1:", f1_score(y_test, y_pred_03))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_03))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred_03))


=== Umbral 0.3 ===
Accuracy: 0.8389830508474576
Precision: 0.3559322033898305
Recall: 1.0
F1: 0.525

Matriz de confusión:
 [[354  76]
 [  0  42]]

Reporte de clasificación:
               precision    recall  f1-score   support

           0       1.00      0.82      0.90       430
           1       0.36      1.00      0.53        42

    accuracy                           0.84       472
   macro avg       0.68      0.91      0.71       472
weighted avg       0.94      0.84      0.87       472



In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

rf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ))
])

rf.fit(X_train, y_train)

y_proba_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = (y_proba_rf >= 0.5).astype(int)

print("=== Random Forest (umbral 0.5) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1:", f1_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred_rf))


=== Random Forest (umbral 0.5) ===
Accuracy: 0.9978813559322034
Precision: 1.0
Recall: 0.9761904761904762
F1: 0.9879518072289156
ROC-AUC: 1.0

Matriz de confusión:
 [[430   0]
 [  1  41]]
