In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score,
    average_precision_score, confusion_matrix, classification_report
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Location of the customer snapshot CSV, relative to the notebook's working directory.
SNAP_PATH = "../data/customer_snapshot_ml.csv"

snap = pd.read_csv(SNAP_PATH)

# The target column is mandatory; stop immediately if the file lacks it.
assert "churn" in snap, "Snapshot must contain 'churn'."

# Target vector as an integer NumPy array (row order matches `snap`).
y = snap["churn"].astype(int).to_numpy()

In [None]:
# Columns that must never reach the model: the target itself plus any column
# whose name contains "future" (case-insensitive).
LEAKY_COLS = ["churn", "has_future_purchase"] + [
    col for col in snap.columns if "future" in col.lower()
]
# Identifier columns are dropped as well.
ID_COLS = ["customer_id"]

# Keep every numeric column that is neither leaky nor an identifier.
X_base = (
    snap.loc[:, ~snap.columns.isin(LEAKY_COLS + ID_COLS)]
        .select_dtypes(include=[np.number])
        .copy()
)

# Treat +/-inf as missing, then impute any missing values with the column median.
# NOTE(review): the medians are computed on the full snapshot, i.e. before the
# train/test split performed later in the notebook.
X_base = X_base.replace([np.inf, -np.inf], np.nan)
if X_base.isna().to_numpy().any():
    X_base = X_base.fillna(X_base.median(numeric_only=True))

print(f"Loaded {SNAP_PATH} | rows={len(snap)} | churn_rate={y.mean():.3f}")
print(f"Feature shape (numeric only, leak-free): {X_base.shape}")

In [None]:
# Split positional row indices rather than the frame itself, so the same
# index arrays can be reused to slice any object aligned with `y`.
idx = np.arange(y.shape[0])
train_idx, test_idx, y_train, y_test = train_test_split(
    idx,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,  # keep the churn rate the same in both partitions
)

# Materialise independent copies of the feature rows for each partition.
X_train, X_test = (X_base.iloc[rows].copy() for rows in (train_idx, test_idx))

In [None]:
# Features selected for quantile capping (winsorisation) before modelling.
HEAVY_TAIL_COLS = [
    # spend
    "monetary_sum",
    "monetary_median",
    "monetary_max",
    "aov",
    # rates per 30 days
    "orders_per_30d",
    "monetary_per_30d",
    # basket averages
    "avg_price",
    "avg_quantity",
    # time-based
    "tenure_days",
    "recency_days",
    # breadth
    "unique_products",
    "unique_categories",
]

def fit_caps(X, cols, lower=0.01, upper=0.99):
    """Learn per-column clipping bounds from the quantiles of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame the quantiles are computed on.
    cols : iterable of str
        Candidate column names; names absent from ``X`` are skipped.
    lower, upper : float
        Quantile levels used for the lower and upper bound.

    Returns
    -------
    dict
        ``{column: (low, high)}`` with plain Python floats. A column is left
        out when either quantile is non-finite or when ``low >= high``.
    """
    bounds_by_col = {}
    for col in cols:
        if col not in X.columns:
            continue
        low, high = X[col].quantile([lower, upper]).tolist()
        if not (np.isfinite(low) and np.isfinite(high)):
            continue
        if low >= high:
            # Degenerate range (e.g. constant column): nothing to clip.
            continue
        bounds_by_col[col] = (float(low), float(high))
    return bounds_by_col

def apply_caps(X, caps):
    """Return a copy of ``X`` with each capped column clipped to its bounds.

    ``caps`` maps column name to a ``(low, high)`` pair. Entries naming a
    column that is not in ``X`` are ignored. The input frame is not modified.
    """
    clipped = X.copy()
    for name, bounds in caps.items():
        floor, ceiling = bounds
        if name not in clipped.columns:
            continue
        clipped[name] = clipped[name].clip(lower=floor, upper=ceiling)
    return clipped

# Learn the 1st/99th-percentile bounds on the training rows only, then clip
# both partitions with those same bounds so the test set never influences them.
# (fit_caps itself skips any listed column that X_train does not have.)
caps = fit_caps(X_train, HEAVY_TAIL_COLS, lower=0.01, upper=0.99)
X_train, X_test = apply_caps(X_train, caps), apply_caps(X_test, caps)

In [None]:
# Scale-sensitive models are wrapped in a pipeline so the RobustScaler is
# fitted inside `fit` on whatever training data the pipeline receives.
lr_pipe = Pipeline(steps=[
    ("scaler", RobustScaler(with_centering=True, with_scaling=True)),
    ("clf", LogisticRegression(
        solver="lbfgs",
        max_iter=2000,
        class_weight="balanced",
    )),
])

svm_pipe = Pipeline(steps=[
    ("scaler", RobustScaler(with_centering=True, with_scaling=True)),
    ("clf", SVC(
        kernel="rbf",
        probability=True,  # enables predict_proba for the ranking metrics
        class_weight="balanced",
        random_state=42,
    )),
])

# Tree-based models are used on the unscaled features.
dt = DecisionTreeClassifier(random_state=42, class_weight="balanced")
rf = RandomForestClassifier(
    n_estimators=400,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)

In [None]:
def evaluate_model(name, model, Xtr, ytr, Xte, yte, keep_proba=True):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    name : str
        Label stored under ``"model"`` in the returned dict.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
        ``predict_proba`` or ``decision_function`` is used for ranking
        scores when available.
    Xtr, ytr : training features and labels.
    Xte, yte : test features and labels.
    keep_proba : bool, default True
        When true, the per-row ranking scores are included under ``"scores"``.

    Returns
    -------
    dict
        Keys: ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc``, ``pr_auc``, ``confusion_matrix``, ``report`` and,
        optionally, ``scores``. ``roc_auc`` and ``pr_auc`` are NaN when the
        ranking metrics cannot be computed (a warning is emitted).
    """
    import warnings  # local import keeps this cell runnable on its own

    model.fit(Xtr, ytr)
    y_pred = model.predict(Xte)

    # Pick the best available continuous score for the ranking metrics:
    # column index 1 of predict_proba, else the decision function, else the
    # hard predictions cast to float.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(Xte)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(Xte)
    else:
        scores = y_pred.astype(float)

    acc = accuracy_score(yte, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        yte, y_pred, average="binary", zero_division=0
    )

    try:
        roc = roc_auc_score(yte, scores)
        pr_auc = average_precision_score(yte, scores)
    except ValueError as exc:
        # Input problems such as a single class in yte raise ValueError.
        # Report NaN as before, but say so instead of hiding the failure;
        # any other exception type is a real bug and is allowed to propagate.
        warnings.warn(
            f"{name}: ROC-AUC / PR-AUC could not be computed ({exc}); reporting NaN.",
            RuntimeWarning,
            stacklevel=2,
        )
        roc, pr_auc = np.nan, np.nan

    cm = confusion_matrix(yte, y_pred)
    report = classification_report(yte, y_pred, digits=3, zero_division=0)
    out = {
        "model": name, "accuracy": acc, "precision": prec, "recall": rec,
        "f1": f1, "roc_auc": roc, "pr_auc": pr_auc,
        "confusion_matrix": cm, "report": report
    }
    if keep_proba:
        out["scores"] = scores
    return out

In [None]:
# (display name, estimator) pairs, evaluated in this order.
models = [
    ("LogisticRegression", lr_pipe),
    ("SVM_RBF", svm_pipe),
    ("DecisionTree", dt),
    ("RandomForest", rf),
]

# Fit and score each model on the shared split, printing a per-model report.
results = []
for name, mdl in models:
    res = evaluate_model(name, mdl, X_train, y_train, X_test, y_test, keep_proba=True)
    results.append(res)

    print("\n" + "=" * 70)
    print(f"Model: {res['model']}")
    print(f"Accuracy: {res['accuracy']:.4f} | Precision: {res['precision']:.4f} | Recall: {res['recall']:.4f} | F1: {res['f1']:.4f}")
    print(f"ROC-AUC: {res['roc_auc']:.4f} | PR-AUC: {res['pr_auc']:.4f}")
    print("Confusion Matrix [[TN, FP], [FN, TP]]:\n", res["confusion_matrix"])
    print("\nClassification Report:\n", res["report"])

# Scalar metrics only (arrays and text reports are left out), best accuracy first.
summary_cols = ("model", "accuracy", "precision", "recall", "f1", "roc_auc", "pr_auc")
summary_rows = [{col: r[col] for col in summary_cols} for r in results]
summary_df = pd.DataFrame(summary_rows).sort_values("accuracy", ascending=False)

print("\n" + "#" * 70)
print("MODEL COMPARISON (sorted by Accuracy)")
print(summary_df.to_string(index=False))

In [None]:
import pickle

# Persist every fitted estimator so it can be reloaded without retraining.
# Only unpickle these files from a trusted location: loading a pickle can
# execute arbitrary code.
MODELS_DIR = Path('../models')
# open() does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

fitted_models = {
    'random_forest.pkl': rf,
    'decision_tree.pkl': dt,
    'logistic_regression.pkl': lr_pipe,
    'svm_rbf.pkl': svm_pipe,
}
for filename, estimator in fitted_models.items():
    with open(MODELS_DIR / filename, 'wb') as f:
        pickle.dump(estimator, f)

print('Saved models: ' + ', '.join(fitted_models))

In [None]:
# Predict churn for all customers using the trained random forest model and
# export the result for the dashboard.
# NOTE(review): the snapshot includes the rows the forest was trained on, so
# predictions for those customers are in-sample — confirm this is intended.

# Prepare features for all customers (same preprocessing steps as X_base)
X_all = (
    snap.drop(columns=[c for c in (LEAKY_COLS + ID_COLS) if c in snap.columns], errors="ignore")
        .select_dtypes(include=[np.number])
        .copy()
)
X_all = X_all.replace([np.inf, -np.inf], np.nan)
if X_all.isna().any().any():
    X_all = X_all.fillna(X_all.median(numeric_only=True))

# Apply caps (learned on the training split) to all data
X_all = apply_caps(X_all, caps)

# Predict churn using trained random forest
pred_churn = rf.predict(X_all)

# Add predicted churn to a copy of the snapshot (the original frame is untouched)
snap_pred = snap.copy()
snap_pred['churn_pred'] = pred_churn

# Build the dashboard table: one row per customer with a Yes/No predicted label
churn_map = {0: 'No', 1: 'Yes'}
churn_dashboard_pred = pd.DataFrame({
    'customer_id': snap_pred['customer_id'],
    'churn': snap_pred['churn_pred'].map(churn_map)
})

# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
DASHBOARD_CSV = Path('../data/dashboards/churn_dashboard.csv')
DASHBOARD_CSV.parent.mkdir(parents=True, exist_ok=True)
churn_dashboard_pred.to_csv(DASHBOARD_CSV, index=False)
print(f'Exported predicted churn to {DASHBOARD_CSV.as_posix()}')