In [1]:
# Notebook 3 — Churn Classification

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             classification_report, confusion_matrix)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
import pickle

RSEED = 42
np.random.seed(RSEED)
pd.set_option("display.max_columns", 200)

In [2]:
def safe_read(path):
    try:
        return pd.read_csv(path)
    except Exception:
        return None

# Notebook 2 outputs
rfm        = safe_read("../data/interim/rfm_features.csv")            # customer_id, recency, frequency, monetary
segments   = safe_read("../data/interim/customer_segments.csv")       # customer_id, cluster

# Notebook 5 outputs
text_feats = safe_read("../data/processed/text_features_customer.csv")# customer_id, sent shares, VADER stats, last sentiment, churn

# Base customer snapshot (from EDA or fallback)
base = safe_read("../data/processed/customer_features.csv")
if base is None:
    raw = safe_read("../data/raw/customer_intelligence_dataset.csv")
    if raw is not None:
        base = (raw.sort_values(["customer_id","sale_date"])
                    .groupby("customer_id")
                    .agg(churn=("churn","max"),
                         age=("age","last"),
                         gender=("gender","last"),
                         region=("region","last"))
                    .reset_index())

frames = [df for df in [base, rfm, segments, text_feats] if df is not None]
if not frames:
    raise RuntimeError("No inputs found. Please export from notebooks 1/2/5 first.")

from functools import reduce
Xy = reduce(lambda l, r: pd.merge(l, r, on="customer_id", how="left"), frames)

# Fix churn in case of merges (churn / churn_x / churn_y)
for c in ["churn", "churn_x", "churn_y"]:
    if c not in Xy.columns:
        Xy[c] = np.nan
Xy["churn"] = (Xy["churn"].fillna(Xy["churn_x"])
                        .fillna(Xy["churn_y"])
                        .astype(float).round().astype(int))
Xy = Xy.drop(columns=[c for c in ["churn_x","churn_y"] if c in Xy.columns])

print("Merged shape:", Xy.shape)
print("Churn balance:", Xy["churn"].value_counts(normalize=True).round(3).to_dict())
Xy.head(10)

Merged shape: (5901, 46)
Churn balance: {0: 0.75, 1: 0.25}


Unnamed: 0,customer_id,first_purchase,last_purchase,total_orders,total_quantity,total_revenue,age,gender,region,segment,tenure_months,unique_products,unique_categories,sentiment_mean,sent_share_negative_x,sent_share_neutral_x,sent_share_positive_x,recency_days,aov,orders_per_month,rev_90d,rev_90d_share,level_1,0,recency,frequency,monetary,recency_scaled,frequency_scaled,monetary_scaled,cluster,segment_label,compound_mean,compound_median,compound_min,compound_max,neg_mean,neu_mean,pos_mean,sentiment_last,sentiment_vader_last,compound_last,sent_share_negative_y,sent_share_neutral_y,sent_share_positive_y,churn
0,CUST00001,2023-05-19,2023-07-18,2,7,249000,43,Female,South,Corporate,50,2,2,0.0,0.5,0.0,0.5,166,124500.0,0.04,0.0,0.0,ip_gap_mean,41.0,166,2,187000.0,-0.178873,-1.08564,-0.077402,0,Segment 2,0.28595,0.28595,0.0,0.5719,0.0,0.6755,0.3245,Positive,Positive,0.5719,0.5,0.0,0.5,0
1,CUST00001,2023-05-19,2023-07-18,2,7,249000,43,Female,South,Corporate,50,2,2,0.0,0.5,0.0,0.5,166,124500.0,0.04,0.0,0.0,ip_gap_median,41.0,166,2,187000.0,-0.178873,-1.08564,-0.077402,0,Segment 2,0.28595,0.28595,0.0,0.5719,0.0,0.6755,0.3245,Positive,Positive,0.5719,0.5,0.0,0.5,0
2,CUST00001,2023-05-19,2023-07-18,2,7,249000,43,Female,South,Corporate,50,2,2,0.0,0.5,0.0,0.5,166,124500.0,0.04,0.0,0.0,ip_gap_std,0.0,166,2,187000.0,-0.178873,-1.08564,-0.077402,0,Segment 2,0.28595,0.28595,0.0,0.5719,0.0,0.6755,0.3245,Positive,Positive,0.5719,0.5,0.0,0.5,0
3,CUST00002,2022-01-19,2023-06-27,4,11,84300,22,Male,East,Corporate,37,4,3,0.75,0.0,0.25,0.75,187,21075.0,0.108108,0.0,0.0,ip_gap_mean,146.333333,187,4,84300.0,0.023153,-0.035245,-0.807727,0,Segment 2,0.05105,-0.00905,-0.4585,0.6808,0.214,0.549,0.237,Positive,Negative,-0.4585,0.0,0.25,0.75,1
4,CUST00002,2022-01-19,2023-06-27,4,11,84300,22,Male,East,Corporate,37,4,3,0.75,0.0,0.25,0.75,187,21075.0,0.108108,0.0,0.0,ip_gap_median,138.0,187,4,84300.0,0.023153,-0.035245,-0.807727,0,Segment 2,0.05105,-0.00905,-0.4585,0.6808,0.214,0.549,0.237,Positive,Negative,-0.4585,0.0,0.25,0.75,1
5,CUST00002,2022-01-19,2023-06-27,4,11,84300,22,Male,East,Corporate,37,4,3,0.75,0.0,0.25,0.75,187,21075.0,0.108108,0.0,0.0,ip_gap_std,110.73542,187,4,84300.0,0.023153,-0.035245,-0.807727,0,Segment 2,0.05105,-0.00905,-0.4585,0.6808,0.214,0.549,0.237,Positive,Negative,-0.4585,0.0,0.25,0.75,1
6,CUST00003,2020-07-18,2023-02-06,4,14,130150,59,Female,North,Corporate,18,3,3,0.25,0.25,0.25,0.5,328,32537.5,0.222222,0.0,0.0,ip_gap_mean,339.666667,328,4,130150.0,1.379618,-0.035245,-0.481676,0,Segment 2,0.276875,0.50615,-0.4767,0.5719,0.09575,0.47475,0.4295,Positive,Positive,0.5719,0.25,0.25,0.5,0
7,CUST00003,2020-07-18,2023-02-06,4,14,130150,59,Female,North,Corporate,18,3,3,0.25,0.25,0.25,0.5,328,32537.5,0.222222,0.0,0.0,ip_gap_median,321.0,328,4,130150.0,1.379618,-0.035245,-0.481676,0,Segment 2,0.276875,0.50615,-0.4767,0.5719,0.09575,0.47475,0.4295,Positive,Positive,0.5719,0.25,0.25,0.5,0
8,CUST00003,2020-07-18,2023-02-06,4,14,130150,59,Female,North,Corporate,18,3,3,0.25,0.25,0.25,0.5,328,32537.5,0.222222,0.0,0.0,ip_gap_std,126.040999,328,4,130150.0,1.379618,-0.035245,-0.481676,0,Segment 2,0.276875,0.50615,-0.4767,0.5719,0.09575,0.47475,0.4295,Positive,Positive,0.5719,0.25,0.25,0.5,0
9,CUST00004,2020-03-26,2023-11-28,5,12,349000,29,Male,East,Consumer,3,4,2,0.6,0.0,0.4,0.6,33,69800.0,1.666667,0.0,0.0,ip_gap_mean,230.0,33,5,302000.0,-1.458376,0.489953,0.740391,1,Segment 1,0.30872,0.4201,-0.3089,0.5719,0.0858,0.5076,0.4066,Positive,Positive,0.4201,0.0,0.4,0.6,0


In [3]:
# Numeric features from Notebook 2 + Notebook 5
rfm_cols   = ["recency","frequency","monetary"]
text_stats = ["compound_mean","compound_median","compound_min","compound_max",
              "neg_mean","neu_mean","pos_mean","compound_last"]
text_shares = [c for c in Xy.columns if c.startswith("sent_share_")]  # e.g., sent_share_negative/neutral/positive

num_cols = [c for c in (rfm_cols + text_stats + text_shares) if c in Xy.columns]

# Categorical: cluster from Notebook 2
cat_cols = []
if "cluster" in Xy.columns:
    cat_cols.append("cluster")

# Building X / y
X_num = Xy[num_cols].copy().fillna(0.0)

if cat_cols:
    X_cat = pd.get_dummies(Xy[cat_cols].astype("category"), drop_first=False, prefix=cat_cols)
    X = pd.concat([X_num, X_cat], axis=1)
else:
    X = X_num

y = Xy["churn"].astype(int)

print(f"Numeric features ({len(num_cols)}):", num_cols)
if cat_cols:
    print("One-hot added:", list(X.columns[len(num_cols):]))
print("X shape:", X.shape, "| y shape:", y.shape)

Numeric features (17): ['recency', 'frequency', 'monetary', 'compound_mean', 'compound_median', 'compound_min', 'compound_max', 'neg_mean', 'neu_mean', 'pos_mean', 'compound_last', 'sent_share_negative_x', 'sent_share_neutral_x', 'sent_share_positive_x', 'sent_share_negative_y', 'sent_share_neutral_y', 'sent_share_positive_y']
One-hot added: ['cluster_0', 'cluster_1']
X shape: (5901, 19) | y shape: (5901,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RSEED
)

print("Train:", X_train.shape, "| Test:", X_test.shape)
print("y_train pos ratio:", y_train.mean().round(3), "| y_test pos ratio:", y_test.mean().round(3))

# Scaling numeric columns for LR/SVM
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled  = X_test.copy()
X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test_scaled[num_cols]  = scaler.transform(X_test[num_cols])

Train: (4720, 19) | Test: (1181, 19)
y_train pos ratio: 0.25 | y_test pos ratio: 0.25


In [5]:
sm = SMOTE(random_state=RSEED, k_neighbors=5)

# Resampling for LR/SVM (using scaled features)
Xtr_lr_sm,  ytr_lr_sm  = sm.fit_resample(X_train_scaled, y_train)

# Resampling for tree models (using unscaled features)
Xtr_tree_sm, ytr_tree_sm = sm.fit_resample(X_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After  SMOTE (LR/SVM):", np.bincount(ytr_lr_sm))
print("After  SMOTE (Trees): ", np.bincount(ytr_tree_sm))

Before SMOTE: [3539 1181]
After  SMOTE (LR/SVM): [3539 3539]
After  SMOTE (Trees):  [3539 3539]


In [6]:
models = {
    "Logistic": LogisticRegression(max_iter=2000, class_weight=None, random_state=RSEED),
    "SVM":      SVC(probability=True, class_weight=None, random_state=RSEED),
    "DecisionTree": DecisionTreeClassifier(max_depth=None, min_samples_leaf=5,
                                           random_state=RSEED),
    "RandomForest": RandomForestClassifier(n_estimators=400, min_samples_leaf=3,
                                           max_depth=None, n_jobs=-1, random_state=RSEED),
}
list(models.keys())

['Logistic', 'SVM', 'DecisionTree', 'RandomForest']

In [7]:
from imblearn.over_sampling import SMOTE

# SMOTE on the training data only
sm = SMOTE(random_state=RSEED, k_neighbors=5)

# For LR/SVM, using the *scaled* training features
Xtr_lr_sm, ytr_lr_sm = sm.fit_resample(X_train_scaled, y_train)

# For tree models using the *unscaled* training features
Xtr_tree_sm, ytr_tree_sm = sm.fit_resample(X_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After  SMOTE (LR/SVM):", np.bincount(ytr_lr_sm))
print("After  SMOTE (Trees): ", np.bincount(ytr_tree_sm))

Before SMOTE: [3539 1181]
After  SMOTE (LR/SVM): [3539 3539]
After  SMOTE (Trees):  [3539 3539]


In [8]:
def eval_model(name, model, Xtr, ytr, Xte, yte):
    model.fit(Xtr, ytr)
    if hasattr(model, "predict_proba"):
        p = model.predict_proba(Xte)[:,1]
    elif hasattr(model, "decision_function"):
        s = model.decision_function(Xte)
        p = (s - s.min()) / (s.max() - s.min() + 1e-9)
    else:
        p = model.predict(Xte)

    y_pred = (p >= 0.5).astype(int)

    roc = roc_auc_score(yte, p)
    pr  = average_precision_score(yte, p)

    print(f"\n{name} — ROC-AUC: {roc:.4f} | PR-AUC: {pr:.4f}")
    print(classification_report(yte, y_pred, digits=3))
    print("Confusion matrix:\n", confusion_matrix(yte, y_pred))
    return roc

In [9]:
aucs_sm = {}
for name, clf in models.items():
    if name in ["Logistic", "SVM"]:
        aucs_sm[name] = eval_model(name + " + SMOTE", clf, Xtr_lr_sm, ytr_lr_sm, X_test_scaled, y_test)
    else:
        aucs_sm[name] = eval_model(name + " + SMOTE", clf, Xtr_tree_sm, ytr_tree_sm, X_test, y_test)

print("\nSMOTE AUCs:", {k: round(v,4) for k,v in aucs_sm.items()})
best_sm = max(aucs_sm, key=aucs_sm.get)
print(f"Best (SMOTE): {best_sm} | AUC={aucs_sm[best_sm]:.4f}")


Logistic + SMOTE — ROC-AUC: 0.5237 | PR-AUC: 0.2724
              precision    recall  f1-score   support

           0      0.769     0.514     0.616       886
           1      0.268     0.536     0.357       295

    accuracy                          0.519      1181
   macro avg      0.518     0.525     0.487      1181
weighted avg      0.644     0.519     0.551      1181

Confusion matrix:
 [[455 431]
 [137 158]]

SVM + SMOTE — ROC-AUC: 0.6738 | PR-AUC: 0.3779
              precision    recall  f1-score   support

           0      0.840     0.640     0.726       886
           1      0.370     0.634     0.467       295

    accuracy                          0.638      1181
   macro avg      0.605     0.637     0.597      1181
weighted avg      0.722     0.638     0.662      1181

Confusion matrix:
 [[567 319]
 [108 187]]

DecisionTree + SMOTE — ROC-AUC: 0.9174 | PR-AUC: 0.7919
              precision    recall  f1-score   support

           0      0.897     0.877     0.887      

In [10]:
aucs_base = {}
for name, clf in models.items():
    if name in ["Logistic", "SVM"]:
        aucs_base[name] = eval_model(name, clf, X_train_scaled, y_train, X_test_scaled, y_test)
    else:
        aucs_base[name] = eval_model(name, clf, X_train, y_train, X_test, y_test)

print("\nBASELINE (no SMOTE) AUCs:", {k: round(v,4) for k,v in aucs_base.items()})


Logistic — ROC-AUC: 0.5309 | PR-AUC: 0.2717
              precision    recall  f1-score   support

           0      0.750     1.000     0.857       886
           1      0.000     0.000     0.000       295

    accuracy                          0.750      1181
   macro avg      0.375     0.500     0.429      1181
weighted avg      0.563     0.750     0.643      1181

Confusion matrix:
 [[886   0]
 [295   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



SVM — ROC-AUC: 0.7344 | PR-AUC: 0.5253
              precision    recall  f1-score   support

           0      0.784     0.966     0.866       886
           1      0.663     0.200     0.307       295

    accuracy                          0.775      1181
   macro avg      0.723     0.583     0.586      1181
weighted avg      0.754     0.775     0.726      1181

Confusion matrix:
 [[856  30]
 [236  59]]

DecisionTree — ROC-AUC: 0.9168 | PR-AUC: 0.7737
              precision    recall  f1-score   support

           0      0.901     0.874     0.887       886
           1      0.652     0.712     0.681       295

    accuracy                          0.833      1181
   macro avg      0.777     0.793     0.784      1181
weighted avg      0.839     0.833     0.836      1181

Confusion matrix:
 [[774 112]
 [ 85 210]]

RandomForest — ROC-AUC: 0.9825 | PR-AUC: 0.9648
              precision    recall  f1-score   support

           0      0.906     0.995     0.949       886
           1   

In [11]:
best_name = best_sm.replace(" + SMOTE","")
best_clf = models[best_name]

if best_name in ["DecisionTree","RandomForest"]:
    if best_name == "RandomForest":
        best_clf.fit(Xtr_tree_sm, ytr_tree_sm)
    else:
        best_clf.fit(Xtr_tree_sm, ytr_tree_sm)

    importances = pd.Series(best_clf.feature_importances_, index=X.columns)
    display(importances.sort_values(ascending=False).head(20).to_frame("importance"))
else:
    print("Best model is not tree-based (no feature_importances_).")

Unnamed: 0,importance
monetary,0.129967
recency,0.120874
compound_last,0.075203
compound_mean,0.068112
neu_mean,0.066586
pos_mean,0.059013
compound_median,0.055646
compound_min,0.05192
neg_mean,0.050346
compound_max,0.050153


In [12]:
# --- SAVE: best churn model + bundle of ALL variants (BASE & SMOTE) ---
import pickle

# helpers
def _fresh(est):
    # new instance with same params (no pipelines)
    return est.__class__(**est.get_params())

def _fit(est, X, y):
    if hasattr(est, "n_features_in_"):  # already fitted
        return est
    est.fit(X, y)
    return est

# 1) Prepare training matrices for each family/variant
X_lr_base,  y_lr_base  = X_train_scaled, y_train         # LR/SVM baseline (scaled)
X_lr_sm,    y_lr_sm    = Xtr_lr_sm, ytr_lr_sm            # LR/SVM + SMOTE (scaled)

X_tree_base,y_tree_base= X_train, y_train                # Trees baseline (unscaled)
X_tree_sm,  y_tree_sm  = Xtr_tree_sm, ytr_tree_sm        # Trees + SMOTE (unscaled)

# 2) Collect your estimators from the `models` dict (with simple fallbacks)
algs = {}
if "models" in globals() and isinstance(models, dict):
    algs.update(models)
else:
    if "lr" in globals(): algs["Logistic"] = lr
    if "svm" in globals(): algs["SVM"] = svm
    if "dt" in globals(): algs["DecisionTree"] = dt
    if "rf" in globals(): algs["RandomForest"] = rf

# 3) Build all fitted variants
all_variants = {}

# LR & SVM (scaled)
for key in ["Logistic", "SVM"]:
    if key in algs:
        m_base = _fresh(algs[key])
        m_sm   = _fresh(algs[key])
        all_variants[f"{key}_BASE"]  = {"model": _fit(m_base, X_lr_base, y_lr_base),
                                        "uses_scaler": True,  "variant": "BASE"}
        all_variants[f"{key}_SMOTE"] = {"model": _fit(m_sm,   X_lr_sm,   y_lr_sm),
                                        "uses_scaler": True,  "variant": "SMOTE"}

# Trees (unscaled)
for key in ["DecisionTree", "RandomForest"]:
    if key in algs:
        m_base = _fresh(algs[key])
        m_sm   = _fresh(algs[key])
        all_variants[f"{key}_BASE"]  = {"model": _fit(m_base, X_tree_base, y_tree_base),
                                        "uses_scaler": False, "variant": "BASE"}
        all_variants[f"{key}_SMOTE"] = {"model": _fit(m_sm,   X_tree_sm,   y_tree_sm),
                                        "uses_scaler": False, "variant": "SMOTE"}

# 4) Best model artifact (same structure you already use)
best_name_core = best_sm.replace(" + SMOTE","")
best_uses_scaler = ("Logistic" in best_sm) or ("SVM" in best_sm)
best_key = f"{best_name_core}_{'SMOTE' if 'SMOTE' in best_sm else 'BASE'}"

if best_key not in all_variants:
    raise RuntimeError(f"Best model '{best_key}' not found among fitted variants.")

best_fitted = all_variants[best_key]["model"]

best_artifacts = {
    "best_model_name": best_sm,
    "feature_columns": list(X.columns),
    "numeric_columns": num_cols,
    "scaler": scaler if best_uses_scaler else None,
    "model": best_fitted
}

with open("../models/churn_model.pkl", "wb") as f:
    pickle.dump(best_artifacts, f)
print("Saved: ../models/churn_model.pkl  (best:", best_sm, ")")

# 5) Save ALL variants bundle (with schema once; scaler included for convenience)
all_bundle = {
    "feature_columns": list(X.columns),
    "numeric_columns": num_cols,
    "scaler": scaler,                    # only used when 'uses_scaler' == True
    "models": all_variants               # name -> {model, uses_scaler, variant}
}

with open("../models/churn_models_all.pkl", "wb") as f:
    pickle.dump(all_bundle, f)
print("Saved: ../models/churn_models_all.pkl  (variants:", ", ".join(sorted(all_variants.keys())), ")")

Saved: ../models/churn_model.pkl  (best: RandomForest )
Saved: ../models/churn_models_all.pkl  (variants: DecisionTree_BASE, DecisionTree_SMOTE, Logistic_BASE, Logistic_SMOTE, RandomForest_BASE, RandomForest_SMOTE, SVM_BASE, SVM_SMOTE )
