# 03 — Churn Proxy Modeling (Random Forest + XGBoost)

This notebook builds a churn-risk proxy model using RFM features.
Because the dataset has only invoice dates and no explicit churn flag,
we hold out the final 60 days as a future observation window and define churn as
**no purchase in that window**; RFM features use only transactions on or before the cutoff.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# 1) Load cleaned transactions (must include: customerid, invoiceno, invoicedate, sales)

df = pd.read_csv("data/processed/transactions_cleaned.csv")

# Fail fast with a clear message if the file lacks any column that the
# windowing, RFM and labelling cells below read; otherwise the problem only
# shows up later as a KeyError far from its cause.
required_columns = {"customerid", "invoiceno", "invoicedate", "sales"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"transactions_cleaned.csv is missing required columns: {sorted(missing_columns)}"
    )

# Parse invoice dates so they can be compared against the cutoff timestamp.
df["invoicedate"] = pd.to_datetime(df["invoicedate"])


In [None]:
# 2) Choose churn window: the last N days of data are held out as the "future"

N = 60  # days
last_invoice_date = df["invoicedate"].max()
T = last_invoice_date - pd.Timedelta(days=N)  # cutoff between history and future
future_end = T + pd.Timedelta(days=N)

# Row masks: history is everything up to and including T, future is (T, T + N days].
on_or_before_cutoff = df["invoicedate"] <= T
inside_future_window = (df["invoicedate"] > T) & (df["invoicedate"] <= future_end)

history = df.loc[on_or_before_cutoff].copy()
future = df.loc[inside_future_window].copy()

# Recency is measured from the day after the cutoff.
snapshot_date = T + pd.Timedelta(days=1)

In [None]:
# 3) Build RFM features using ONLY history (no future leakage)

# Vectorized per-customer reductions: built-in groupby aggregations instead of
# a Python lambda called once per customer.
history_by_customer = history.groupby("customerid")
last_purchase = history_by_customer["invoicedate"].max()

rfm = pd.DataFrame(
    {
        # Whole days between the customer's last purchase and the snapshot date.
        "recency": (snapshot_date - last_purchase).dt.days.astype("int64"),
        # Number of distinct invoices in the history window.
        "frequency": history_by_customer["invoiceno"].nunique(),
        # Total sales in the history window.
        "monetary": history_by_customer["sales"].sum(),
    }
)

# history only holds rows dated on or before the cutoff and the snapshot is one
# day after it, so every recency must be at least 1; anything smaller means
# future rows leaked into the features.
assert (rfm["recency"] >= 1).all(), "recency < 1 day: history contains rows after the cutoff"


In [None]:
# 4) Create TRUE churn label from the future window

# A customer is labelled churned (1) when their id does not appear among the
# buyers of the future window, and retained (0) when it does.
future_buyers = set(future["customerid"].unique())
rfm["churn"] = (~rfm.index.isin(future_buyers)).astype(int)  # 1 = churned

# Step 5 (train/test split) lives in the following cell only; it was previously
# duplicated verbatim here.


In [25]:
# 5) Train/test split
feature_columns = ["recency", "frequency", "monetary"]
X = rfm[feature_columns]
y = rfm["churn"]

# Hold out 20% of customers, stratified on the churn label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 6) Model 1: Random Forest
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

# Second predict_proba column = score for the positive (churn = 1) class.
proba = rf.predict_proba(X_test)[:, 1]
pred = rf.predict(X_test)

print("RandomForest")
print(classification_report(y_test, pred))
print("ROC-AUC:", roc_auc_score(y_test, proba))
print("PR-AUC:", average_precision_score(y_test, proba))


RandomForest
              precision    recall  f1-score   support

           0       0.60      0.56      0.58       358
           1       0.62      0.65      0.63       389

    accuracy                           0.61       747
   macro avg       0.61      0.60      0.60       747
weighted avg       0.61      0.61      0.61       747

ROC-AUC: 0.6682871134982982
PR-AUC: 0.6633513623681517


In [26]:
# 7) Model 2: XGBoost
xgb_params = {
    "n_estimators": 500,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "logloss",
}
xgb = XGBClassifier(**xgb_params).fit(X_train, y_train)

# Second predict_proba column = score for the positive (churn = 1) class.
proba = xgb.predict_proba(X_test)[:, 1]
pred = xgb.predict(X_test)

print("\nXGBoost")
print(classification_report(y_test, pred))
print("ROC-AUC:", roc_auc_score(y_test, proba))
print("PR-AUC:", average_precision_score(y_test, proba))



XGBoost
              precision    recall  f1-score   support

           0       0.67      0.57      0.62       358
           1       0.65      0.74      0.69       389

    accuracy                           0.66       747
   macro avg       0.66      0.66      0.65       747
weighted avg       0.66      0.66      0.66       747

ROC-AUC: 0.7210617397423561
PR-AUC: 0.7112887370335406


In [28]:
# Prediction (what the model outputs)

# For each customer, we predict the probability that they will churn — i.e., that they will make no purchase in the next 60 days after a chosen cutoff date.
# Formally:

# Target (y): churn_60d

# 1 = customer did not place any order in the 60-day future window after cutoff T

# 0 = customer did place at least one order in that window

# Inputs (X): RFM features computed from transactions on or before T

# recency = days since last purchase (measured from the snapshot date, T + 1 day)

# frequency = number of distinct invoices (on or before T)

# monetary = total spend (on or before T)

In [30]:
# This model predicts churn risk (60-day inactivity) for each customer using historical RFM features, 
# and outputs a churn probability to prioritize retention campaigns.