# Feature 3.3 — Model Training (Churn + CLV)

This notebook trains baseline models for churn (classification) and CLV (regression). It currently runs on synthetic data only, as a fallback until real data is wired in; replace the synthetic generator with Delta/Spark reads when available.

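Artifacts (written to `OUT_DIR`): metrics CSVs, ROC/PR curves, calibration (reliability) plot, and permutation feature importances. Serialized models are planned but are not yet written by this notebook.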

In [None]:
# Config
import os, sys, math, json, random, time, pathlib, warnings
import numpy as np, pandas as pd
from dataclasses import dataclass

# Run-level configuration. Both values can be overridden through environment
# variables so the notebook can be re-run without editing it.
SEED = int(os.environ.get('SEED', '42'))
OUT_DIR = os.environ.get('OUT_DIR', 'artifacts/feature_3_3')

# Seed both the stdlib and the legacy NumPy global generators.
random.seed(SEED)
np.random.seed(SEED)

# Make sure the artifact directory exists before any cell writes into it.
_out_path = pathlib.Path(OUT_DIR)
_out_path.mkdir(parents=True, exist_ok=True)

print('SEED', SEED, 'OUT_DIR', OUT_DIR)

In [None]:
# Synthetic fallback data generator (binary churn + positive CLV)
def make_synthetic(n=5000, p=20, churn_rate=0.18, seed=SEED):
    """Generate a synthetic customer table with a churn flag and a CLV target.

    Parameters
    ----------
    n : int
        Number of rows.
    p : int
        Number of standard-normal feature columns (named ``f0`` .. ``f{p-1}``).
        The churn signal reads columns 0-2, so ``p`` must be at least 3.
    churn_rate : float
        Base rate blended 50/50 with the feature-driven churn probability.
        Because it is only half of the blend, the realized churn rate will
        generally differ from this value.
    seed : int
        Seed for the local ``numpy`` Generator.

    Returns
    -------
    pandas.DataFrame
        The ``p`` feature columns followed by ``churn`` (0/1) and ``clv`` (>= 0).
    """
    gen = np.random.default_rng(seed)
    feats = gen.normal(size=(n, p))

    # Churn: logistic signal driven by the first three features ...
    score = 0.6*feats[:, 0] - 0.4*feats[:, 1] + 0.8*feats[:, 2] - 0.2
    signal_prob = 1/(1+np.exp(-score))
    # ... averaged with the flat base rate, then sampled as Bernoulli draws.
    churn_prob = 0.5*signal_prob + 0.5*churn_rate
    churn_flags = gen.binomial(1, np.clip(churn_prob, 0, 1))

    # CLV: linear in the first two features plus Gaussian noise, floored at zero.
    noise = gen.normal(size=n)
    clv_values = np.maximum(0, 200 + 30*feats[:, 0] + 20*feats[:, 1] + 5*noise)

    feature_names = [f'f{i}' for i in range(p)]
    frame = pd.DataFrame(feats, columns=feature_names)
    return frame.assign(churn=churn_flags, clv=clv_values)

# Build the working frame with the generator's defaults (n=5000, p=20, seed=SEED).
df = make_synthetic()
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

In [None]:
# Train/valid/test split (60/20/20) using fixed seeds.
#
# Row indices are split ONCE (stratified on churn) and the same partitions are
# reused for both targets. The previous version paired X_train with
# y_reg[:len(X_train)], which mismatched features and CLV labels, and drew the
# regression valid/test rows from a second, independent split that overlapped
# the training rows.
from sklearn.model_selection import train_test_split
features = [c for c in df.columns if c.startswith('f')]
X = df[features].values
y_cls = df['churn'].values
y_reg = df['clv'].values

idx_all = np.arange(len(df))
# 60% train / 40% holdout, then the holdout is halved into valid and test.
idx_train, idx_tmp = train_test_split(
    idx_all, test_size=0.4, random_state=SEED, stratify=y_cls)
idx_valid, idx_test = train_test_split(
    idx_tmp, test_size=0.5, random_state=SEED, stratify=y_cls[idx_tmp])

# Classification views.
X_train, X_valid, X_test = X[idx_train], X[idx_valid], X[idx_test]
y_train_cls, y_valid_cls, y_test_cls = y_cls[idx_train], y_cls[idx_valid], y_cls[idx_test]

# Regression views: same rows, CLV target indexed with the same positions so
# every feature row stays paired with its own label.
X_train_reg, X_valid_reg, X_test_reg = X_train, X_valid, X_test
y_train_reg, y_valid_reg, y_test_reg = y_reg[idx_train], y_reg[idx_valid], y_reg[idx_test]

# Sanity checks: the three partitions cover every row exactly once.
_train_rows, _valid_rows, _test_rows = set(idx_train), set(idx_valid), set(idx_test)
assert len(_train_rows) + len(_valid_rows) + len(_test_rows) == len(df)
assert _train_rows.isdisjoint(_valid_rows)
assert _train_rows.isdisjoint(_test_rows)
assert _valid_rows.isdisjoint(_test_rows)
assert len(X_train_reg) == len(y_train_reg)

print(X_train.shape, X_valid.shape, X_test.shape)

In [None]:
# Baselines
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_recall_curve, roc_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Churn baseline: score every test row with the training-set churn prevalence.
p_base = y_train_cls.mean()
_const_churn_scores = np.full(y_test_cls.shape, p_base, dtype=float)
auc_base = roc_auc_score(y_test_cls, _const_churn_scores)
aupr_base = average_precision_score(y_test_cls, _const_churn_scores)

# CLV baseline: predict the training-set mean CLV for every test row.
_const_clv_pred = np.full(y_test_reg.shape, y_train_reg.mean(), dtype=float)
rmse_base = math.sqrt(mean_squared_error(y_test_reg, _const_clv_pred))
mae_base = mean_absolute_error(y_test_reg, _const_clv_pred)

print('Baselines:', {
    'auc_base': auc_base,
    'aupr_base': aupr_base,
    'rmse_base': rmse_base,
    'mae_base': mae_base,
})

In [None]:
# Train churn Logistic Regression with calibration
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

# Class-weighted logistic regression, wrapped in 5-fold isotonic calibration.
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=SEED)
cal_lr = CalibratedClassifierCV(lr, method='isotonic', cv=5)
cal_lr.fit(X_train, y_train_cls)

# Column 1 of predict_proba holds the positive-class (churn) probability.
_test_proba = cal_lr.predict_proba(X_test)
probs_test = _test_proba[:, 1]

# Ranking metrics use the probabilities; accuracy uses a 0.5 cut-off.
_hard_labels = (probs_test >= 0.5).astype(int)
auc = roc_auc_score(y_test_cls, probs_test)
aupr = average_precision_score(y_test_cls, probs_test)
acc = accuracy_score(y_test_cls, _hard_labels)

print('Churn metrics:', {'AUC': auc, 'AUCPR': aupr, 'ACC': acc})

In [None]:
# Bootstrap CI for AUC: resample the test rows with replacement B times and
# take the 2.5th / 97.5th percentiles of the resampled AUCs.
rng = np.random.default_rng(SEED)
B=300
idx = np.arange(len(y_test_cls))
aucs=[]
for _ in range(B):
    s = rng.choice(idx, size=len(idx), replace=True)
    y_boot = y_test_cls[s]
    # ROC AUC is undefined when a resample contains a single class (possible
    # with small or heavily imbalanced test sets); skip those draws instead of
    # letting roc_auc_score raise or return NaN.
    if y_boot.min() == y_boot.max():
        continue
    aucs.append(roc_auc_score(y_boot, probs_test[s]))
if aucs:
    lo,hi = np.percentile(aucs,[2.5,97.5])
else:
    # No usable resample at all: report an undefined interval rather than crash.
    lo = hi = float('nan')
print('AUC 95% CI:', lo, hi)

In [None]:
# Plots: ROC/PR and reliability
import matplotlib.pyplot as plt
fpr,tpr,_ = roc_curve(y_test_cls, probs_test)
prec,rec,_ = precision_recall_curve(y_test_cls, probs_test)

# ROC curve
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel('FPR'); plt.ylabel('TPR'); plt.title('ROC')
plt.savefig(f'{OUT_DIR}/roc.png', dpi=150)
plt.close()

# Precision-recall curve
plt.figure()
plt.plot(rec, prec)
plt.xlabel('Recall'); plt.ylabel('Precision'); plt.title('PR')
plt.savefig(f'{OUT_DIR}/pr.png', dpi=150)
plt.close()

# Reliability diagram over 10 equal-width probability bins.
bins = np.linspace(0,1,11)
# Digitize against the 9 interior edges only, so bin ids are always 0..9.
# Using all 11 edges and subtracting 1 sent a probability of exactly 1.0 to
# bin 10, which the loops below never visit, silently dropping those rows.
digit = np.digitize(probs_test, bins[1:-1])
# Empty bins are kept as NaN so they leave a gap instead of a fake point.
avg_p = [probs_test[digit==i].mean() if np.any(digit==i) else np.nan for i in range(10)]
avg_y = [y_test_cls[digit==i].mean() if np.any(digit==i) else np.nan for i in range(10)]
plt.figure()
plt.plot(avg_p, avg_y, 'o-')
plt.plot([0,1],[0,1],'--')  # perfect-calibration diagonal
plt.xlabel('Pred prob'); plt.ylabel('Observed rate'); plt.title('Reliability')
plt.savefig(f'{OUT_DIR}/reliability.png', dpi=150)
plt.close()

In [None]:
# Train CLV RandomForest
from sklearn.ensemble import RandomForestRegressor
# 200-tree random forest on the regression training split, using all cores.
rf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=SEED)
rf.fit(X_train_reg, y_train_reg)

# Score the held-out regression test rows.
pred_test = rf.predict(X_test_reg)
_test_mse = mean_squared_error(y_test_reg, pred_test)
rmse = math.sqrt(_test_mse)
mae = mean_absolute_error(y_test_reg, pred_test)
r2 = r2_score(y_test_reg, pred_test)

print('CLV metrics:', {'RMSE': rmse, 'MAE': mae, 'R2': r2})

In [None]:
# Permutation importance (fast approximation)
from sklearn.inspection import permutation_importance
def _importance_table(result):
    """Return a feature/importance frame sorted from most to least important."""
    table = pd.DataFrame({'feature': features, 'importance': result.importances_mean})
    return table.sort_values('importance', ascending=False)

# Five shuffles per feature on the test split for each model.
pi_cls = permutation_importance(cal_lr, X_test, y_test_cls, n_repeats=5, random_state=SEED)
pi_reg = permutation_importance(rf, X_test_reg, y_test_reg, n_repeats=5, random_state=SEED)

imp_cls = _importance_table(pi_cls)
imp_reg = _importance_table(pi_reg)

imp_cls.to_csv(f'{OUT_DIR}/importance_churn.csv', index=False)
imp_reg.to_csv(f'{OUT_DIR}/importance_clv.csv', index=False)

In [None]:
# Save metrics and simple artifacts
# One-row CSV per task: model metrics alongside their baselines.
_churn_row = {
    'auc': auc, 'aupr': aupr, 'acc': acc,
    'auc_base': auc_base, 'aupr_base': aupr_base,
}
_clv_row = {
    'rmse': rmse, 'mae': mae, 'r2': r2,
    'rmse_base': rmse_base, 'mae_base': mae_base,
}
for _row, _target in (
    (_churn_row, f'{OUT_DIR}/churn_metrics.csv'),
    (_clv_row, f'{OUT_DIR}/clv_metrics.csv'),
):
    pd.DataFrame([_row]).to_csv(_target, index=False)

In [None]:
# Optional: MLflow logging if available
# Best-effort tracking: any failure (including mlflow not being installed)
# is downgraded to a warning so the notebook still completes.
try:
    import mlflow
    mlflow.set_experiment('feature_3_3_model_training')

    # Run 1: calibrated churn classifier with its plots and importances.
    _churn_files = ('roc.png', 'pr.png', 'reliability.png', 'importance_churn.csv')
    with mlflow.start_run(run_name='churn_lr_calibrated'):
        mlflow.log_params({'seed': SEED, 'model': 'LogisticRegression+Calibrated'})
        mlflow.log_metrics({'auc': float(auc), 'aupr': float(aupr), 'acc': float(acc)})
        for _fname in _churn_files:
            mlflow.log_artifact(f'{OUT_DIR}/{_fname}')

    # Run 2: CLV random forest with its importances.
    with mlflow.start_run(run_name='clv_rf'):
        mlflow.log_params({'seed': SEED, 'model': 'RandomForestRegressor', 'n_estimators': 200})
        mlflow.log_metrics({'rmse': float(rmse), 'mae': float(mae), 'r2': float(r2)})
        mlflow.log_artifact(f'{OUT_DIR}/importance_clv.csv')
except Exception as e:
    warnings.warn(f'MLflow logging skipped: {e}')

Next steps: tune 1–2 hyperparameters, compute segment-wise metrics, and prepare the scoring contract preview for Feature 3.4.