In [1]:
# --- CRISP-DM 1/6: BUSINESS UNDERSTANDING ---
"""
Goal: Predict customer churn for a telco to reduce attrition.
Primary metric: Recall (identify churners) with balanced Precision → F1.
Secondary: Business-friendly explanation (feature importances, SHAP-like via permutation).
Deployment target: A saved sklearn Pipeline (model.pkl) + threshold from validation.
"""

# --- Setup ---
import os, json, zipfile, urllib.request, io, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_fscore_support, RocCurveDisplay
from sklearn.metrics import average_precision_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

os.makedirs("artifacts", exist_ok=True)
os.makedirs("data", exist_ok=True)
os.makedirs("artifacts/figs", exist_ok=True)

In [7]:
# --- CRISP-DM 2/6: DATA UNDERSTANDING ---
# Working link for IBM Telco Customer Churn dataset
import urllib.request, os, pandas as pd

URL = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
csv_path = "data/telco_churn.csv"

os.makedirs("data", exist_ok=True)

if not os.path.exists(csv_path):
    urllib.request.urlretrieve(URL, csv_path)

df = pd.read_csv(csv_path)
print(df.head(3))
print(df.shape, df['Churn'].value_counts(normalize=True))



   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract PaperlessBilling  \
0          No          No              No  Month-to-month              Yes   
1          No          No              No        One year               No   
2          No          No              No  Month-to-month              Yes   

      PaymentMethod MonthlyCharges  TotalCharges Churn  
0  E

In [8]:
# --- CRISP-DM 3/6: DATA PREPARATION ---
target = 'Churn'
y = df[target].map({'Yes':1,'No':0}).values
X = df.drop(columns=[target,'customerID'])

num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(exclude=['number']).columns.tolist()

numeric = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric, num_cols),
        ("cat", categorical, cat_cols)
    ]
)

# train/validation/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)


In [11]:
# --- CRISP-DM 4/6: MODELING ---
models = {
    "logreg": LogisticRegression(max_iter=200, n_jobs=None),
    "rf": RandomForestClassifier(n_estimators=400, random_state=42, class_weight="balanced"),
    "xgb": XGBClassifier(
        n_estimators=400, max_depth=5, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9,
        eval_metric="logloss", random_state=42, scale_pos_weight=float((y_train==0).sum()/(y_train==1).sum())
    )
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {}
for name, est in models.items():
    pipe = Pipeline(steps=[("prep", preprocess), ("model", est)])
    auc = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="roc_auc")
    ap = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="average_precision")
    cv_results[name] = {"roc_auc_mean": float(auc.mean()), "ap_mean": float(ap.mean())}
cv_results
# pick best by Average Precision (prioritizes recall on positives)
best_name = max(cv_results, key=lambda k: cv_results[k]["ap_mean"])
best_name, cv_results[best_name]

# Fit best on train, choose threshold by maximizing F1 on validation (split from train)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

best_est = models[best_name]
best_pipe = Pipeline(steps=[("prep", preprocess), ("model", best_est)])
best_pipe.fit(X_tr, y_tr)

val_proba = best_pipe.predict_proba(X_val)[:,1]
thresholds = np.linspace(0.1, 0.9, 81)
f1s = []
for t in thresholds:
    preds = (val_proba >= t).astype(int)
    p,r,f,_ = precision_recall_fscore_support(y_val, preds, average="binary", zero_division=0)
    f1s.append(f)
t_opt = float(thresholds[int(np.argmax(f1s))])
t_opt



0.35

In [15]:
# --- CRISP-DM 5/6: EVALUATION ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    average_precision_score,
    RocCurveDisplay,
)
from sklearn.inspection import permutation_importance
import json
import joblib

# --- Evaluate model performance on test set ---
test_proba = best_pipe.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= t_opt).astype(int)

print(f"\nClassification report @ threshold {t_opt:.2f}")
print(classification_report(y_test, test_pred, digits=3))
print("ROC AUC:", roc_auc_score(y_test, test_proba))
print("Average Precision:", average_precision_score(y_test, test_proba))

# --- ROC curve ---
plt.figure()
RocCurveDisplay.from_predictions(y_test, test_proba)
plt.title("ROC Curve - Test Set")
plt.tight_layout()
plt.savefig("artifacts/figs/roc_test.png")
plt.close()

# --- FIXED Permutation Importance (on dense transformed features) ---
print("\nComputing permutation importances correctly on transformed features...")

# Separate preprocessing and model
prep = best_pipe.named_steps["prep"]
model = best_pipe.named_steps["model"]

# Transform test data and convert to dense array
X_test_transformed = prep.transform(X_test)
if hasattr(X_test_transformed, "toarray"):
    X_test_transformed = X_test_transformed.toarray()

# Get correct feature names from transformers
cat_transformer = prep.named_transformers_["cat"]
cat_encoder = cat_transformer.named_steps["onehot"]
cat_feature_names = list(cat_encoder.get_feature_names_out(cat_cols))
num_feature_names = list(num_cols)
all_feature_names = num_feature_names + cat_feature_names

# Compute permutation importance on model
perm = permutation_importance(
    model,
    X_test_transformed,
    y_test,
    n_repeats=10,
    random_state=42,
    scoring="f1"
)

# Create sorted importance dataframe
imp = pd.Series(perm.importances_mean, index=all_feature_names).sort_values(ascending=False)[:20]

# Plot top 20 important features
plt.figure(figsize=(7, 6))
imp[::-1].plot(kind="barh")
plt.title("Permutation Importance (Top 20 Features)")
plt.xlabel("Mean Importance (F1-based)")
plt.tight_layout()
plt.savefig("artifacts/figs/perm_importance.png")
plt.close()

print("Permutation importance saved → artifacts/figs/perm_importance.png")

# --- Save evaluation metrics and model metadata ---
metrics = {
    "cv_results": cv_results,
    "chosen_model": best_name,
    "threshold": float(t_opt),
    "roc_auc": float(roc_auc_score(y_test, test_proba)),
    "average_precision": float(average_precision_score(y_test, test_proba))
}

with open("artifacts/metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Save model with preprocessing pipeline
joblib.dump({"pipeline": best_pipe, "threshold": t_opt}, "artifacts/model.pkl")

print("\n✅ Evaluation complete.")
print("Artifacts saved:")
print(" - artifacts/figs/roc_test.png")
print(" - artifacts/figs/perm_importance.png")
print(" - artifacts/metrics.json")
print(" - artifacts/model.pkl")



Classification report @ threshold 0.35
              precision    recall  f1-score   support

           0      0.879     0.793     0.834      1035
           1      0.549     0.698     0.615       374

    accuracy                          0.768      1409
   macro avg      0.714     0.746     0.724      1409
weighted avg      0.792     0.768     0.776      1409

ROC AUC: 0.8419902348291095
Average Precision: 0.6368106415326183

Computing permutation importances correctly on transformed features...
Permutation importance saved → artifacts/figs/perm_importance.png

✅ Evaluation complete.
Artifacts saved:
 - artifacts/figs/roc_test.png
 - artifacts/figs/perm_importance.png
 - artifacts/metrics.json
 - artifacts/model.pkl


<Figure size 640x480 with 0 Axes>

In [16]:
# --- CRISP-DM 6/6: DEPLOYMENT ARTIFACT ---
import joblib
joblib.dump({"pipeline": best_pipe, "threshold": t_opt}, "artifacts/model.pkl")
print("Saved artifacts/model.pkl and artifacts/metrics.json")


Saved artifacts/model.pkl and artifacts/metrics.json
