In [1]:
!pip install xgboost scikit-learn shap matplotlib pandas joblib
!pip install --upgrade shap --no-deps
!pip install numpy pandas scikit-learn matplotlib joblib xgboost




In [7]:
# cell 1 - imports & configuration
import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
import shap

import xgboost as xgb   # <-- ADD THIS HERE

# paths
DATA_PATH = "/content/drive/MyDrive/WA_Fn-UseC_-Telco-Customer-Churn.csv"
OUT_DIR = "/content/churn_project_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

RANDOM_STATE = 42


In [3]:
#load and clean
df = pd.read_csv(DATA_PATH)
print("Initial shape:", df.shape)

# Convert TotalCharges to numeric (it has spaces)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print("TotalCharges missing:", df['TotalCharges'].isna().sum())

# Drop rows with missing TotalCharges
df = df.dropna(subset=['TotalCharges']).reset_index(drop=True)

# Drop customerID
if 'customerID' in df.columns:
    df = df.drop(columns=['customerID'])

# Encode target
df['Churn'] = df['Churn'].map({'Yes':1, 'No':0})

print("Cleaned shape:", df.shape)
print(df.dtypes)


Initial shape: (7043, 21)
TotalCharges missing: 11
Cleaned shape: (7032, 20)
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object


In [4]:
#split and columns
# separate X,y
X = df.drop(columns=['Churn'])
y = df['Churn']

# numeric & categorical lists
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=RANDOM_STATE)
print("Train/test sizes:", X_train.shape, X_test.shape)


Numeric columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Train/test sizes: (5625, 19) (1407, 19)


In [5]:
#preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])


In [6]:
#model & hyperparam tuning
xgb_clf = xgb.XGBClassifier(objective='binary:logistic',
                            eval_metric='logloss',
                            use_label_encoder=False,
                            random_state=RANDOM_STATE,
                            n_jobs=4)

pipe = Pipeline(steps=[('preproc', preprocessor), ('clf', xgb_clf)])

param_dist = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__subsample': [0.6, 0.8, 1.0],
    'clf__colsample_bytree': [0.5, 0.7, 1.0],
    'clf__reg_alpha': [0, 0.1, 1],
    'clf__reg_lambda': [1, 5, 10]
}

rs = RandomizedSearchCV(pipe, param_distributions=param_dist,
                        n_iter=20, scoring='roc_auc',
                        cv=3, random_state=RANDOM_STATE, n_jobs=1, verbose=2)

rs.fit(X_train, y_train)

print("Best CV AUC:", rs.best_score_)
print("Best params:", rs.best_params_)

best_model = rs.best_estimator_


Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=300, clf__reg_alpha=1, clf__reg_lambda=10, clf__subsample=0.8; total time=   1.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=300, clf__reg_alpha=1, clf__reg_lambda=10, clf__subsample=0.8; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=300, clf__reg_alpha=1, clf__reg_lambda=10, clf__subsample=0.8; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=300, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=300, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=300, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=0.1, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=0.1, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=0.1, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=0.6; total time=   2.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=0.6; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=10, clf__subsample=0.8; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=10, clf__subsample=0.8; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=10, clf__subsample=0.8; total time=   0.5s
[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.1s
[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=100, clf__reg_alpha=0, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.2s
[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=100, clf__reg_alpha=0, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=100, clf__reg_alpha=0, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.2s
[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.1s
[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=10, clf__subsample=0.6; total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=7, clf__n_estimators=200, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=1.0; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=7, clf__n_estimators=200, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=1.0; total time=   0.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=7, clf__n_estimators=200, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=1.0; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.01, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=0.8; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.01, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=0.8; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.01, clf__max_depth=5, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=0.8; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=1.0; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=1.0; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=1.0; total time=   0.7s
[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=200, clf__reg_alpha=0.1, clf__reg_lambda=5, clf__subsample=0.8; total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=200, clf__reg_alpha=0.1, clf__reg_lambda=5, clf__subsample=0.8; total time=   0.2s
[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=200, clf__reg_alpha=0.1, clf__reg_lambda=5, clf__subsample=0.8; total time=   0.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=5, clf__subsample=0.6; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=5, clf__subsample=0.6; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=5, clf__subsample=0.6; total time=   0.3s
[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.1s
[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=0.8; total time=   0.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=0.8; total time=   0.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.01, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0.1, clf__reg_lambda=1, clf__subsample=0.8; total time=   1.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.7, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=200, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=1.0; total time=   0.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.05, clf__max_depth=7, clf__n_estimators=300, clf__reg_alpha=0, clf__reg_lambda=1, clf__subsample=0.6; total time=   0.5s
[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=0.6; total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=0.6; total time=   0.1s
[CV] END clf__colsample_bytree=0.5, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__reg_alpha=1, clf__reg_lambda=5, clf__subsample=0.6; total time=   0.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best CV AUC: 0.849308866466293
Best params: {'clf__subsample': 1.0, 'clf__reg_lambda': 5, 'clf__reg_alpha': 1, 'clf__n_estimators': 100, 'clf__max_depth': 3, 'clf__learning_rate': 0.1, 'clf__colsample_bytree': 0.7}


In [8]:
#evaluate and save
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:,1]

auc = roc_auc_score(y_test, y_proba)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test AUC: {:.4f}".format(auc))
print("Test Accuracy: {:.4f}".format(acc))
print("Test F1: {:.4f}".format(f1))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Save pipeline and metrics
joblib.dump(best_model, os.path.join(OUT_DIR, 'xgb_pipeline.joblib'))
with open(os.path.join(OUT_DIR, 'report.json'), 'w') as f:
    json.dump({
        'test_auc': auc,
        'test_accuracy': acc,
        'test_f1': f1,
        'best_params': {k.replace('clf__',''): v for k,v in rs.best_params_.items()}
    }, f, indent=2)

print("Saved pipeline and report to", OUT_DIR)


Test AUC: 0.8402
Test Accuracy: 0.7974
Test F1: 0.5803

Classification report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1033
           1       0.65      0.53      0.58       374

    accuracy                           0.80      1407
   macro avg       0.74      0.71      0.72      1407
weighted avg       0.79      0.80      0.79      1407

Saved pipeline and report to /content/churn_project_outputs


In [9]:
#feature names for SHAP
preproc = best_model.named_steps['preproc']
clf = best_model.named_steps['clf']

# numeric names
num_features = num_cols

# categorical names from OneHotEncoder
ohe = preproc.named_transformers_['cat']
try:
    cat_feature_names = ohe.get_feature_names_out(cat_cols).tolist()
except:
    cat_feature_names = []
    for i, col in enumerate(cat_cols):
        cats = ohe.categories_[i]
        cat_feature_names += [f"{col}_{c}" for c in cats]

feature_names = num_features + cat_feature_names
print("Total features after preprocessing:", len(feature_names))


Total features after preprocessing: 45


In [10]:
#SHAP explanation generation
import shap

# Transform a sample for speed (but we will use full test set for final)
X_train_trans = preproc.transform(X_train)
X_test_trans = preproc.transform(X_test)

# Create TreeExplainer for the xgboost booster
# Get the underlying Booster/estimator for faster SHAP interaction
# clf is an XGBClassifier sklearn wrapper; shap.TreeExplainer works with clf.get_booster() or clf
explainer = shap.TreeExplainer(clf)  # for XGBoost this is fine

# Compute SHAP values on test set (consider sampling to speed up)
sample_for_shap = pd.DataFrame(X_test_trans, columns=feature_names)
# If dataset is large, sample 200-1000 rows: shap.sample(sample_for_shap, 500)
shap_values = explainer.shap_values(sample_for_shap)

# Global summary plots
plt.figure(figsize=(8,6))
shap.summary_plot(shap_values, sample_for_shap, plot_type='bar', show=False)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'shap_summary_bar.png'))
plt.close()

plt.figure(figsize=(10,6))
shap.summary_plot(shap_values, sample_for_shap, show=False)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'shap_summary_beeswarm.png'))
plt.close()

print("Saved global SHAP summary plots to", OUT_DIR)


Saved global SHAP summary plots to /content/churn_project_outputs


In [11]:
#pick representative customers and save local SHAP plots
# compute probabilities for all test set
y_proba_all = best_model.predict_proba(X_test)[:,1]
probs = pd.Series(y_proba_all, index=X_test.index)

# choose high, low, marginal
high_candidates = probs[probs >= 0.8].index.tolist()
low_candidates = probs[probs <= 0.2].index.tolist()
marg_candidates = probs[(probs > 0.45) & (probs < 0.55)].index.tolist()

def pick(cands, default):
    return cands[0] if len(cands) else default

default_high = X_test.index[0]
default_low = X_test.index[-1]
default_marg = X_test.index[len(X_test)//2]

idx_high = pick(high_candidates, default_high)
idx_low = pick(low_candidates, default_low)
idx_marg = pick(marg_candidates, default_marg)

selected_indices = [idx_high, idx_marg, idx_low]
print("Selected indices:", selected_indices)

# For each selected user: compute SHAP for that single row
for idx in selected_indices:
    xrow = X_test.loc[[idx]]
    xrow_trans = preproc.transform(xrow)
    xrow_df = pd.DataFrame(xrow_trans, columns=feature_names)
    shap_vals_row = explainer.shap_values(xrow_df)  # shape (1, n_features)

    # Waterfall plot (PNG)
    plt.figure(figsize=(10,4))
    # shap.plots.waterfall expects an Explanation object for modern shap versions
    try:
        shap.plots.waterfall(shap.Explanation(values=shap_vals_row[0],
                                             base_values=explainer.expected_value,
                                             data=xrow_df.iloc[0].values,
                                             feature_names=feature_names),
                             show=False)
    except Exception as e:
        # fallback: use force_plot saved as PNG via matplotlib if waterfall not available
        print("waterfall plotting error (fallback):", e)
        shp = shap.TreeExplainer(clf).shap_values(xrow_df)
        shap.summary_plot(shp, xrow_df, show=False)  # small summary as fallback
    plt.tight_layout()
    png_path = os.path.join(OUT_DIR, f"local_shap_waterfall_idx_{idx}.png")
    plt.savefig(png_path)
    plt.close()

    # Force plot saved as HTML (interactive)
    force_html = os.path.join(OUT_DIR, f"local_shap_force_idx_{idx}.html")
    # create force plot and save as html
    f = shap.force_plot(explainer.expected_value, shap_vals_row[0], xrow_df.iloc[0], feature_names=feature_names)
    shap.save_html(force_html, f)
    print(f"Saved waterfall PNG and force HTML for idx {idx}")


Selected indices: [3676, 2034, 971]
Saved waterfall PNG and force HTML for idx 3676
Saved waterfall PNG and force HTML for idx 2034
Saved waterfall PNG and force HTML for idx 971


In [12]:
#save selected customers and create gitingest markdown
selected_df = X_test.loc[selected_indices].copy()
selected_df['churn_true'] = y_test.loc[selected_indices].values
selected_df['churn_prob'] = best_model.predict_proba(selected_df.drop(columns=['churn_true','churn_prob'], errors='ignore'))[:,1]

selected_df.to_csv(os.path.join(OUT_DIR, 'selected_customers.csv'), index=True)

# Prepare a template for submission (you will expand Analysis/Executive Summary)
md = f"""# Interpretable ML: SHAP Explanations for Telco Churn Prediction

**Dataset:** WA_Fn-UseC_-Telco-Customer-Churn.csv

## Model & Training
- Model: XGBoost via scikit-learn Pipeline
- Best hyperparameters (RandomizedSearchCV): { {k.replace('clf__',''): v for k,v in rs.best_params_.items()} }
- Test AUC: {auc:.4f}, Test Accuracy: {acc:.4f}, Test F1: {f1:.4f}

## Outputs in {OUT_DIR}
- Trained pipeline: xgb_pipeline.joblib
- Global SHAP plots: shap_summary_bar.png, shap_summary_beeswarm.png
- Local SHAP: local_shap_waterfall_idx_*.png and local_shap_force_idx_*.html
- Selected customers CSV: selected_customers.csv

## Instructions
Open the HTML force plots in a web browser to inspect interactive explanations. Include the PNGs and your textual interpretations in the submission box.

"""

with open(os.path.join(OUT_DIR,'gitingest_template.md'),'w') as f:
    f.write(md)

print("Saved Gitingest template and outputs to", OUT_DIR)
print("Files:", os.listdir(OUT_DIR))


Saved Gitingest template and outputs to /content/churn_project_outputs
Files: ['report.json', 'local_shap_waterfall_idx_971.png', 'xgb_pipeline.joblib', 'local_shap_waterfall_idx_3676.png', 'local_shap_force_idx_971.html', 'local_shap_waterfall_idx_2034.png', 'gitingest_template.md', 'shap_summary_bar.png', 'local_shap_force_idx_2034.html', 'local_shap_force_idx_3676.html', 'shap_summary_beeswarm.png', 'selected_customers.csv']


In [13]:
import os
os.listdir(OUT_DIR)


['report.json',
 'local_shap_waterfall_idx_971.png',
 'xgb_pipeline.joblib',
 'local_shap_waterfall_idx_3676.png',
 'local_shap_force_idx_971.html',
 'local_shap_waterfall_idx_2034.png',
 'gitingest_template.md',
 'shap_summary_bar.png',
 'local_shap_force_idx_2034.html',
 'local_shap_force_idx_3676.html',
 'shap_summary_beeswarm.png',
 'selected_customers.csv']