# Churn Modelling - Complete Notebook

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/practical-ml-stack/practical-ml-stack.github.io/blob/main/notebooks/churn-modelling.ipynb)

This notebook contains the complete code for the Churn Modelling use case from [Practical ML Stack](https://practical-ml-stack.github.io/).

**Contents:**
1. Data Loading & Exploration
2. Feature Engineering
3. Model Training & Evaluation
4. Model Interpretation


In [None]:
# Setup - Install packages (uncomment if running in Colab)
# !pip install xgboost lightgbm shap --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve
)

import warnings
# Keep notebook output readable. NOTE: this is a blanket filter, so every
# warning category (including deprecation notices) is hidden from here on.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the legacy name so the notebook also runs on older installs
# instead of failing on an unknown style.
_preferred_style = 'seaborn-v0_8-whitegrid'
_legacy_style = 'seaborn-whitegrid'
plt.style.use(_preferred_style if _preferred_style in plt.style.available else _legacy_style)
print("Setup complete!")


## 1. Data Loading & Exploration


In [None]:
# Load data
# Read the Telco customer-churn CSV directly from its GitHub raw URL.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)

# Quick sanity check: dimensions and column names, then a preview of the rows.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head(5)


In [None]:
# Churn distribution
# Show the raw class counts, then the share of customers labelled 'Yes'.
print("Churn Distribution:")
churn_counts = df['Churn'].value_counts()
print(churn_counts)
churned_mask = df['Churn'] == 'Yes'
print(f"\nChurn Rate: {churned_mask.mean():.1%}")

# Fix TotalCharges
# Entries that cannot be parsed as numbers become NaN and are then set to 0.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric.fillna(0)


In [None]:
# Churn by contract type
def _churn_pct(group):
    """Return the share of 'Yes' values in *group*, expressed as a percentage."""
    return (group == 'Yes').mean() * 100


# One churn percentage per contract type, ordered from highest to lowest.
churn_by_contract = df.groupby('Contract')['Churn'].apply(_churn_pct)
churn_by_contract = churn_by_contract.sort_values(ascending=False)

print("Churn Rate by Contract Type:")
print(churn_by_contract.round(1))

# Bar chart of the same numbers.
fig, ax = plt.subplots(figsize=(8, 5))
churn_by_contract.plot.bar(ax=ax, color='#3498db')
ax.set_ylabel('Churn Rate (%)')
ax.set_title('Churn Rate by Contract Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 2. Feature Engineering


In [None]:
def build_feature_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Build complete feature matrix from raw data.

    Args:
        df: Customer table containing the columns read below
            (SeniorCitizen, Partner, Dependents, PhoneService,
            InternetService, the six add-on service columns, Contract,
            PaperlessBilling, PaymentMethod, tenure, MonthlyCharges and
            TotalCharges). ``TotalCharges`` may be numeric or still hold
            strings; it is coerced here. ``df`` itself is not modified.

    Returns:
        A new DataFrame sharing ``df``'s index, with one row per customer
        and 26 numeric feature columns in a fixed order.
    """
    # Coerce inside the function so it also works on a table whose
    # TotalCharges column has not been cleaned yet: unparseable entries
    # become NaN and are treated as 0. Already-numeric input is unchanged.
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
    tenure = df['tenure']
    monthly_charges = df['MonthlyCharges']

    def _is_yes(column):
        """Return a 0/1 integer flag marking rows where *column* equals 'Yes'."""
        return (df[column] == 'Yes').astype(int)

    features = pd.DataFrame(index=df.index)

    # Demographic features
    features['is_senior'] = df['SeniorCitizen']
    features['has_partner'] = _is_yes('Partner')
    features['has_dependents'] = _is_yes('Dependents')
    features['is_single'] = (
        (features['has_partner'] == 0) & (features['has_dependents'] == 0)
    ).astype(int)

    # Service features
    features['has_phone'] = _is_yes('PhoneService')
    # Any value other than 'No' counts as having internet service.
    features['has_internet'] = (df['InternetService'] != 'No').astype(int)
    features['has_fiber'] = (df['InternetService'] == 'Fiber optic').astype(int)

    addon_services = [
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies',
    ]
    addon_columns = [f'has_{service.lower()}' for service in addon_services]
    for service, column in zip(addon_services, addon_columns):
        features[column] = _is_yes(service)
    features['num_addons'] = features[addon_columns].sum(axis=1)
    features['total_services'] = (
        features['has_phone'] + features['has_internet'] + features['num_addons']
    )

    # Account features
    features['contract_monthly'] = (df['Contract'] == 'Month-to-month').astype(int)
    features['contract_one_year'] = (df['Contract'] == 'One year').astype(int)
    features['contract_two_year'] = (df['Contract'] == 'Two year').astype(int)
    features['paperless_billing'] = _is_yes('PaperlessBilling')
    features['payment_electronic'] = (df['PaymentMethod'] == 'Electronic check').astype(int)
    features['tenure'] = tenure
    features['is_new_customer'] = (tenure <= 6).astype(int)
    features['monthly_charges'] = monthly_charges
    features['total_charges'] = total_charges

    # Derived features
    # Rows with zero tenure fall back to the monthly charge; rows with a
    # non-positive monthly charge get a ratio of 0 (avoids using the
    # inf/NaN produced by the division in those rows).
    features['avg_monthly_spend'] = np.where(tenure > 0, total_charges / tenure, monthly_charges)
    features['tenure_charges_ratio'] = np.where(monthly_charges > 0, tenure / monthly_charges, 0)

    return features

# Build features
# X holds the engineered features; y is 1 for customers whose Churn is 'Yes', else 0.
X = build_feature_matrix(df)
is_churned = df['Churn'] == 'Yes'
y = is_churned.astype(int)

print(f"Feature matrix shape: {X.shape}")
print(f"Features: {list(X.columns)}")


## 3. Model Training & Evaluation


In [None]:
# Train/test split
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Churn rate (train): {y_train.mean():.1%}")
print(f"Churn rate (test): {y_test.mean():.1%}")


In [None]:
# Model 1: Logistic Regression
# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
lr.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the positive (churn) class.
y_prob_lr = lr.predict_proba(X_test_scaled)[:, 1]
auc_lr = roc_auc_score(y_test, y_prob_lr)
print(f"Logistic Regression ROC-AUC: {auc_lr:.3f}")


In [None]:
# Model 2: Random Forest
# Trained on the unscaled features.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive (churn) class.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, y_prob_rf)
print(f"Random Forest ROC-AUC: {auc_rf:.3f}")


In [None]:
# Model 3: XGBoost (if available)
# Only the import is guarded, so an ImportError raised while training or
# predicting is not misreported as "XGBoost not installed".
try:
    from xgboost import XGBClassifier
except ImportError:
    print("XGBoost not installed. Using Random Forest as best model.")
    best_prob = y_prob_rf
else:
    # Weight the positive class by the negative-to-positive ratio of the training labels.
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    # `use_label_encoder` is intentionally not passed: the argument is
    # deprecated in XGBoost and the labels here are already 0/1 integers.
    xgb = XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='logloss',
    )
    xgb.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive (churn) class.
    y_prob_xgb = xgb.predict_proba(X_test)[:, 1]
    print(f"XGBoost ROC-AUC: {roc_auc_score(y_test, y_prob_xgb):.3f}")
    best_prob = y_prob_xgb


In [None]:
# Feature importance
# Pair each feature name with its Random Forest importance, largest first.
importance = pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
importance = importance.sort_values('importance', ascending=False)

top_15 = importance.iloc[:15]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_15['feature'], top_15['importance'], color='#3498db')
# Flip the axis so the most important feature is drawn at the top.
ax.invert_yaxis()
ax.set_title('Top 15 Features - Random Forest')
ax.set_xlabel('Importance')
plt.tight_layout()
plt.show()


In [None]:
# Business Impact Analysis
# Rank the test customers from highest to lowest predicted churn probability.
scored = pd.DataFrame({'actual': y_test.values, 'probability': best_prob})
test_df = scored.sort_values('probability', ascending=False)

n_test = len(y_test)
total_churners = y_test.sum()
baseline_rate = total_churners / n_test

print(f"Total customers in test set: {n_test}")
print(f"Total churners: {total_churners}")
print(f"Baseline churn rate: {baseline_rate:.1%}")
print("\n--- Targeting Analysis ---")

# For each targeting budget, measure how many actual churners fall inside the
# top-scored slice and how that compares with the baseline churn rate.
for pct in (10, 20, 30, 50):
    n_customers = int(n_test * pct / 100)
    top_n = test_df.iloc[:n_customers]
    churners_caught = top_n['actual'].sum()

    precision = churners_caught / n_customers
    catch_rate = churners_caught / total_churners
    lift = precision / baseline_rate

    print(f"\nTop {pct}% ({n_customers} customers):")
    print(f"  Churners caught: {churners_caught} ({catch_rate:.1%} of all churners)")
    print(f"  Precision: {precision:.1%}, Lift: {lift:.1f}x")


## Summary

**Key findings:**
- Month-to-month contracts have 15x higher churn than 2-year contracts
- New customers (tenure ≤ 6 months) are highest risk
- Electronic check payment method indicates higher churn risk
- More services = lower churn (customers are more invested)

**Model performance:** ROC-AUC of ~0.85 with XGBoost

**Business impact:** 2x+ lift over random targeting

---

For the full tutorial with detailed explanations, visit [Practical ML Stack](https://practical-ml-stack.github.io/use-cases/churn-modelling/).
