# Churn Prediction Model
## Notebook 04: LightGBM/XGBoost Implementation

This notebook implements advanced churn prediction using gradient boosting algorithms.
The focus is on business impact: identifying at-risk customers for proactive retention.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import lightgbm as lgb
import xgboost as xgb
import shap
import warnings
# Install an "ignore" filter that matches every warning category, so no
# warnings are shown by the cells that follow.
# NOTE(review): blanket suppression also hides warnings that may matter
# (deprecations, dtype issues) -- consider narrowing by category or module.
warnings.filterwarnings(action='ignore', category=Warning)

print("Libraries imported successfully!")

In [None]:
# Read the master dataset (path is relative to the notebook) and report its size
df = pd.read_csv('../data/raw/master_dataset.csv')
print(f"Dataset shape: {df.shape}")

# --- Engineered feature 1: risk_score --------------------------------------
# Weighted sum (0.4 / 0.3 / 0.3) of inverted satisfaction, complaint count
# and late-payment count.
# NOTE(review): `10 - satisfaction_score` presumes a 0-10 scale -- confirm.
dissatisfaction = 10 - df['satisfaction_score']
df['risk_score'] = (
    dissatisfaction * 0.4
    + df['num_complaints_12m'] * 0.3
    + df['late_payments_12m'] * 0.3
)

# --- Engineered feature 2: usage_efficiency --------------------------------
# Average of the data and minutes usage-to-allowance ratios. The 1e-6 term
# keeps the denominator non-zero when an allowance is 0.
data_ratio = df['monthly_data_gb'] / (df['data_allowance_gb'] + 1e-6)
minutes_ratio = df['monthly_minutes'] / (df['minutes_allowance'] + 1e-6)
df['usage_efficiency'] = (data_ratio + minutes_ratio) / 2

# --- Engineered feature 3: value_score -------------------------------------
# ARPU quintile encoded as an integer, 1 = lowest fifth, 5 = highest fifth.
# NOTE(review): qcut raises if quantile edges are not unique, and astype(int)
# raises on missing ARPU -- verify neither occurs in this dataset.
arpu_quintile = pd.qcut(df['arpu'], q=5, labels=[1,2,3,4,5])
df['value_score'] = arpu_quintile.astype(int)

# Share of each class in the target column
print(f"Target variable (churn) distribution:")
print(df['churned'].value_counts(normalize=True))

In [None]:
# Model inputs, assembled from per-theme groups. The concatenation order is
# the column order of X, so the groups must stay in this sequence.
feature_cols = (
    # customer profile / account
    ['age', 'annual_income', 'tenure_months', 'arpu', 'satisfaction_score']
    # service usage
    + ['monthly_data_gb', 'monthly_minutes', 'ott_usage_hours']
    # digital channel activity
    + ['monthly_web_sessions', 'monthly_app_sessions', 'self_service_transactions']
    # complaints, support and billing issues over the last 12 months
    + ['num_complaints_12m', 'support_tickets_12m', 'late_payments_12m']
    # marketing exposure and response
    + ['campaigns_exposed', 'total_clicks', 'total_conversions']
    # features engineered earlier in this notebook
    + ['risk_score', 'usage_efficiency', 'value_score']
)

# Feature matrix and binary target
X = df[feature_cols]
y = df['churned']

# 80/20 hold-out split, seeded for reproducibility and stratified on the
# target so both partitions keep the same churn rate.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sanity check: partition sizes and churn rates
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training churn rate: {y_train.mean():.2%}")
print(f"Test churn rate: {y_test.mean():.2%}")

In [None]:
# Train LightGBM model
print("Training LightGBM Churn Prediction Model...")

# Gradient-boosted tree classifier. random_state fixes the seed so reruns
# give the same model; verbose=-1 silences LightGBM's training log.
lgb_model = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    # LightGBM only performs row bagging when subsample_freq (bagging_freq)
    # is non-zero; its default is 0, which disables bagging and makes
    # subsample=0.8 a silent no-op. Re-sample rows at every iteration.
    subsample_freq=1,
    random_state=42,
    verbose=-1
)

lgb_model.fit(X_train, y_train)

# Predictions on the hold-out set: hard labels for the classification report,
# positive-class probabilities (column 1) for ranking metrics and thresholds.
y_pred_lgb = lgb_model.predict(X_test)
y_prob_lgb = lgb_model.predict_proba(X_test)[:, 1]

# Evaluation: AUC is computed from probabilities, the report from hard labels
auc_lgb = roc_auc_score(y_test, y_prob_lgb)
print(f"\nLightGBM AUC Score: {auc_lgb:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lgb))

In [None]:
# Pair each input column with the importance value stored on the fitted
# model, then order the table from most to least important.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': lgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

# Horizontal bar chart of the 15 highest-ranked features, drawn on an
# explicitly created Axes instead of the implicit pyplot "current" figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=feature_importance.head(15), x='importance', y='feature', ax=ax)
ax.set_title('Top 15 Feature Importances - Churn Prediction')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()

In [None]:
# Translate model scores into a rough revenue-retention estimate
print("=== BUSINESS IMPACT ANALYSIS ===")

# Flag test customers whose churn probability is at or above the 80th
# percentile of all test-set scores (roughly the top 20%; ties at the
# cut-off are all included).
high_risk_threshold = np.percentile(y_prob_lgb, 80)
high_risk_customers = y_prob_lgb >= high_risk_threshold

print(f"High-risk threshold: {high_risk_threshold:.3f}")
print(f"High-risk customers identified: {high_risk_customers.sum():,}")

# ARPU of the test customers, then of the flagged subset. The boolean mask
# is a NumPy array, so it selects rows by position.
test_arpu = df['arpu'].loc[X_test.index]
high_risk_arpu = test_arpu[high_risk_customers]
high_risk_revenue = high_risk_arpu.sum()
avg_high_risk_arpu = high_risk_arpu.mean()

# Scenario assumption: the intervention retains 25% of the flagged customers.
# NOTE(review): this multiplies the *whole* flagged group by 25%, i.e. it does
# not scale by how many flagged customers actually churn -- confirm this is
# the intended reading of "25% churn reduction".
churn_reduction = 0.25
customers_saved = high_risk_customers.sum() * churn_reduction
monthly_savings = customers_saved * avg_high_risk_arpu
annual_savings = monthly_savings * 12  # monthly figure extrapolated to 12 months

print(f"\nPotential Business Impact:")
print(f"Customers potentially saved: {customers_saved:.0f}")
print(f"Average ARPU of high-risk customers: ${avg_high_risk_arpu:.2f}")
print(f"Monthly revenue savings: ${monthly_savings:,.2f}")
print(f"Annual revenue savings: ${annual_savings:,.2f}")

# Return on investment: net gain over the assumed programme cost, in percent
implementation_cost = 50000
net_gain = annual_savings - implementation_cost
roi = net_gain / implementation_cost * 100
print(f"Estimated ROI: {roi:.1f}%")

In [None]:
# Two side-by-side diagnostics: ROC curve (left) and score distribution (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
roc_ax, dist_ax = axes

# Left panel: model ROC curve against the diagonal chance line
fpr, tpr, _ = roc_curve(y_test, y_prob_lgb)
roc_ax.plot(fpr, tpr, label=f'LightGBM (AUC = {auc_lgb:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_title('ROC Curve - Churn Prediction')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
roc_ax.grid(True)

# Right panel: density-normalised histograms of predicted probability, one
# per actual class, with the high-risk cut-off marked as a vertical line.
for churn_flag, class_label in ((0, 'Not Churned'), (1, 'Churned')):
    dist_ax.hist(y_prob_lgb[y_test==churn_flag], bins=50, alpha=0.7, label=class_label, density=True)
dist_ax.axvline(high_risk_threshold, color='red', linestyle='--', label='High Risk Threshold')
dist_ax.set_title('Churn Probability Distribution')
dist_ax.set_xlabel('Churn Probability')
dist_ax.set_ylabel('Density')
dist_ax.legend()
dist_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Customer risk segmentation for business action
print("=== CUSTOMER RISK SEGMENTATION ===")

# Segment boundaries on predicted churn probability:
#   Low Risk    [0.0, 0.3]
#   Medium Risk (0.3, 0.7]
#   High Risk   (0.7, 1.0]
risk_thresholds = [0, 0.3, 0.7, 1.0]
risk_labels = ['Low Risk', 'Medium Risk', 'High Risk']

# Work on a copy of the test features so X_test itself is left untouched
test_data = X_test.copy()
test_data['churn_probability'] = y_prob_lgb
test_data['actual_churn'] = y_test
test_data['arpu'] = df.loc[X_test.index, 'arpu']
# include_lowest=True closes the first interval on the left. Without it a
# probability of exactly 0.0 falls outside (0, 0.3] and is labelled NaN,
# which drops that customer from the table below and from the saved CSV.
test_data['risk_segment'] = pd.cut(
    y_prob_lgb,
    bins=risk_thresholds,
    labels=risk_labels,
    include_lowest=True
)

# Risk segment analysis: mean predicted probability, customer count, observed
# churn rate and mean ARPU per segment. observed=False is passed explicitly
# so that all three segments appear (even if empty) regardless of the
# installed pandas version's default for categorical groupers.
risk_analysis = test_data.groupby('risk_segment', observed=False).agg({
    'churn_probability': 'mean',
    'actual_churn': ['count', 'mean'],
    'arpu': 'mean'
}).round(3)

# Flatten the two-level column index produced by agg() into readable names
risk_analysis.columns = ['Avg Churn Prob', 'Customer Count', 'Actual Churn Rate', 'Avg ARPU']
print(risk_analysis)

# Recommended actions by segment
print("\n=== RECOMMENDED ACTIONS ===")
print("Low Risk Customers:")
print("- Focus on upselling and cross-selling")
print("- Maintain satisfaction through quality service")
print("- Monitor for any changes in behavior")

print("\nMedium Risk Customers:")
print("- Proactive engagement and check-ins")
print("- Address specific pain points")
print("- Offer personalized incentives")

print("\nHigh Risk Customers:")
print("- Immediate intervention required")
print("- Dedicated customer success manager")
print("- Retention campaigns and offers")
print("- Root cause analysis for dissatisfaction")

In [None]:
# Save model and results
import joblib
import os

# Create both output directories up front. Only the models directory used to
# be created, so writing the predictions CSV failed with an OSError whenever
# ../data/processed did not already exist.
os.makedirs('../data/models', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)

# Save model
joblib.dump(lgb_model, '../data/models/churn_prediction_model.pkl')
print("Model saved to ../data/models/churn_prediction_model.pkl")

# Save predictions: one row per test customer. The Series columns share
# X_test's index; the NumPy arrays are in the same row order as X_test.
predictions_df = pd.DataFrame({
    'customer_id': df.loc[X_test.index, 'customer_id'],
    'churn_probability': y_prob_lgb,
    'predicted_churn': y_pred_lgb,
    'actual_churn': y_test,
    'arpu': test_data['arpu'],
    'risk_segment': test_data['risk_segment']
})

predictions_df.to_csv('../data/processed/churn_predictions.csv', index=False)
print("Predictions saved to ../data/processed/churn_predictions.csv")

# Summary metrics, reusing values computed in earlier cells
print(f"\n=== FINAL MODEL SUMMARY ===")
print(f"Model: LightGBM Gradient Boosting")
print(f"AUC Score: {auc_lgb:.4f}")
print(f"High-Risk Customers: {high_risk_customers.sum():,}")
print(f"Potential Annual Savings: ${annual_savings:,.2f}")
print(f"Estimated ROI: {roi:.1f}%")
print(f"Model Status: Ready for Production Deployment")