# Model Evaluation - Customer Churn Prediction

This notebook provides a comprehensive evaluation of the trained churn prediction model, including performance metrics, error analysis, and a business impact assessment.

In [None]:
# Data handling and plotting libraries used by every cell below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Classification metrics and curve helpers for the evaluation cells.
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score, accuracy_score,
    precision_score, recall_score, f1_score
)
from sklearn.calibration import calibration_curve
# joblib loads the persisted model; json reads the metadata file and writes the results file.
import joblib
import json
import warnings
# Blanket filter: silences every warning category for the rest of the session,
# including any raised by the metric calls in later cells.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# Load test data and trained model
#
# Defines X_test, y_test, model_metadata and model for every later cell.
# A missing input file stops the notebook here instead of letting later cells
# fail with an unrelated NameError.
print("=== LOADING DATA AND MODEL ===")

try:
    # Load test data
    X_test = pd.read_csv('../data/processed/X_test.csv')
    y_test = pd.read_csv('../data/processed/y_test.csv').squeeze()

    # Load model metadata
    with open('../models/artifacts/model_metadata.json', 'r') as f:
        model_metadata = json.load(f)

    # The artifact file name is derived from the model name stored in the metadata.
    model_name = model_metadata['model_name'].lower().replace(' ', '_')
    model_path = f'../models/artifacts/best_churn_model_{model_name}.joblib'
    # NOTE(review): joblib.load unpickles arbitrary Python objects; only load
    # artifacts from a trusted source.
    model = joblib.load(model_path)

except FileNotFoundError as e:
    print(f"❌ Error loading files: {e}")
    print("Please run the previous notebooks first:")
    print("1. 02_feature_engineering.ipynb")
    print("2. 03_model_training.ipynb")
    # Re-raise: nothing below can run without the data and the model.
    raise

else:
    print(f"✅ Loaded model: {model_metadata['model_name']}")
    print(f"✅ Test data shape: {X_test.shape}")
    print(f"✅ Model type: {model_metadata['model_type']}")

In [None]:
# Generate predictions
print("=== GENERATING PREDICTIONS ===")

# Hard class labels from the model's own decision rule.
y_pred = model.predict(X_test)

# Column 1 of predict_proba is taken as the churn score.
# NOTE(review): assumes the positive (churn) class is the second entry of
# model.classes_ — confirm against the training notebook.
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

n_scored = len(y_test)
predicted_rate = y_pred.mean()
actual_rate = y_test.mean()
lowest_score = y_pred_proba.min()
highest_score = y_pred_proba.max()

print(f"Predictions generated for {n_scored} samples")
print(f"Predicted churn rate: {predicted_rate:.2%}")
print(f"Actual churn rate: {actual_rate:.2%}")
print(f"Prediction probability range: {lowest_score:.3f} - {highest_score:.3f}")

In [None]:
# Comprehensive performance metrics
print("=== PERFORMANCE METRICS ===")

# Label-based scores are computed from the hard predictions (y_pred).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Ranking-based scores are computed from the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_proba)
avg_precision = average_precision_score(y_test, y_pred_proba)

# One row per metric, in display order.
score_rows = [
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1),
    ('ROC-AUC', roc_auc),
    ('Avg Precision', avg_precision),
]
metrics_df = pd.DataFrame(score_rows, columns=['Metric', 'Score'])

print("Model Performance Metrics:")
print(metrics_df.round(4))

# Per-class breakdown of precision, recall and F1.
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Confusion Matrix Analysis
print("=== CONFUSION MATRIX ANALYSIS ===")

cm = confusion_matrix(y_test, y_pred)
# Rows are actual classes, columns are predicted classes.
(tn, fp), (fn, tp) = cm

for cell_name, cell_count in (
    ("True Negatives (TN)", tn),
    ("False Positives (FP)", fp),
    ("False Negatives (FN)", fn),
    ("True Positives (TP)", tp),
):
    print(f"{cell_name}: {cell_count}")

# Rates are taken relative to the number of actual negatives / positives.
actual_negatives = tn + fp
actual_positives = tp + fn
specificity = tn / actual_negatives
sensitivity = tp / actual_positives  # identical to recall
false_positive_rate = fp / actual_negatives
false_negative_rate = fn / actual_positives

print(f"\nSpecificity (True Negative Rate): {specificity:.4f}")
print(f"Sensitivity (True Positive Rate): {sensitivity:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")

# Heatmap of the raw counts.
class_labels = ['No Churn', 'Churn']
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [None]:
# ROC and Precision-Recall Curves
print("=== ROC AND PRECISION-RECALL CURVES ===")

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
roc_ax, pr_ax = axes

# Left panel: ROC curve against the chance diagonal.
fpr, tpr, roc_thresholds = roc_curve(y_test, y_pred_proba)
roc_ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_ax.grid(True)

# Right panel: precision-recall curve; the baseline is the positive-class rate.
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
positive_rate = y_test.mean()
pr_ax.plot(recall_curve, precision_curve, label=f'PR Curve (AP = {avg_precision:.3f})')
pr_ax.axhline(y=positive_rate, color='k', linestyle='--', label='Baseline')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()
pr_ax.grid(True)

plt.tight_layout()
plt.show()

# Youden's J statistic (TPR - FPR) is maximised to pick the operating point.
youden_j = tpr - fpr
optimal_idx = np.argmax(youden_j)
optimal_threshold = roc_thresholds[optimal_idx]
print(f"\nOptimal threshold (Youden's J): {optimal_threshold:.3f}")
print(f"At optimal threshold - TPR: {tpr[optimal_idx]:.3f}, FPR: {fpr[optimal_idx]:.3f}")

In [None]:
# Threshold Analysis
#
# Re-scores the test set at a grid of probability cut-offs and reports
# accuracy, precision, recall and F1 for each one.
print("=== THRESHOLD ANALYSIS ===")

# Cut-offs 0.1, 0.2, ..., 0.8. An explicit point count replaces a float-step
# arange, whose accumulated rounding produces values such as
# 0.30000000000000004, so ">= threshold" would skip a score of exactly 0.3.
thresholds = np.round(np.linspace(0.1, 0.8, 8), 2)
threshold_metrics = []

for threshold in thresholds:
    y_pred_thresh = (y_pred_proba >= threshold).astype(int)

    acc = accuracy_score(y_test, y_pred_thresh)
    # zero_division=0 makes the "no positive predictions" case an explicit 0
    # rather than relying on a warning that this notebook suppresses.
    prec = precision_score(y_test, y_pred_thresh, zero_division=0)
    rec = recall_score(y_test, y_pred_thresh, zero_division=0)
    f1_thresh = f1_score(y_test, y_pred_thresh, zero_division=0)

    threshold_metrics.append({
        'Threshold': threshold,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1_thresh
    })

threshold_df = pd.DataFrame(threshold_metrics)
print("Metrics by Threshold:")
print(threshold_df.round(3))

# Plot threshold analysis: one line per metric, each with its own marker.
plt.figure(figsize=(12, 6))
for metric_name, line_style in (
    ('Accuracy', 'o-'),
    ('Precision', 's-'),
    ('Recall', '^-'),
    ('F1-Score', 'd-'),
):
    plt.plot(threshold_df['Threshold'], threshold_df[metric_name], line_style, label=metric_name)
# Reference lines: the default 0.5 cut-off and the Youden's J optimum from the ROC cell.
plt.axvline(x=0.5, color='k', linestyle='--', alpha=0.5, label='Default (0.5)')
plt.axvline(x=optimal_threshold, color='r', linestyle='--', alpha=0.7, label=f'Optimal ({optimal_threshold:.2f})')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Model Performance by Classification Threshold')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Calibration Analysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

print("=== CALIBRATION ANALYSIS ===")

# Reliability curve over 10 probability bins.
fraction_of_positives, mean_predicted_value = calibration_curve(y_test, y_pred_proba, n_bins=10)

calib_fig, (calib_ax, dist_ax) = plt.subplots(1, 2, figsize=(10, 6))

# Left panel: observed positive fraction against mean predicted probability.
calib_ax.plot(mean_predicted_value, fraction_of_positives, "s-", label="Model")
calib_ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
calib_ax.set_xlabel("Mean Predicted Probability")
calib_ax.set_ylabel("Fraction of Positives")
calib_ax.set_title("Calibration Plot")
calib_ax.legend()
calib_ax.grid(True)

# Right panel: score distribution split by the true label.
for label_value, label_name in ((0, 'No Churn'), (1, 'Churn')):
    scores_for_label = y_pred_proba[y_test == label_value]
    dist_ax.hist(scores_for_label, bins=20, alpha=0.7, label=label_name, density=True)
dist_ax.set_xlabel('Predicted Probability')
dist_ax.set_ylabel('Density')
dist_ax.set_title('Prediction Probability Distribution')
dist_ax.legend()
dist_ax.grid(True)

plt.tight_layout()
plt.show()

# Calibration metrics
brier_score = brier_score_loss(y_test, y_pred_proba)
# Unweighted mean absolute gap between the curve and the diagonal.
mean_bin_gap = np.mean(np.abs(fraction_of_positives - mean_predicted_value))
print(f"Brier Score (lower is better): {brier_score:.4f}")
print(f"Calibration Error: {mean_bin_gap:.4f}")

In [None]:
# Error Analysis
#
# Compares the average numeric feature values of misclassified customers with
# those of everyone sharing the same true label.
print("=== ERROR ANALYSIS ===")


def _compare_group_means(subset, reference, subset_label, reference_label):
    """Return column means of `subset` beside those of `reference`.

    Args:
        subset: DataFrame of the rows under study (e.g. the false positives).
        reference: DataFrame of the group they are compared with.
        subset_label: column name for the subset means.
        reference_label: column name for the reference means.

    Returns:
        DataFrame indexed by column name with the two means and their
        difference (subset minus reference), rounded to 3 decimals.
    """
    subset_means = subset.mean()
    reference_means = reference.mean()
    return pd.DataFrame({
        subset_label: subset_means,
        reference_label: reference_means,
        'Difference': subset_means - reference_means
    }).round(3)


# Create error analysis dataframe
error_df = X_test.copy()
error_df['actual'] = y_test
error_df['predicted'] = y_pred
error_df['predicted_proba'] = y_pred_proba
error_df['correct'] = (y_test == y_pred)

# False Positives (predicted churn but didn't churn)
false_positives = error_df[(error_df['actual'] == 0) & (error_df['predicted'] == 1)]
print(f"False Positives: {len(false_positives)} ({len(false_positives)/len(error_df)*100:.1f}%)")

# False Negatives (predicted no churn but did churn)
false_negatives = error_df[(error_df['actual'] == 1) & (error_df['predicted'] == 0)]
print(f"False Negatives: {len(false_negatives)} ({len(false_negatives)/len(error_df)*100:.1f}%)")

# Analyze characteristics of errors
numerical_cols = error_df.select_dtypes(include=[np.number]).columns

# Each error type is reported on its own: having no false negatives must not
# hide the false-positive breakdown, and vice versa.
if len(false_positives) > 0:
    print("\nCharacteristics of False Positives (avg):")
    comparison_fp = _compare_group_means(
        false_positives[numerical_cols],
        error_df.loc[error_df['actual'] == 0, numerical_cols],
        'False_Positives',
        'All_Non_Churners',
    )
    print(comparison_fp.head(10))

if len(false_negatives) > 0:
    print("\nCharacteristics of False Negatives (avg):")
    comparison_fn = _compare_group_means(
        false_negatives[numerical_cols],
        error_df.loc[error_df['actual'] == 1, numerical_cols],
        'False_Negatives',
        'All_Churners',
    )
    print(comparison_fn.head(10))

In [None]:
# Business Impact Analysis
#
# Converts the confusion-matrix counts into dollar figures using the example
# parameters below. The model treats every correctly flagged churner as fully
# retained and charges one retention offer per flagged customer.
print("=== BUSINESS IMPACT ANALYSIS ===")

# Define business parameters (example values)
avg_customer_value = 1200  # Average annual revenue per customer
retention_cost = 100       # Cost to retain a customer
acquisition_cost = 300     # Cost to acquire a new customer (not used in the figures below)

# Calculate business metrics
total_customers = len(y_test)
actual_churners = y_test.sum()
predicted_churners = y_pred.sum()

# Confusion matrix values
# NOTE: false_positives / false_negatives are rebound here from the
# error-analysis DataFrames to plain counts.
true_positives = tp
false_positives = fp
false_negatives = fn
true_negatives = tn

# Business impact calculations
revenue_saved = true_positives * avg_customer_value  # Correctly identified churners
unnecessary_retention_cost = false_positives * retention_cost  # Wasted retention efforts
lost_revenue = false_negatives * avg_customer_value  # Missed churners
total_retention_cost = predicted_churners * retention_cost  # Covers TP and FP offers

# Net business impact
# total_retention_cost already contains the false-positive spend, so it is
# subtracted once; subtracting unnecessary_retention_cost too would double count.
net_impact = revenue_saved - total_retention_cost
baseline_loss = actual_churners * avg_customer_value  # Loss without any intervention
improvement = baseline_loss - (lost_revenue + total_retention_cost)

# ROI is undefined when nobody was flagged (zero retention spend).
if total_retention_cost > 0:
    roi_percentage = (improvement / total_retention_cost) * 100
else:
    roi_percentage = float('nan')

print(f"Business Impact Analysis:")
print(f"Total customers evaluated: {total_customers:,}")
print(f"Actual churners: {actual_churners:,}")
print(f"Predicted churners: {predicted_churners:,}")
print(f"\nFinancial Impact:")
print(f"Revenue saved (TP): ${revenue_saved:,.2f}")
print(f"Unnecessary retention costs (FP): ${unnecessary_retention_cost:,.2f}")
print(f"Lost revenue (FN): ${lost_revenue:,.2f}")
print(f"Total retention costs: ${total_retention_cost:,.2f}")
print(f"\nNet business improvement: ${improvement:,.2f}")
print(f"ROI of churn prediction: {roi_percentage:.1f}%")

# Create business impact visualization
# "Wasted Costs" is the false-positive share of "Retention Costs", shown
# separately for visibility; the two bars are not additive.
impact_data = {
    'Revenue Saved\n(True Positives)': revenue_saved,
    'Lost Revenue\n(False Negatives)': -lost_revenue,
    'Wasted Costs\n(False Positives)': -unnecessary_retention_cost,
    'Retention Costs': -total_retention_cost
}

plt.figure(figsize=(12, 6))
colors = ['green', 'red', 'orange', 'blue']
bars = plt.bar(list(impact_data.keys()), list(impact_data.values()), color=colors, alpha=0.7)
plt.title('Business Impact Analysis')
plt.ylabel('Financial Impact ($)')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)

# Add value labels on bars. The gap is given in screen points so the labels
# sit next to the bar ends whatever the dollar scale of the data.
for bar, value in zip(bars, impact_data.values()):
    height = bar.get_height()
    is_positive = height > 0
    plt.annotate(
        f'${value:,.0f}',
        xy=(bar.get_x() + bar.get_width() / 2., height),
        xytext=(0, 4 if is_positive else -4),
        textcoords='offset points',
        ha='center',
        va='bottom' if is_positive else 'top',
    )

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Model Evaluation Summary
#
# Collects the results computed in the earlier cells into one dictionary,
# writes it to ../models/evaluation/evaluation_results.json and prints a recap.
print("=== MODEL EVALUATION SUMMARY ===")

# ROI is undefined when no retention money was spent; it is stored as null.
if total_retention_cost > 0:
    roi_for_report = float(improvement / total_retention_cost * 100)
else:
    roi_for_report = None

# Values are cast to builtin int/float before serialisation. NumPy integers are
# not JSON-serialisable, so the default=str fallback below would otherwise
# write every dollar figure as a quoted string.
evaluation_summary = {
    'model_name': model_metadata['model_name'],
    'model_type': model_metadata['model_type'],
    'test_samples': len(y_test),
    'actual_churn_rate': float(y_test.mean()),
    'predicted_churn_rate': float(y_pred.mean()),
    'performance_metrics': {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'roc_auc': float(roc_auc),
        'average_precision': float(avg_precision),
        'brier_score': float(brier_score)
    },
    'confusion_matrix': {
        'true_negatives': int(tn),
        'false_positives': int(fp),
        'false_negatives': int(fn),
        'true_positives': int(tp)
    },
    'business_impact': {
        'revenue_saved': float(revenue_saved),
        'lost_revenue': float(lost_revenue),
        'unnecessary_costs': float(unnecessary_retention_cost),
        'total_retention_costs': float(total_retention_cost),
        'net_improvement': float(improvement),
        'roi_percentage': roi_for_report
    },
    # A non-finite threshold has no valid JSON number form, so it becomes null.
    'optimal_threshold': float(optimal_threshold) if np.isfinite(optimal_threshold) else None,
    'evaluation_date': pd.Timestamp.now().isoformat()
}

# Save evaluation results
import os
os.makedirs('../models/evaluation', exist_ok=True)

with open('../models/evaluation/evaluation_results.json', 'w') as f:
    # default=str stays as a last-resort fallback for metadata values of unknown type.
    json.dump(evaluation_summary, f, indent=2, default=str)

print("✅ Evaluation results saved to ../models/evaluation/evaluation_results.json")

# Final summary
roi_display = f"{roi_for_report:.1f}%" if roi_for_report is not None else "n/a"
print(f"\n🎯 FINAL EVALUATION SUMMARY")
print(f"Model: {model_metadata['model_name']}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Business ROI: {roi_display}")
print(f"\n🎉 Model evaluation completed successfully!")