# 07 - Model Evaluation

**Objective**: Evaluate all trained models comprehensively on the held-out test set.

**Metrics**:
- Confusion Matrix, Precision, Recall, F1
- ROC and Precision-Recall curves (with ROC-AUC and PR-AUC scores)
- Lift charts, threshold optimization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import (
    confusion_matrix, classification_report, 
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve,
    f1_score, precision_score, recall_score
)

# Apply one global matplotlib style so every figure below shares the same look.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in newer matplotlib
# releases (3.6+, I believe) - confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8-whitegrid')
print('Libraries loaded!')

In [None]:
# Input (model artifacts) and output (report figures/tables) directories.
MODEL_PATH = Path('../data/06_models')
REPORTING_PATH = Path('../data/08_reporting')
REPORTING_PATH.mkdir(parents=True, exist_ok=True)

# Load every persisted artifact in one pass; the order of the file names
# matches the order of the names being assigned.
champion, lr_model, xgb_model, lgb_model, scaler = (
    joblib.load(MODEL_PATH / artifact_file)
    for artifact_file in (
        'champion_model.pkl',
        'logistic_regression.pkl',
        'xgboost.pkl',
        'lightgbm.pkl',
        'scaler.pkl',
    )
)

# Test set plus the list of feature columns to select from it.
test_df = pd.read_csv(MODEL_PATH / 'test_set.csv')
FEATURES = json.loads((MODEL_PATH / 'feature_list.json').read_text())

# Split the test frame into the feature matrix and the target column.
TARGET = 'Churn'
X_test, y_test = test_df[FEATURES], test_df[TARGET]

print(f"Test samples: {len(X_test):,}")
print(f"Churn rate: {y_test.mean()*100:.2f}%")

## 1. Generate Predictions

In [None]:
# The logistic regression is scored on scaler-transformed features;
# the two tree models are scored on the raw feature frame.
X_test_scaled = scaler.transform(X_test)

# Model display name -> (fitted estimator, feature matrix to score it on).
models = {
    'Logistic Regression': (lr_model, X_test_scaled),
    'XGBoost': (xgb_model, X_test),
    'LightGBM': (lgb_model, X_test)
}

# Keep only column 1 of predict_proba for each model.
predictions = {
    model_name: estimator.predict_proba(features)[:, 1]
    for model_name, (estimator, features) in models.items()
}

print(" Predictions generated for all models")

## 2. ROC Curves

In [None]:
# One color per model, paired with `predictions` in insertion order.
colors = ['#3498db', '#e74c3c', '#2ecc71']

fig, ax = plt.subplots(figsize=(10, 8))

# One ROC curve per model, with its AUC shown in the legend entry.
for color, (name, proba) in zip(colors, predictions.items()):
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc = roc_auc_score(y_test, proba)
    ax.plot(fpr, tpr, color=color, linewidth=2, label=f'{name} (AUC={auc:.3f})')

# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random')

ax.set_title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='lower right')

fig.tight_layout()
fig.savefig(REPORTING_PATH / 'roc_curves.png', dpi=150)
plt.show()

## 3. Precision-Recall Curves

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# One precision-recall curve per model, with average precision in the legend.
for color, (name, proba) in zip(colors, predictions.items()):
    precision, recall, _ = precision_recall_curve(y_test, proba)
    ap = average_precision_score(y_test, proba)
    ax.plot(recall, precision, color=color, linewidth=2, label=f'{name} (AP={ap:.3f})')

# Horizontal reference line at the mean of y_test (the positive-class rate).
baseline = y_test.mean()
ax.axhline(y=baseline, linestyle='--', color='gray', label=f'Baseline ({baseline:.3f})')

ax.set_title('Precision-Recall Curves', fontsize=14, fontweight='bold')
ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend()

fig.tight_layout()
fig.savefig(REPORTING_PATH / 'pr_curves.png', dpi=150)
plt.show()

## 4. Champion Model Detailed Metrics

In [None]:
# Read the champion model's name from the training metadata and pick up
# the probabilities already computed for it.
metadata = json.loads((MODEL_PATH / 'training_metadata.json').read_text())
champion_name = metadata['champion_model']

# A KeyError here means the metadata name is not one of the `predictions` keys.
y_proba = predictions[champion_name]

print(f" CHAMPION MODEL: {champion_name}")
print("="*60)

In [None]:
# Threshold optimization: sweep decision thresholds for the champion model
# and keep the one with the highest F1.
# NOTE(review): the threshold is selected on the same test set that the metrics
# below are reported on, so those metrics are optimistically biased - consider
# tuning on a validation split instead.

# Round the grid to 2 decimals. np.arange with a float step can yield values
# such as 0.30000000000000004, which would make the exact-match .isin() filter
# below silently miss the 0.3 / 0.45 / ... rows.
thresholds = np.round(np.arange(0.1, 0.9, 0.05), 2)
threshold_results = []

for thresh in thresholds:
    y_pred = (y_proba >= thresh).astype(int)
    # zero_division=0 on every metric: at extreme thresholds there may be no
    # positive predictions, and the score should be 0 rather than a warning.
    threshold_results.append({
        'Threshold': thresh,
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1': f1_score(y_test, y_pred, zero_division=0)
    })

thresh_df = pd.DataFrame(threshold_results)

# Find optimal threshold (idxmax returns the first row on ties, i.e. the lowest threshold)
optimal_idx = thresh_df['F1'].idxmax()
optimal_threshold = thresh_df.loc[optimal_idx, 'Threshold']

print(f" THRESHOLD ANALYSIS:")
# Show a few reference thresholds plus the optimum; rounded the same way as
# the grid so the float comparison is exact.
highlight_thresholds = np.round([0.3, 0.4, 0.45, 0.5, optimal_threshold], 2)
display(thresh_df[thresh_df['Threshold'].isin(highlight_thresholds)])
print(f"\n Optimal threshold (max F1): {optimal_threshold:.2f}")

In [None]:
# Plot precision, recall and F1 against the decision threshold.
fig, ax = plt.subplots(figsize=(10, 6))

# (metric column, marker, extra line kwargs) - F1 is drawn with a thicker line.
metric_styles = (
    ('Precision', 'o', {}),
    ('Recall', 's', {}),
    ('F1', '^', {'linewidth': 2}),
)
for metric, marker, extra_kwargs in metric_styles:
    ax.plot(thresh_df['Threshold'], thresh_df[metric], label=metric, marker=marker, **extra_kwargs)

# Vertical marker at the F1-optimal threshold.
ax.axvline(x=optimal_threshold, linestyle='--', color='red', label=f'Optimal ({optimal_threshold:.2f})')

ax.set(xlabel='Threshold', ylabel='Score')
ax.set_title('Precision-Recall Trade-off', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.legend()

fig.tight_layout()
fig.savefig(REPORTING_PATH / 'threshold_analysis.png', dpi=150)
plt.show()

In [None]:
# Hard class predictions for the champion model at the F1-optimal threshold.
y_pred = (y_proba >= optimal_threshold).astype(int)
cm = confusion_matrix(y_test, y_pred)

# Display names for class 0 and class 1, shared by both heatmap axes and the report.
class_labels = ['Retained', 'Churned']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set(xlabel='Predicted', ylabel='Actual')
ax.set_title(f'Confusion Matrix (threshold={optimal_threshold:.2f})', fontsize=14, fontweight='bold')

fig.tight_layout()
fig.savefig(REPORTING_PATH / 'confusion_matrix.png', dpi=150)
plt.show()

print("\n CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred, target_names=class_labels))

## 5. Lift Chart

In [None]:
def calculate_lift(y_true, y_proba, n_bins=10):
    """Calculate lift per score quantile (deciles for the default ``n_bins=10``).

    Rows are ranked by predicted probability and the ranks are cut into
    ``n_bins`` quantile bins. Ranking first (ties broken by input order)
    keeps the bins equal-sized even when many rows share the same
    probability; cutting the raw probabilities instead would collapse
    duplicate bin edges and return fewer, unequal bins.

    Args:
        y_true: 1-D array-like of labels. They are summed to count positives,
            so they are expected to be 0/1.
        y_proba: 1-D array-like of predicted scores, same length as ``y_true``.
        n_bins: Number of quantile bins to split the ranked rows into.

    Returns:
        pandas.DataFrame with one row per bin, ordered from the highest-scored
        bin to the lowest, with columns ``decile`` (0 = lowest scores),
        ``count``, ``churners``, ``avg_proba``, ``churn_rate`` (percent) and
        ``lift`` (bin churn rate divided by the overall rate).
    """
    y_true = np.asarray(y_true)
    df = pd.DataFrame({'actual': y_true, 'proba': np.asarray(y_proba)})

    # Cut on unique ranks rather than raw scores so tied probabilities cannot
    # merge bins; without ties this yields the same assignment as before.
    score_rank = df['proba'].rank(method='first')
    df['decile'] = pd.qcut(score_rank, n_bins, labels=False, duplicates='drop')

    lift_df = df.groupby('decile').agg(
        count=('actual', 'count'),
        churners=('actual', 'sum'),
        avg_proba=('proba', 'mean')
    ).reset_index()

    # Lift = positive rate inside the bin relative to the overall positive rate.
    lift_df['churn_rate'] = lift_df['churners'] / lift_df['count'] * 100
    baseline_rate = y_true.mean() * 100
    lift_df['lift'] = lift_df['churn_rate'] / baseline_rate

    # Highest-scored bin first, so row 0 is the top quantile.
    return lift_df.sort_values('decile', ascending=False)

# Lift table for the champion model, using the default number of bins.
lift_df = calculate_lift(y_true=y_test.values, y_proba=y_proba)

print(" LIFT BY DECILE:")
display(lift_df)

In [None]:
# Bar chart of lift per bin; lift_df is ordered highest-risk bin first,
# so bar 1 is the top-scored group.
lift_values = lift_df['lift'].values
deciles = range(1, len(lift_df) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(deciles, lift_values, color='#3498db', edgecolor='black')
ax.axhline(y=1.0, linestyle='--', color='red', label='Baseline')

ax.set(xlabel='Decile (1 = highest risk)', ylabel='Lift')
ax.set_title('Lift Chart by Decile', fontsize=14, fontweight='bold')
ax.legend()

# Write each bar's lift value just above the bar.
for position, value in zip(deciles, lift_values):
    ax.text(position, value+0.1, f'{value:.2f}', ha='center')

fig.tight_layout()
fig.savefig(REPORTING_PATH / 'lift_chart.png', dpi=150)
plt.show()

print(f"\n TOP DECILE LIFT: {lift_values[0]:.2f}x")

## 6. Precision @ Top 20%

In [None]:
# Precision among the 20% of test rows with the highest champion scores.
n_top20 = int(len(y_test) * 0.20)

# Positions of all rows ordered by descending score, truncated to the top 20%.
descending_order = np.argsort(y_proba)[::-1]
top20_idx = descending_order[:n_top20]

# Share of actual positives inside that top slice.
precision_top20 = y_test.iloc[top20_idx].mean()
overall_rate = y_test.mean()

print(f" PRECISION @ TOP 20%: {precision_top20*100:.2f}%")
print(f"   (Baseline churn rate: {overall_rate*100:.2f}%)")
print(f"   Improvement: {(precision_top20 / overall_rate):.2f}x over random")

## 7. Final Model Comparison Table

In [None]:
# Build the side-by-side test-set comparison of all models.
def _comparison_row(name, proba):
    """Return one comparison-table row for a model's test-set probabilities."""
    # Threshold-dependent metrics use a fixed 0.45 cut-off for every model.
    # NOTE(review): this is not the F1-optimal threshold computed earlier -
    # confirm that 0.45 is the intended operating point.
    pred = (proba >= 0.45).astype(int)
    return {
        'Model': name,
        'ROC-AUC': roc_auc_score(y_test, proba),
        'PR-AUC': average_precision_score(y_test, proba),
        'Precision': precision_score(y_test, pred),
        'Recall': recall_score(y_test, pred),
        'F1': f1_score(y_test, pred)
    }


comparison = [_comparison_row(name, proba) for name, proba in predictions.items()]
comparison_df = pd.DataFrame(comparison).round(4)

print(" FINAL MODEL COMPARISON (Test Set):")
print("="*80)
display(comparison_df)

# Persist the table next to the report figures.
comparison_df.to_csv(REPORTING_PATH / 'model_comparison.csv', index=False)
print(f"\n Saved: model_comparison.csv")

In [None]:
# Final recap of the champion model's headline numbers.
separator = "="*60

# Comparison-table row(s) belonging to the champion model.
champion_rows = comparison_df.loc[comparison_df['Model'] == champion_name]
champion_roc_auc = champion_rows['ROC-AUC'].values[0]
champion_pr_auc = champion_rows['PR-AUC'].values[0]

# First row of lift_df is the highest-scored bin.
top_bin_lift = lift_df['lift'].values[0]

print("\n" + separator)
print(" EVALUATION COMPLETE")
print(separator)
print(f"\n Champion: {champion_name}")
print(f"   ROC-AUC: {champion_roc_auc:.4f}")
print(f"   PR-AUC:  {champion_pr_auc:.4f}")
print(f"   Lift@Top10%: {top_bin_lift:.2f}x")
print(f"   Precision@Top20%: {precision_top20*100:.2f}%")
print("\n NEXT: Proceed to 08_Interpretation.ipynb")
print(separator)