# Model Monitoring & Drift Detection

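This notebook compares new production data against the training baseline to detect data drift (two-sample Kolmogorov–Smirnov tests and overlaid histograms on the numerical features) and loads the trained model as a starting point for performance monitoring.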

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import joblib
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown after this line.
# NOTE(review): presumably meant to keep notebook output clean, but it also hides
# deprecation and numerical warnings from the libraries imported above —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

print("Model monitoring libraries loaded!")

In [None]:
# Load training and new data
print("=== LOADING DATA ===")

# NOTE(review): the baseline is read from the *processed* file while the "new"
# data is read from the *raw* file. If preprocessing rescales or encodes the
# monitored columns, the comparison below will report drift that is really a
# preprocessing difference — confirm both files use the same representation.
try:
    # Load training data (baseline)
    baseline_data = pd.read_csv('../data/processed/customer_churn_processed.csv')

    # Stand-in for production data: the raw churn file is read to simulate it
    new_data = pd.read_csv('../data/raw/customer_churn.csv')
except FileNotFoundError:
    print("❌ Data files not found. Run previous notebooks first.")
    # Re-raise so execution stops in this cell. Continuing would either fail
    # later with an unrelated NameError or silently reuse stale
    # baseline_data / new_data left in the kernel by an earlier run.
    raise
else:
    # Only report shapes once both files have loaded successfully
    print(f"Baseline data: {baseline_data.shape}")
    print(f"New data: {new_data.shape}")

In [None]:
# Data drift detection
# Runs a two-sample Kolmogorov-Smirnov test per numerical feature, comparing
# the baseline distribution with the new data, and collects the results in
# drift_df (one row per tested feature).
print("=== DATA DRIFT DETECTION ===")

numerical_cols = ['tenure', 'monthly_charges', 'total_charges']
# Significance level: a feature is flagged as drifted when p-value < DRIFT_ALPHA
DRIFT_ALPHA = 0.05
drift_results = []

for col in numerical_cols:
    if col not in baseline_data.columns or col not in new_data.columns:
        # A missing feature is itself worth surfacing in a monitoring run
        print(f"{col}: skipped (column missing from baseline or new data)")
        continue

    # Coerce to numeric and drop missing values before testing: raw CSV
    # columns may contain blanks or text, and NaN/non-numeric entries would
    # invalidate (or break) the KS test.
    baseline_values = pd.to_numeric(baseline_data[col], errors='coerce').dropna()
    new_values = pd.to_numeric(new_data[col], errors='coerce').dropna()
    if baseline_values.empty or new_values.empty:
        # ks_2samp cannot run on an empty sample
        print(f"{col}: skipped (no numeric values to compare)")
        continue

    # KS test for drift
    ks_stat, p_value = stats.ks_2samp(baseline_values, new_values)
    drift_detected = bool(p_value < DRIFT_ALPHA)

    drift_results.append({
        'feature': col,
        'ks_statistic': ks_stat,
        'p_value': p_value,
        'drift_detected': drift_detected,
    })

    print(f"{col}: KS={ks_stat:.4f}, p-value={p_value:.4f}, Drift={'YES' if drift_detected else 'NO'}")

# Explicit columns keep the frame well-formed (and the summary below working)
# even when no feature could be tested.
drift_df = pd.DataFrame(
    drift_results,
    columns=['feature', 'ks_statistic', 'p_value', 'drift_detected'],
)
print(f"\nFeatures with drift: {int(drift_df['drift_detected'].sum())}")

In [None]:
# Performance monitoring visualization
print("=== PERFORMANCE MONITORING ===")

# Load the persisted model. joblib artifacts are pickle-based, so this path
# should only ever point at a trusted file.
try:
    model = joblib.load('../models/artifacts/best_churn_model_random_forest.joblib')
except FileNotFoundError:
    print("Model not found. Train model first.")
else:
    # Ground-truth labels exist only when the new data carries a 'churn' column
    if 'churn' in new_data.columns:
        y_true = new_data['churn']
        # Note: In practice, you'd preprocess new_data same as training data
        # before scoring it with the model; no predictions are made here.
        print("Model performance monitoring ready!")

In [None]:
# Drift visualization
# Overlays baseline and new-data histograms for each monitored numerical
# feature on a 2x2 grid so distribution shifts can be inspected visually.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

# zip() stops at the shorter input, so at most len(axes) features are drawn
for ax, col in zip(axes, numerical_cols):
    if col not in baseline_data.columns or col not in new_data.columns:
        # Nothing to draw for this feature; don't leave an empty framed panel
        ax.set_visible(False)
        continue

    # Drop missing / non-numeric entries, which would make hist() fail
    base_vals = pd.to_numeric(baseline_data[col], errors='coerce').dropna()
    fresh_vals = pd.to_numeric(new_data[col], errors='coerce').dropna()
    if base_vals.empty or fresh_vals.empty:
        ax.set_visible(False)
        continue

    # Use one set of bin edges for both samples: with independently computed
    # bins the bars do not line up and the overlay cannot be compared.
    bin_edges = np.histogram_bin_edges(
        np.concatenate([base_vals.to_numpy(), fresh_vals.to_numpy()]),
        bins=30,
    )
    ax.hist(base_vals, alpha=0.7, label='Baseline', bins=bin_edges, density=True)
    ax.hist(fresh_vals, alpha=0.7, label='New Data', bins=bin_edges, density=True)
    ax.set_title(f'{col} Distribution')
    ax.legend()
    ax.grid(True)

# Hide grid panels that have no feature assigned to them
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

print("✅ Monitoring analysis complete!")