# Model Training - Customer Churn Prediction

This notebook covers model training, hyperparameter tuning, and model selection for customer churn prediction.

In [None]:
import json
import os
import tempfile
import warnings

import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.svm import SVC
# Silence every Python warning for the rest of the session so that library
# notices do not clutter the cell outputs.
# NOTE(review): this also hides convergence and deprecation warnings.
warnings.filterwarnings('ignore')

# MLflow configuration: runs are recorded under the relative path below and
# grouped in a single named experiment.
MLFLOW_TRACKING_URI = './mlruns'
MLFLOW_EXPERIMENT_NAME = 'churn_prediction_notebook'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

print("Libraries imported successfully!")

In [None]:
# Load the processed train/test splits from ../data/processed.
#
# If any file is missing, guidance is printed and the FileNotFoundError is
# re-raised: every later cell needs X_train/X_test/y_train/y_test, so letting
# the notebook continue would only surface as a confusing NameError (or, on a
# reused kernel, silently run against stale variables).
print("=== LOADING PROCESSED DATA ===")

try:
    X_train = pd.read_csv('../data/processed/X_train.csv')
    X_test = pd.read_csv('../data/processed/X_test.csv')
    # squeeze("columns") turns a single-column frame into a Series but, unlike
    # a bare squeeze(), never collapses a one-row file down to a scalar.
    y_train = pd.read_csv('../data/processed/y_train.csv').squeeze("columns")
    y_test = pd.read_csv('../data/processed/y_test.csv').squeeze("columns")
except FileNotFoundError:
    print("❌ Processed data not found. Please run 02_feature_engineering.ipynb first.")
    print("Or run the data generation and preprocessing:")
    print("1. python scripts/generate_data.py")
    print("2. Run 02_feature_engineering.ipynb")
    raise

# Features and targets must line up row for row before any model is fitted.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Target distribution in training set:")
print(y_train.value_counts(normalize=True))

In [None]:
# Compare four baseline classifiers.
#
# Each model is scored twice: ROC-AUC under 5-fold stratified cross-validation
# on the training split, and ROC-AUC of a single fit evaluated on the test
# split. The fitted estimator, its predictions and both scores are kept in
# `results`, keyed by the display name used in `models`.
print("=== MODEL COMPARISON ===")

models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    # probability=True is required so that SVC exposes predict_proba below.
    'SVM': SVC(random_state=42, probability=True)
}

# Shared splitter so every model (and the later grid search) sees the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Cross-validated ROC-AUC on the training split only.
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')

    # Fit on the full training split, then score once on the test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 is the probability of the second class in model.classes_
    # (presumably the churn class - confirm against the target encoding).
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        'model': model,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'test_roc_auc': roc_auc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    print(f"CV ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"Test ROC-AUC: {roc_auc:.4f}")

# Summary table, best model first.
# The ranking uses the cross-validated score, not the test score: later cells
# treat row 0 as the winning model, and choosing it by test ROC-AUC would leak
# the held-out split into model selection and bias the reported test metric.
print("\n=== MODEL COMPARISON SUMMARY ===")
comparison_df = pd.DataFrame(
    [
        {
            'Model': model_name,
            'CV_ROC_AUC_Mean': entry['cv_mean'],
            'CV_ROC_AUC_Std': entry['cv_std'],
            'Test_ROC_AUC': entry['test_roc_auc'],
        }
        for model_name, entry in results.items()
    ]
).sort_values('CV_ROC_AUC_Mean', ascending=False)

print(comparison_df)

In [None]:
# Four-panel summary of the comparison: ROC curves, test ROC-AUC bars,
# cross-validation means with error bars, and the confusion matrix of the
# model in the first row of comparison_df.
print("=== MODEL PERFORMANCE VISUALIZATION ===")

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax1, ax2 = axes[0]
ax3, ax4 = axes[1]

# Panel 1: one ROC curve per model plus the chance diagonal.
for name, entry in results.items():
    fpr, tpr, _ = roc_curve(y_test, entry['y_pred_proba'])
    auc_value = entry['test_roc_auc']
    ax1.plot(fpr, tpr, label=f'{name} (AUC = {auc_value:.3f})')

ax1.plot([0, 1], [0, 1], 'k--', label='Random')
ax1.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves Comparison',
)
ax1.legend()
ax1.grid(True)

# Panel 2: test ROC-AUC per model, in comparison_df order.
models_names = comparison_df['Model']
test_scores = comparison_df['Test_ROC_AUC']
bars = ax2.bar(models_names, test_scores)
ax2.set(title='Test ROC-AUC Scores', ylabel='ROC-AUC Score')
ax2.tick_params(axis='x', rotation=45)
ax2.set_ylim(0.5, 1.0)

# Print each score just above its bar.
for bar, score in zip(bars, test_scores):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    ax2.text(label_x, label_y, f'{score:.3f}', ha='center', va='bottom')

# Panel 3: cross-validation mean with one standard deviation as error bar.
cv_means = comparison_df['CV_ROC_AUC_Mean']
cv_stds = comparison_df['CV_ROC_AUC_Std']
ax3.errorbar(models_names, cv_means, yerr=cv_stds, fmt='o', capsize=5)
ax3.set(title='Cross-Validation ROC-AUC Scores', ylabel='ROC-AUC Score')
ax3.tick_params(axis='x', rotation=45)
ax3.set_ylim(0.5, 1.0)
ax3.grid(True)

# Panel 4: confusion matrix for the top-ranked row of comparison_df.
best_model_name = comparison_df.iloc[0]['Model']
best_y_pred = results[best_model_name]['y_pred']
cm = confusion_matrix(y_test, best_y_pred)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax4)
ax4.set(
    title=f'Confusion Matrix - {best_model_name}',
    xlabel='Predicted',
    ylabel='Actual',
)

plt.tight_layout()
plt.show()

print(f"\n🏆 Best performing model: {best_model_name}")
print(f"Test ROC-AUC: {results[best_model_name]['test_roc_auc']:.4f}")

In [None]:
# Grid-search the hyperparameters of the top-ranked model.
#
# Outputs consumed by later cells: best_model, best_params, best_cv_score,
# y_pred_tuned, y_pred_proba_tuned and test_roc_auc_tuned. All six are set on
# both branches so downstream cells never depend on which branch ran or on
# values left over from a previous kernel session.
print("=== HYPERPARAMETER TUNING ===")

best_model_name = comparison_df.iloc[0]['Model']
print(f"Tuning hyperparameters for: {best_model_name}")

# Candidate values per model, keyed by the display names used in `models`.
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0]
    },
    'Logistic Regression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    }
}

if best_model_name in param_grids:
    base_model = models[best_model_name]
    param_grid = param_grids[best_model_name]

    # Exhaustive search over the grid, scored by ROC-AUC on the same folds
    # used for the baseline comparison; n_jobs=-1 uses every available core.
    print(f"Running GridSearchCV with {len(param_grid)} parameters...")
    grid_search = GridSearchCV(
        base_model, param_grid, cv=cv, scoring='roc_auc',
        n_jobs=-1, verbose=1
    )

    grid_search.fit(X_train, y_train)

    # Winning configuration and its mean cross-validated score.
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_cv_score = grid_search.best_score_

    print(f"\n🎯 Best parameters: {best_params}")
    print(f"Best CV ROC-AUC: {best_cv_score:.4f}")

    # Score the tuned model on the held-out test split.
    y_pred_tuned = best_model.predict(X_test)
    y_pred_proba_tuned = best_model.predict_proba(X_test)[:, 1]
    test_roc_auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned)

    print(f"Tuned model Test ROC-AUC: {test_roc_auc_tuned:.4f}")
    print(f"Improvement: {test_roc_auc_tuned - results[best_model_name]['test_roc_auc']:.4f}")

else:
    # No grid for this model: fall back to the untuned baseline results.
    print(f"No parameter grid defined for {best_model_name}")
    best_model = results[best_model_name]['model']
    best_params = {}
    best_cv_score = results[best_model_name]['cv_mean']
    y_pred_tuned = results[best_model_name]['y_pred']
    y_pred_proba_tuned = results[best_model_name]['y_pred_proba']
    test_roc_auc_tuned = results[best_model_name]['test_roc_auc']

In [None]:
# Report per-class metrics for the selected model, then show which features
# drive it: impurity-based importances when the estimator exposes them,
# otherwise coefficient magnitudes for models that expose coef_.
print("=== DETAILED MODEL EVALUATION ===")

print("Classification Report:")
print(classification_report(y_test, y_pred_tuned))

if hasattr(best_model, 'feature_importances_'):
    # `feature_importance` is read again by the MLflow logging cell.
    feature_importance = (
        pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_model.feature_importances_,
        })
        .sort_values('importance', ascending=False)
    )

    print("\nTop 10 Feature Importances:")
    print(feature_importance.head(10))

    # Horizontal bar chart of the 15 largest importances, largest on top.
    top_features = feature_importance.head(15)
    bar_positions = range(len(top_features))

    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, top_features['importance'])
    plt.yticks(bar_positions, top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 15 Feature Importance - {best_model_name}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

elif hasattr(best_model, 'coef_'):
    # Rank features by absolute coefficient; only the first row of coef_ is
    # used (assumes a binary target - confirm).
    coef_importance = (
        pd.DataFrame({
            'feature': X_train.columns,
            'coefficient': best_model.coef_[0],
        })
        .assign(abs_coefficient=lambda frame: frame['coefficient'].abs())
        .sort_values('abs_coefficient', ascending=False)
    )

    print("\nTop 10 Feature Coefficients:")
    print(coef_importance.head(10))

In [None]:
# Record the selected model in MLflow: its name and tuned parameters, the
# test and cross-validation metrics, the fitted estimator and, when
# available, the feature-importance table.
print("=== MLFLOW EXPERIMENT LOGGING ===")

# Threshold metrics on the test split. They are computed before the run is
# opened so a scoring error cannot leave a half-logged run behind; the
# model-saving cell reads these four names as well.
accuracy = accuracy_score(y_test, y_pred_tuned)
precision = precision_score(y_test, y_pred_tuned)
recall = recall_score(y_test, y_pred_tuned)
f1 = f1_score(y_test, y_pred_tuned)

# Use the grid-search score when tuning produced one, otherwise the baseline
# cross-validation mean of the selected model.
try:
    cv_roc_auc_mean = best_cv_score
except NameError:
    cv_roc_auc_mean = results[best_model_name]['cv_mean']

run_name = f"best_model_{best_model_name.lower().replace(' ', '_')}"

with mlflow.start_run(run_name=run_name) as run:
    # Parameters
    mlflow.log_param("model_type", best_model_name)
    if best_params:
        mlflow.log_params(best_params)

    # Metrics
    mlflow.log_metric("test_roc_auc", test_roc_auc_tuned)
    mlflow.log_metric("cv_roc_auc_mean", cv_roc_auc_mean)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Fitted estimator
    mlflow.sklearn.log_model(best_model, "model")

    # Feature-importance table: staged in a temporary directory so no stray
    # CSV is left in the notebook's working directory after logging. The
    # file name is unchanged, so the artifact keeps the same name in MLflow.
    if hasattr(best_model, 'feature_importances_'):
        with tempfile.TemporaryDirectory() as staging_dir:
            csv_path = os.path.join(staging_dir, "feature_importance.csv")
            feature_importance.to_csv(csv_path, index=False)
            mlflow.log_artifact(csv_path)

    print(f"✅ Experiment logged to MLflow")
    print(f"Run ID: {run.info.run_id}")

In [None]:
# Persist the selected model and a JSON description of it under
# ../models/artifacts, then print a short summary of the run.
print("=== SAVING BEST MODEL ===")

# Single definition of the output directory; both file paths derive from it.
artifacts_dir = '../models/artifacts'
os.makedirs(artifacts_dir, exist_ok=True)

# Model file name carries the model's display name in snake_case.
model_slug = best_model_name.lower().replace(" ", "_")
model_filename = f'{artifacts_dir}/best_churn_model_{model_slug}.joblib'
joblib.dump(best_model, model_filename)

# Metadata written next to the model: what was trained, with which
# parameters, how it scored on the test split, and the feature columns
# the model was fitted on.
model_metadata = {
    'model_name': best_model_name,
    'model_type': type(best_model).__name__,
    'parameters': best_params,
    'test_roc_auc': test_roc_auc_tuned,
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'features': list(X_train.columns),
    'training_date': pd.Timestamp.now().isoformat()
}

metadata_filename = f'{artifacts_dir}/model_metadata.json'
# Explicit encoding keeps the file identical across platforms.
with open(metadata_filename, 'w', encoding='utf-8') as metadata_file:
    json.dump(model_metadata, metadata_file, indent=2)

print(f"✅ Model saved: {model_filename}")
print(f"✅ Metadata saved: {metadata_filename}")

# Final summary
print("\n=== TRAINING SUMMARY ===")
print(f"Best Model: {best_model_name}")
print(f"Test ROC-AUC: {test_roc_auc_tuned:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"\n🎉 Model training completed successfully!")