In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score, classification_report
from tqdm.auto import tqdm
import time
import shap
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def load_and_prepare_data(file_path):
    df = pd.read_csv(file_path)
    
    # Remove non-numeric columns and target-related columns that shouldn't be features
    exclude_cols = ['user_id', 'risk_category', 'primary_risk_drivers', 
                   'churn_score', 'estimated_days_to_churn', 'risk_velocity']
    
    # Separate features and target
    X = df.drop(exclude_cols + ['is_likely_churn'], axis=1)
    y = df['is_likely_churn']
    
    return X, y

In [3]:
X, y = load_and_prepare_data('data/events_with_churn_score.csv')

In [4]:
def train_and_evaluate_models(X, y):
    print("\n1. Starting model training and evaluation...")
    print("2. Splitting data into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print("3. Scaling features...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("   ✓ Data preprocessing completed")
    
    # Initialize models
    models = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42)
    }
    
    # Define hyperparameter grids
    param_grids = {
        'Random Forest': {
            'n_estimators': [100, 200],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5]
        },
        'Gradient Boosting': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    }
    
    best_models = {}
    results = {}
    
    print("\n4. Training models with grid search...")
    for name, model in tqdm(models.items(), desc="Training models"):
        print(f"\n   Starting {name} classifier training...")
        start_time = time.time()
        # Perform grid search
        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='roc_auc')
        grid_search.fit(X_train_scaled, y_train)
        
        # Save best model
        best_models[name] = grid_search.best_estimator_
        
        # Make predictions
        y_pred = grid_search.predict(X_test_scaled)
        y_pred_proba = grid_search.predict_proba(X_test_scaled)[:, 1]
        
        # Calculate metrics
        results[name] = {
            'auc': roc_auc_score(y_test, y_pred_proba),
            'f1': f1_score(y_test, y_pred),
            'best_params': grid_search.best_params_,
            'feature_importance': None  # Will be filled later
        }
        
        training_time = time.time() - start_time
        
        print(f"\n   ✓ {name} Training completed in {training_time:.2f} seconds")
        print(f"   Results for {name}:")
        print(f"   - Best parameters: {grid_search.best_params_}")
        print(f"   - AUC Score: {results[name]['auc']:.4f}")
        print(f"   - F1 Score: {results[name]['f1']:.4f}")
        print("\n   Classification Report:")
        print("   " + classification_report(y_test, y_pred).replace("\n", "\n   "))
    
    return best_models, results, X_train_scaled, X_test_scaled, y_train, y_test

In [5]:
best_models, results, X_train_scaled, X_test_scaled, y_train, y_test = train_and_evaluate_models(X, y)


1. Starting model training and evaluation...
2. Splitting data into train and test sets...
3. Scaling features...
   ✓ Data preprocessing completed

4. Training models with grid search...


Training models:   0%|          | 0/2 [00:00<?, ?it/s]


   Starting Random Forest classifier training...

   ✓ Random Forest Training completed in 118.82 seconds
   Results for Random Forest:
   - Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
   - AUC Score: 0.9973
   - F1 Score: 0.9779

   Classification Report:
                 precision    recall  f1-score   support
   
              0       0.98      0.96      0.97      1141
              1       0.97      0.98      0.98      1530
   
       accuracy                           0.97      2671
      macro avg       0.97      0.97      0.97      2671
   weighted avg       0.97      0.97      0.97      2671
   

   Starting Gradient Boosting classifier training...

   ✓ Gradient Boosting Training completed in 552.02 seconds
   Results for Gradient Boosting:
   - Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
   - AUC Score: 0.9979
   - F1 Score: 0.9781

   Classification Report:
                 precision    recall  f1-score   

In [6]:
def analyze_feature_importance(best_models, X, X_train_scaled, X_test_scaled, y_test):
    # Get feature names
    feature_names = X.columns
    
    # Random Forest feature importance
    rf_model = best_models['Random Forest']
    rf_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # SHAP values for Gradient Boosting
    gb_model = best_models['Gradient Boosting']
    explainer = shap.TreeExplainer(gb_model)
    shap_values = explainer.shap_values(X_test_scaled)
    
    if isinstance(shap_values, list):
        shap_values = shap_values[1]  # For binary classification
    
    # Calculate mean absolute SHAP values for each feature
    shap_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('importance', ascending=False)
    
    return rf_importance, shap_importance

In [9]:
rf_importance, shap_importance = analyze_feature_importance(
        best_models, X, X_train_scaled, X_test_scaled, y_test)

In [10]:
print("\nImportance of Features (Random Forest):")
print(rf_importance)


Importance of Features (Random Forest):
                 feature  importance
39               int_rec    0.122319
30         inactive_days    0.109559
0                ses_rec    0.102569
26            ses_wknd_r    0.054557
32     peak_activity_day    0.053440
28           time_to_int    0.031411
24            ses_hr_avg    0.030997
6                ses_n_r    0.028902
33        off_hours_rate    0.027546
7                  int_n    0.025637
1            ses_rec_avg    0.025485
40            int_rec_sd    0.025483
31      peak_activity_hr    0.022467
4               user_rec    0.021593
11               int_n_r    0.017133
14               rev_sum    0.017106
27           ses_len_avg    0.016985
13              tran_n_r    0.015278
45           rev_per_int    0.015073
8             view_count    0.014204
20         int_cat_n_avg    0.013455
25             ses_hr_sd    0.012913
15      rev_per_purchase    0.012901
12                tran_n    0.012866
38            ses_gap_sd    0.0123

In [11]:
print("\nImportance of Features (SHAP):")
print(shap_importance)


Importance of Features (SHAP):
                 feature  importance
26            ses_wknd_r    1.150781
0                ses_rec    0.994610
39               int_rec    0.920906
20         int_cat_n_avg    0.795104
30         inactive_days    0.779311
33        off_hours_rate    0.689454
13              tran_n_r    0.657306
25             ses_hr_sd    0.332848
7                  int_n    0.329086
15      rev_per_purchase    0.328420
6                ses_n_r    0.289325
11               int_n_r    0.287799
48       cross_cat_ratio    0.266146
1            ses_rec_avg    0.212648
32     peak_activity_day    0.197808
27           ses_len_avg    0.182911
47   pop_cat_consistency    0.161831
40            int_rec_sd    0.138429
24            ses_hr_avg    0.132634
43              int_skew    0.131183
14               rev_sum    0.130167
38            ses_gap_sd    0.127274
2             ses_rec_sd    0.126941
23             ses_mo_sd    0.122057
28           time_to_int    0.120682
19    