# Modeling

In [74]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier
import shap
import joblib

In [55]:
# Load the preprocessed dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='preprocessed_df.csv')

## Data Preprocess
- We build preprocessors that scale the numerical features and encode the categorical features.
- Some models work better with one-hot encoded categorical features, while others only need ordinal (label) encoded ones, so we build two different preprocessors.
- The dataset is imbalanced, so we can either rebalance the training data with synthetic augmentation (e.g. SMOTE) or use models that are more resilient to imbalanced data (Random Forests, boosted models).

In [56]:
# Separate the feature matrix from the 'Exited' target column.
X = data.drop(columns='Exited')
y = data['Exited']

In [57]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Preprocessors

In [58]:
# Preprocessing pipelines for numerical and categorical features.
# Columns of X that appear in neither list are forwarded unchanged
# (remainder='passthrough').
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance (EUR)', 'NumberOfProducts', 'EstimatedSalary']
categorical_features = ['Country', 'Gender', 'Sentiment', 'EstimatedSalary_Category', 'CreditScore_Category', 'Age_Category', 'Balance (EUR)_Category']

# One-hot variant: scaled numerics + one indicator column per category level.
# handle_unknown='ignore' encodes a level that was absent when the encoder was
# fitted (e.g. missing from one CV training fold) as all zeros instead of raising.
preprocessor_onehot = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ], remainder='passthrough')

# Ordinal variant: scaled numerics + one integer code per category level.
# Levels unseen at fit time are mapped to -1 instead of raising.
preprocessor_label = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_features)
        ], remainder='passthrough')

def create_pipeline(model, preprocessor, augment=False):
    """Chain a preprocessor and a classifier into a single pipeline.

    Parameters
    ----------
    model : estimator
        Classifier placed last, under the step name 'classifier'.
    preprocessor : transformer
        Placed first, under the step name 'preprocessor'.
    augment : bool, default False
        When true, a SMOTE(random_state=42) step named 'smote' is inserted
        between the two and an imblearn pipeline is returned; otherwise a
        plain scikit-learn Pipeline is returned.
    """
    steps = [('preprocessor', preprocessor)]
    if augment:
        steps.append(('smote', SMOTE(random_state=42)))
    steps.append(('classifier', model))

    # A resampling step is only accepted by the imblearn pipeline class.
    pipeline_cls = ImbPipeline if augment else Pipeline
    return pipeline_cls(steps=steps)


## Model Selection
- In this section we try different models, each with its own hyperparameter search space.
- We create a function that allows us to introduce augmentation into the training data.
- We select the best-performing model in terms of recall, since we are interested in catching all the churning clients (the clients on whom we want to focus our business decisions).

In [68]:
def build_models(augment=False):
    """Return the candidate models together with their grid-search spaces.

    Each entry is a dict with the keys 'name', 'estimator' (a pipeline built
    by create_pipeline) and 'params' (the grid, keyed by 'classifier__<param>').
    Logistic Regression and the SVC use the one-hot preprocessor; Random Forest
    and XGBoost use the ordinal one. `augment` is forwarded to create_pipeline.
    """
    def candidate(name, classifier, preprocessor, grid):
        # Assemble one entry in the dict shape described above.
        return {
            'name': name,
            'estimator': create_pipeline(classifier, preprocessor, augment),
            'params': grid,
        }

    logistic = candidate(
        'Logistic Regression',
        LogisticRegression(solver='liblinear'),
        preprocessor_onehot,
        {
            'classifier__C': [0.1, 1.0, 10],
            'classifier__penalty': ['l1', 'l2'],
        },
    )
    forest = candidate(
        'Random Forest',
        RandomForestClassifier(random_state=42),
        preprocessor_label,
        {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10, 20, 50],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2],
        },
    )
    boosted = candidate(
        'XGBoost',
        XGBClassifier(random_state=42),
        preprocessor_label,
        {
            'classifier__n_estimators': [200, 300, 500],
            'classifier__learning_rate': [0.01, 0.1, 0.2, 0.25],
            'classifier__max_depth': [3, 5, 7],
            'classifier__gamma': [0, 0.1, 0.2],
        },
    )
    support_vector = candidate(
        'Support Vector Classifier',
        SVC(probability=True, random_state=42),
        preprocessor_onehot,
        {
            'classifier__C': [1.0, 10, 50],
            'classifier__kernel': ['rbf'],
            'classifier__gamma': ['scale', 'auto'],
        },
    )
    return [logistic, forest, boosted, support_vector]

In [69]:
def model_scoring(models, X=None, y=None):
    """Grid-search every candidate and collect the best configuration of each.

    Parameters
    ----------
    models : list of dict
        Each entry must provide 'name', 'estimator' and 'params'.
    X, y : optional
        Training features and target. When both are omitted the notebook-level
        `X_train` / `y_train` are used, which keeps existing calls working.

    Returns
    -------
    list of dict
        One entry per candidate with the keys 'name', 'best_estimator',
        'best_params' and 'best_score' (mean cross-validated recall).

    Raises
    ------
    ValueError
        If only one of `X` and `y` is supplied.
    """
    # Mixing explicit features with the global target (or vice versa) would
    # silently train on mismatched data, so require both or neither.
    if (X is None) != (y is None):
        raise ValueError("Provide both X and y, or neither.")
    if X is None:
        X, y = X_train, y_train

    best_models = []
    for model in models:
        print(f"Training {model['name']}...")
        # 5-fold CV over the candidate's grid, optimising positive-class recall.
        grid_search = GridSearchCV(model['estimator'], model['params'], cv=5, n_jobs=-1, scoring='recall')
        grid_search.fit(X, y)
        best_models.append({
            'name': model['name'],
            'best_estimator': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_
        })
        print(f"Best parameters for {model['name']}: {grid_search.best_params_}")
        print(f"Best cross-validation recall score for {model['name']}: {grid_search.best_score_}")
    return best_models

In [76]:
def evaluate_models(best_models, X_test, y_test):
    """Print test-set metrics for every fitted model.

    Parameters
    ----------
    best_models : list of dict
        Each entry must provide 'name' and 'best_estimator'; the estimator
        must expose `predict`.
    X_test, y_test
        Features and true labels to evaluate on.

    For each entry this prints the recall, the confusion matrix and the
    classification report. Nothing is returned.
    """
    for entry in best_models:
        label = entry['name']
        print(f"Evaluating {label}...")
        predictions = entry['best_estimator'].predict(X_test)

        # Compute all three summaries first, then emit them in a fixed order.
        report_lines = (
            f"Test Recall for {label}: {recall_score(y_test, predictions)}",
            f"Confusion Matrix for {label}:\n{confusion_matrix(y_test, predictions)}",
            f"Classification Report for {label}:\n{classification_report(y_test, predictions)}",
        )
        for line in report_lines:
            print(line)

In [70]:
# Two parallel candidate sets: plain pipelines, and pipelines with the
# augmentation step enabled.
models = build_models(augment=False)
models_augmented = build_models(augment=True)

In [None]:
# Grid-search the non-augmented candidates.
# NOTE(review): this cell is saved without an execution count, yet a later cell
# evaluates `best_models` — re-run the notebook top to bottom so outputs match the code.
best_models = model_scoring(models=models)

In [71]:
# Grid-search the candidates whose pipelines include the augmentation step.
best_models_aug = model_scoring(models=models_augmented)

Training Logistic Regression...
Best parameters for Logistic Regression: {'classifier__C': 0.1, 'classifier__penalty': 'l2'}
Best cross-validation recall score for Logistic Regression: 0.7061349693251534
Training Random Forest...
Best parameters for Random Forest: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 500}
Best cross-validation recall score for Random Forest: 0.592638036809816
Training XGBoost...
Best parameters for XGBoost: {'classifier__gamma': 0, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}
Best cross-validation recall score for XGBoost: 0.6306748466257669


In [77]:
# Held-out test metrics for the models trained without augmentation.
evaluate_models(best_models=best_models, X_test=X_test, y_test=y_test)

Evaluating Logistic Regression...
Test Recall for Logistic Regression: 0.32432432432432434
Confusion Matrix for Logistic Regression:
[[1533   60]
 [ 275  132]]
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      1593
           1       0.69      0.32      0.44       407

    accuracy                           0.83      2000
   macro avg       0.77      0.64      0.67      2000
weighted avg       0.82      0.83      0.81      2000

Evaluating Random Forest...
Test Recall for Random Forest: 0.4742014742014742
Confusion Matrix for Random Forest:
[[1545   48]
 [ 214  193]]
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.80      0.47      0.60       407

    accuracy                           0.87      2000
   macro avg       0.84      0.72      0.76      2000
weighted avg

In [78]:
# Held-out test metrics for the models trained with augmentation.
evaluate_models(best_models=best_models_aug, X_test=X_test, y_test=y_test)

Evaluating Logistic Regression...
Test Recall for Logistic Regression: 0.6928746928746928
Confusion Matrix for Logistic Regression:
[[1209  384]
 [ 125  282]]
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.91      0.76      0.83      1593
           1       0.42      0.69      0.53       407

    accuracy                           0.75      2000
   macro avg       0.66      0.73      0.68      2000
weighted avg       0.81      0.75      0.76      2000

Evaluating Random Forest...
Test Recall for Random Forest: 0.6044226044226044
Confusion Matrix for Random Forest:
[[1477  116]
 [ 161  246]]
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      1593
           1       0.68      0.60      0.64       407

    accuracy                           0.86      2000
   macro avg       0.79      0.77      0.78      2000
weighted avg 

## Comments on model selection
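- Without augmentation of the training dataset, the models perform worse in terms of positive-class recall.
- The best model seems to be the logistic regression trained on the augmented data: it reaches the highest test recall on the positive class among the results shown above (about 0.69), at the cost of lower precision (0.42).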