## Mount Colab Drive

In [1]:
# Mount Google Drive into the Colab filesystem so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

# Folder on the mounted Drive that later cells resolve data files against.
# NOTE(review): absolute, user-specific path -- adjust when running outside this Drive layout.
import pathlib
root = pathlib.Path('/content/drive/MyDrive/home/Research/Springboard/Colab Notebooks')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

## Read Data

In [3]:
# Read the store data CSV from the project folder, then preview the first rows.
df = pd.read_csv(root.joinpath('storedata_total.csv'))
df.head()

Unnamed: 0,custid,retained,created,firstorder,lastorder,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,favday,city
0,6H6T6N,0,2012-09-28,2013-08-11 00:00:00,2013-08-11 00:00:00,29,100.0,3.448276,14.52,0.0,0,0,0,Monday,DEL
1,APCENR,1,2010-12-19,2011-04-01 00:00:00,2014-01-19 00:00:00,95,92.631579,10.526316,83.69,0.181641,1,1,1,Friday,DEL
2,7UP6MS,0,2010-10-03,2010-12-01 00:00:00,2011-07-06 00:00:00,0,0.0,0.0,33.58,0.059908,0,0,0,Wednesday,DEL
3,7ZEW8G,0,2010-10-22,2011-03-28 00:00:00,2011-03-28 00:00:00,0,0.0,0.0,54.96,0.0,0,0,0,Thursday,BOM
4,8V726M,1,2010-11-27,2010-11-29 00:00:00,2013-01-28 00:00:00,30,90.0,13.333333,111.91,0.00885,0,0,0,Monday,BOM


## Preprocess

In [4]:
# Parse the three date columns; values that cannot be parsed become NaT
# (errors='coerce') and are therefore removed by the dropna below.
for date_col in ('created', 'firstorder', 'lastorder'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Remove every row that has a missing value in any column, reporting the
# frame shape on either side of the drop.
print( 'shape before drop:', df.shape )
df = df.dropna()
print( 'shape after drop:', df.shape )

shape before drop: (30801, 15)
shape after drop: (30747, 15)


## Feature Engineering

In [5]:
# Label column and the engineered columns (all created below) used as model inputs.
targetCol = 'retained'
featureCols = [
    'daysSinceLastOrder',
    'daysToFirstOrder',
    'ordersPerDay',
    'emailEngagementScore',
    'serviceAdoptionScore',
]

# Recency: whole days between the latest `lastorder` in the data set (used as
# the reference point) and each customer's own last order.
# NOTE(review): if `retained` was itself derived from order recency, this
# feature may leak the target -- confirm how the label was built.
reference_date = df['lastorder'].max()
df['daysSinceLastOrder'] = (reference_date - df['lastorder']).dt.days

# Time to first purchase: whole days from account creation to first order.
df['daysToFirstOrder'] = df['firstorder'].sub(df['created']).dt.days

# Order frequency normalised by customer lifetime (creation -> last order).
# The +1 keeps the denominator non-zero when both dates fall on the same day.
# NOTE(review): a lifetime of exactly -1 day (lastorder before created) would
# still divide by zero -- confirm lastorder >= created holds for all rows.
customerLifetimeDays = df['lastorder'].sub(df['created']).dt.days
df['ordersPerDay'] = df['ordfreq'].div(customerLifetimeDays.add(1))

# Email engagement: 60% open rate + 40% click rate.
df['emailEngagementScore'] = df['eopenrate'].mul(0.6).add(df['eclickrate'].mul(0.4))

# Service adoption: sum of the paperless, refill and doorstep columns.
df['serviceAdoptionScore'] = df['paperless'].add(df['refill']).add(df['doorstep'])



In [6]:
# Split the frame into the model inputs and the label, then show the class counts.
Features = df[featureCols]
Target = df[targetCol]

Target.value_counts()

Unnamed: 0_level_0,count
retained,Unnamed: 1_level_1
1,24433
0,6314


## Train

In [7]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both partitions (it does not balance the
# classes); the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    Features,
    Target,
    stratify=Target,
    test_size=0.2,
    random_state=42,
)

# Partition sizes, followed by the share of churned customers in each
# partition (one minus the mean of the 0/1 `retained` label).
train_churn_rate = 1 - y_train.mean()
test_churn_rate = 1 - y_test.mean()

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nTraining set churn rate: {train_churn_rate:.2%}")
print(f"Test set churn rate: {test_churn_rate:.2%}")


Training set: 24597 samples
Test set: 6150 samples

Training set churn rate: 20.54%
Test set churn rate: 20.54%


In [8]:
# Baseline model: standardise the features, then fit a depth-limited random forest.
rf_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    (
        "classifier",
        RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])
rf_pipeline.fit(X_train, y_train)

# Hard labels feed the report and confusion matrix; the second probability
# column (class 1, i.e. retained) feeds the ROC AUC.
y_pred_rf = rf_pipeline.predict(X_test)
y_proba_rf = rf_pipeline.predict_proba(X_test)[:, 1]
auc_roc_rf = roc_auc_score(y_test, y_proba_rf)

print(f"Random Forest - AUC ROC: {auc_roc_rf:.4f}")
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_rf, target_names=['Churned', 'Retained']),
    sep="\n",
)
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_rf), sep="\n")


Random Forest - AUC ROC: 0.8468

Classification Report:
              precision    recall  f1-score   support

     Churned       0.62      0.55      0.58      1263
    Retained       0.89      0.91      0.90      4887

    accuracy                           0.84      6150
   macro avg       0.75      0.73      0.74      6150
weighted avg       0.83      0.84      0.83      6150


Confusion Matrix:
[[ 689  574]
 [ 429 4458]]


In [9]:
# Pull the fitted forest out of the pipeline and rank the input features by
# its importance scores, largest first.
rf_model = rf_pipeline.named_steps['classifier']
feature_importance = (
    pd.DataFrame({
        'feature': featureCols,
        'importance': rf_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

print(
    "Feature Importance (Random Forest):",
    feature_importance.to_string(index=False),
    sep="\n",
)


Feature Importance (Random Forest):
             feature  importance
emailEngagementScore    0.560982
  daysSinceLastOrder    0.237317
    daysToFirstOrder    0.076174
        ordersPerDay    0.074760
serviceAdoptionScore    0.050768


## Fine Tuning

In [10]:
# Candidate values for the forest's hyperparameters. The "classifier__" prefix
# routes each setting to the pipeline step named "classifier".
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [5, 10],
    'classifier__min_samples_leaf': [2, 4]
}

# Template pipeline for the search; only the seed and parallelism are fixed
# here, the grid supplies the remaining forest settings.
rf_base_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", RandomForestClassifier(n_jobs=-1, random_state=42)),
])

# Exhaustive search over the grid, scored by ROC AUC under 5-fold
# cross-validation. refit=True retrains the best configuration on the full
# training set once the search is done.
print("Performing grid search... This may take a few minutes.")
grid_rf = GridSearchCV(
    rf_base_pipeline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
grid_rf.fit(X_train, y_train)

print("\nGrid search complete!")
print(f"Best parameters: {grid_rf.best_params_}")
print(f"Best cross-validation AUC ROC: {grid_rf.best_score_:.4f}")


Performing grid search... This may take a few minutes.
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Grid search complete!
Best parameters: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Best cross-validation AUC ROC: 0.8540


In [11]:
# Score the refitted winner of the grid search on the held-out test set.
best_rf_pipeline = grid_rf.best_estimator_

# Hard labels for the report and confusion matrix; probability of class 1
# (second column) for the ROC AUC.
y_pred_best = best_rf_pipeline.predict(X_test)
y_proba_best = best_rf_pipeline.predict_proba(X_test)[:, 1]
auc_roc_best = roc_auc_score(y_test, y_proba_best)

print(
    "Best Model from Grid Search - Test Results:",
    f"AUC ROC: {auc_roc_best:.4f}",
    sep="\n",
)
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_best, target_names=['Churned', 'Retained']),
    sep="\n",
)
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_best), sep="\n")


Best Model from Grid Search - Test Results:
AUC ROC: 0.8472

Classification Report:
              precision    recall  f1-score   support

     Churned       0.62      0.52      0.56      1263
    Retained       0.88      0.92      0.90      4887

    accuracy                           0.84      6150
   macro avg       0.75      0.72      0.73      6150
weighted avg       0.83      0.84      0.83      6150


Confusion Matrix:
[[ 658  605]
 [ 409 4478]]


In [12]:
# Recap of the search: winning settings, their CV score, and grid size.
print("Grid Search Results Summary:")
print(f"Best parameters found: {grid_rf.best_params_}")
print(f"Best cross-validation AUC ROC: {grid_rf.best_score_:.4f}")
print(f"\nNumber of parameter combinations tested: {len(grid_rf.cv_results_['params'])}")

# Tabulate the five candidates with the highest mean cross-validated score,
# keeping only the tuned parameters and the score mean / spread.
results_df = pd.DataFrame(grid_rf.cv_results_)
top_results = results_df.nlargest(5, 'mean_test_score').loc[
    :,
    [
        'param_classifier__n_estimators',
        'param_classifier__max_depth',
        'param_classifier__min_samples_split',
        'param_classifier__min_samples_leaf',
        'mean_test_score',
        'std_test_score',
    ],
]
print("\nTop 5 Parameter Combinations:", top_results.to_string(index=False), sep="\n")


Grid Search Results Summary:
Best parameters found: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Best cross-validation AUC ROC: 0.8540

Number of parameter combinations tested: 24

Top 5 Parameter Combinations:
 param_classifier__n_estimators  param_classifier__max_depth  param_classifier__min_samples_split  param_classifier__min_samples_leaf  mean_test_score  std_test_score
                            100                           15                                   10                                   4         0.854011        0.008425
                            100                           15                                    5                                   4         0.853964        0.008507
                            100                           15                                   10                                   2         0.853494        0.007739
                             50