In [1]:

import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier


In [2]:

# Load dataset
df = pd.read_csv('/Users/pyaekyitharchaw/Documents/CSCT Project/Data/botswana_bank_customer_churn.csv')

# Binning numeric variables
df['Income Band'] = pd.qcut(df['Income'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
df['Balance Band'] = pd.qcut(df['Balance'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
df['Outstanding Loans Band'] = pd.qcut(df['Outstanding Loans'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

# Label encoding
le_income = LabelEncoder()
df['Income Band Encoded'] = le_income.fit_transform(df['Income Band'])

le_balance = LabelEncoder()
df['Balance Band Encoded'] = le_balance.fit_transform(df['Balance Band'])

le_outstanding = LabelEncoder()
df['Outstanding Loans Band Encoded'] = le_outstanding.fit_transform(df['Outstanding Loans Band'])

# Save encoders and bin edges
income_bins = pd.qcut(df['Income'], q=5, retbins=True, duplicates='drop')[1]
balance_bins = pd.qcut(df['Balance'], q=5, retbins=True, duplicates='drop')[1]

joblib.dump(le_income, 'le_income.pkl')
joblib.dump(le_balance, 'le_balance.pkl')
joblib.dump(income_bins, 'income_bins.pkl')
joblib.dump(balance_bins, 'balance_bins.pkl')


['balance_bins.pkl']

In [3]:

# Select final features
features = [
    'Credit Score', 'Customer Tenure', 'Balance Band Encoded',
    'NumOfProducts', 'Outstanding Loans Band Encoded', 'Income Band Encoded',
    'Credit History Length', 'NumComplaints'
]
target = 'Churn Flag'

X = df[features]
y = df[target]


In [4]:

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [5]:

# Define models
models = {
    'XGBoost': XGBClassifier(scale_pos_weight=10, use_label_encoder=False, eval_metric='logloss', random_state=42),
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'LGBM': LGBMClassifier(max_depth=5, num_leaves=20, class_weight='balanced', boosting_type='gbdt', objective='binary', random_state=42)
}

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate models
for name, model in models.items():
    print(f"\n{name}")
    scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='roc_auc')
    print(f"ROC AUC Scores: {scores}")
    print(f"Mean ROC AUC: {scores.mean():.4f}")



XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


ROC AUC Scores: [0.98853965 0.98924838 0.98921218 0.98945412 0.98881769]
Mean ROC AUC: 0.9891

Logistic Regression
ROC AUC Scores: [0.93316149 0.93447738 0.93347199 0.93412127 0.93533087]
Mean ROC AUC: 0.9341

Random Forest
ROC AUC Scores: [0.98784527 0.98878076 0.98858228 0.98918234 0.98813773]
Mean ROC AUC: 0.9885

LGBM
[LightGBM] [Info] Number of positive: 11276, number of negative: 81236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 92512, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 11275, number of negative: 81237
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 92512, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 11275, number of negative: 81237
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 92512, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 11275, number of negative: 81237
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000946 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 92512, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 11275, number of negative: 81237
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 92512, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
ROC AUC Scores: [0.98996334 0.99036446 0.99035839 0.99080269 0.99015888]
Mean ROC AUC: 0.9903




In [6]:

# Train final model (XGBoost example)
final_model = XGBClassifier(scale_pos_weight=10, use_label_encoder=False, eval_metric='logloss', random_state=42)
final_model.fit(X_scaled, y)

# Save trained model
joblib.dump(final_model, 'churn_model_cv.pkl')
print("✅ Model and encoders saved successfully.")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Model and encoders saved successfully.
