In [17]:
# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [18]:
# Step 2: Load Dataset
df = pd.read_csv('/Users/pyaekyitharchaw/Documents/CSCT Project/Data/botswana_bank_customer_churn.csv')  # Replace with your file

df['Income Band'] = pd.qcut(df['Income'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
df['Balance Band'] = pd.qcut(df['Balance'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
df['Outstanding Loans Band'] = pd.qcut(df['Outstanding Loans'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

le_income = LabelEncoder()
df['Income Band Encoded'] = le_income.fit_transform(df['Income Band'])

le_balance = LabelEncoder()
df['Balance Band Encoded'] = le_balance.fit_transform(df['Balance Band'])

le_outstanding_loans = LabelEncoder()
df['Outstanding Loans Band Encoded'] = le_balance.fit_transform(df['Outstanding Loans Band'])


# Step 3: Select Features and Target
features = [
    'Credit Score', 'Customer Tenure', 'Balance Band Encoded',
    'NumOfProducts', 'Outstanding Loans Band Encoded', 'Income Band Encoded',
    'Credit History Length', 'NumComplaints'
]
target = 'Churn Flag' 

X = df[features]
y = df[target]

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115640 entries, 0 to 115639
Data columns (total 31 columns):
 #   Column                           Non-Null Count   Dtype   
---  ------                           --------------   -----   
 0   RowNumber                        115640 non-null  int64   
 1   CustomerId                       115640 non-null  object  
 2   Surname                          115640 non-null  object  
 3   First Name                       115640 non-null  object  
 4   Date of Birth                    115640 non-null  object  
 5   Gender                           115640 non-null  object  
 6   Marital Status                   115640 non-null  object  
 7   Number of Dependents             115640 non-null  int64   
 8   Occupation                       115640 non-null  object  
 9   Income                           115640 non-null  float64 
 10  Education Level                  115640 non-null  object  
 11  Address                          115640 non-null  ob

In [20]:
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [21]:
# Step 5: Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# Step 6: Train Models and Evaluation

# XGB
xgb_model = XGBClassifier(scale_pos_weight=10, use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
print("XGBoost")
print(classification_report(y_test, xgb_preds))
print(confusion_matrix(y_test, xgb_preds))
print("ROC AUC:", roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:,1]))

# Logistic Regression
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000)
lr_model.fit(X_train_scaled, y_train)
lr_preds = lr_model.predict(X_test_scaled)
print("\nLogistic Regression")
print(classification_report(y_test, lr_preds))
print(confusion_matrix(y_test, lr_preds))
print("ROC AUC:", roc_auc_score(y_test, lr_model.predict_proba(X_test_scaled)[:,1]))

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
print("\nRandom Forest")
print(classification_report(y_test, rf_preds))
print(confusion_matrix(y_test, rf_preds))
print("ROC AUC:", roc_auc_score(y_test, rf_model.predict_proba(X_test)[:,1]))

# LGBM
lgbm_model = LGBMClassifier(max_depth=5, num_leaves=20, class_weight='balanced', boosting_type='gbdt', objective='binary', random_state=42)
lgbm_model.fit(X_train, y_train)
lgbm_preds = lgbm_model.predict(X_test)
print("\nLGBM")
print(classification_report(y_test, lgbm_preds))
print(confusion_matrix(y_test, lgbm_preds))
print("ROC AUC:", roc_auc_score(y_test, lgbm_model.predict_proba(X_test)[:,1]))



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
              precision    recall  f1-score   support

           0       0.99      0.94      0.97     20309
           1       0.70      0.96      0.81      2819

    accuracy                           0.95     23128
   macro avg       0.85      0.95      0.89     23128
weighted avg       0.96      0.95      0.95     23128

[[19165  1144]
 [  115  2704]]
ROC AUC: 0.9886653910806315
Logistic Regression
              precision    recall  f1-score   support

           0       0.98      0.84      0.90     20309
           1       0.43      0.88      0.58      2819

    accuracy                           0.84     23128
   macro avg       0.71      0.86      0.74     23128
weighted avg       0.91      0.84      0.86     23128

[[17032  3277]
 [  325  2494]]
ROC AUC: 0.9344818894305051

Random Forest
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     20309
           1       0.85      0.79      0.82      2819

    accuracy         

In [23]:
# Step 7: Export Model and Scaler
joblib.dump(xgb_model, 'xgb_model.pkl')
joblib.dump(lgbm_model, 'lgbm_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Model and scaler saved successfully!")

Model and scaler saved successfully!


In [24]:
# Get and save quantile bins
income_bins = pd.qcut(df['Income'], q=5, retbins=True, duplicates='drop')[1]
balance_bins = pd.qcut(df['Balance'], q=5, retbins=True, duplicates='drop')[1]

joblib.dump(income_bins, 'income_bins.pkl')
joblib.dump(balance_bins, 'balance_bins.pkl')

# Save fitted label encoders
joblib.dump(le_income, 'le_income.pkl')
joblib.dump(le_balance, 'le_balance.pkl')

print("✅ Saved income_bins, balance_bins, le_income, and le_balance.")


✅ Saved income_bins, balance_bins, le_income, and le_balance.
