In [22]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report,
)


In [2]:
import sys
# Install xgboost by invoking pip through sys.executable, i.e. the Python
# interpreter that is running this kernel. No version is pinned here.
!{sys.executable} -m pip install xgboost



In [3]:
# Smoke test: the message below is only printed if the xgboost import succeeds.
from xgboost import XGBClassifier
print("XGBoost imported OK")

XGBoost imported OK


In [5]:
import joblib
from xgboost import XGBClassifier  

In [7]:
# ============================
#  CREATE MODEL FOLDER
# ============================
# Every artifact written by this notebook is stored under ./model.
os.makedirs("model", exist_ok=True)

# ============================
#  LOAD AND PREPROCESS DATA
# ============================
# 1-2. Read the raw CSV and discard the identifier columns in one chained step.
df = pd.read_csv("Churn_Modelling.csv").drop(
    columns=["RowNumber", "CustomerId", "Surname"]
)

# 3. "Exited" is the target; every remaining column is a feature.
y = df["Exited"]
X = df.drop(columns="Exited")

# 4. Integer-encode the categorical columns. Each column gets its own encoder
#    object so the two fitted mappings stay independent of each other.
label_encoder_geo = LabelEncoder()
label_encoder_gender = LabelEncoder()
for column, encoder in (
    ("Geography", label_encoder_geo),
    ("Gender", label_encoder_gender),
):
    X[column] = encoder.fit_transform(X[column])

# 5. Hold out 20% of the rows as a test set. The split is stratified on the
#    target and uses a fixed seed, so it is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [8]:
# 6. Scaling (for models that need it).
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the preprocessing objects for the Streamlit app.
for artifact, target_path in (
    (scaler, "model/scaler.pkl"),
    (label_encoder_geo, "model/label_encoder_geo.pkl"),
    (label_encoder_gender, "model/label_encoder_gender.pkl"),
):
    joblib.dump(artifact, target_path)

# Export the (unscaled) test features together with their labels so the
# Streamlit app can offer them for download / upload.
test_df = X_test.assign(Exited=y_test.values)
test_df.to_csv("model/test_data.csv", index=False)

In [9]:
# ============================
#  METRICS HELPER
# ============================
def compute_metrics(y_true, y_pred, y_proba):
    """Score one model's predictions against the true labels.

    Parameters
    ----------
    y_true
        True labels.
    y_pred
        Hard class predictions; used for every metric except AUC.
    y_proba
        Scores handed to ``roc_auc_score``.

    Returns
    -------
    tuple
        ``(accuracy, auc, precision, recall, f1, mcc)``, in that order.
        Precision, recall and F1 are computed with ``zero_division=0``.
    """
    acc = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_proba)
    # The three scorers below share one call signature, so evaluate them in a
    # single pass (still in precision -> recall -> F1 order).
    prec, rec, f1 = (
        scorer(y_true, y_pred, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )
    mcc = matthews_corrcoef(y_true, y_pred)
    return acc, auc, prec, rec, f1, mcc


# Model name -> metrics tuple returned by compute_metrics.
# Re-running this cell starts again from an empty mapping.
metrics_summary = dict()

In [10]:
# ============================
# 1. LOGISTIC REGRESSION
# ============================
# Fit on the scaled training features; estimator.fit returns the estimator.
log_reg = LogisticRegression(solver="lbfgs", max_iter=1000).fit(
    X_train_scaled, y_train
)

# Hard predictions, plus column 1 of predict_proba as the score used for AUC.
y_pred_lr = log_reg.predict(X_test_scaled)
y_proba_lr = log_reg.predict_proba(X_test_scaled)[:, 1]

metrics_summary["Logistic Regression"] = compute_metrics(
    y_true=y_test, y_pred=y_pred_lr, y_proba=y_proba_lr
)

# Kept as the last expression so the written path is shown as the cell output.
joblib.dump(log_reg, "model/logistic_regression.pkl")

['model/logistic_regression.pkl']

In [11]:
# ============================
# 2. DECISION TREE
# ============================
# Trained on the unscaled features; the seed is fixed for reproducibility.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard predictions, plus column 1 of predict_proba as the score used for AUC.
y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)[:, 1]

metrics_summary["Decision Tree"] = compute_metrics(
    y_true=y_test, y_pred=y_pred_dt, y_proba=y_proba_dt
)

# Kept as the last expression so the written path is shown as the cell output.
joblib.dump(dt, "model/decision_tree.pkl")

['model/decision_tree.pkl']

In [12]:
# ============================
# 3. K-NEAREST NEIGHBOR
# ============================
# Fit on the scaled training features with k = 5 neighbours.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Hard predictions, plus column 1 of predict_proba as the score used for AUC.
y_pred_knn = knn.predict(X_test_scaled)
y_proba_knn = knn.predict_proba(X_test_scaled)[:, 1]

metrics_summary["KNN"] = compute_metrics(
    y_true=y_test, y_pred=y_pred_knn, y_proba=y_proba_knn
)

# Kept as the last expression so the written path is shown as the cell output.
joblib.dump(knn, "model/knn.pkl")


['model/knn.pkl']

In [13]:
# ============================
# 4. NAIVE BAYES (GAUSSIAN)
# ============================
# Gaussian naive Bayes with default settings, fitted on the scaled features.
nb = GaussianNB().fit(X_train_scaled, y_train)

# Hard predictions, plus column 1 of predict_proba as the score used for AUC.
y_pred_nb = nb.predict(X_test_scaled)
y_proba_nb = nb.predict_proba(X_test_scaled)[:, 1]

metrics_summary["Naive Bayes"] = compute_metrics(
    y_true=y_test, y_pred=y_pred_nb, y_proba=y_proba_nb
)

# Kept as the last expression so the written path is shown as the cell output.
joblib.dump(nb, "model/naive_bayes.pkl")

['model/naive_bayes.pkl']

In [28]:
# ============================
# 5. RANDOM FOREST (smaller model)
# ============================
# 50 trees (a deliberately small forest to shrink the .pkl size), no depth
# limit, fixed seed, n_jobs=-1. Trained on the unscaled features.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Hard predictions, plus column 1 of predict_proba as the score used for AUC.
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

metrics_summary["Random Forest"] = compute_metrics(
    y_true=y_test, y_pred=y_pred_rf, y_proba=y_proba_rf
)

# Kept as the last expression so the written path is shown as the cell output.
joblib.dump(rf, "model/random_forest.pkl")

['model/random_forest.pkl']

In [29]:
# ============================
# 6. XGBOOST
# ============================
# Hyper-parameters gathered in one mapping so they can be read at a glance.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 4,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "logloss",
    "n_jobs": -1,
}
xgb_model = XGBClassifier(**xgb_params)
# Trained on the unscaled features.
xgb_model.fit(X_train, y_train)

# Hard predictions, plus column 1 of predict_proba as the score used for AUC.
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

metrics_summary["XGBoost"] = compute_metrics(
    y_true=y_test, y_pred=y_pred_xgb, y_proba=y_proba_xgb
)

# Kept as the last expression so the written path is shown as the cell output.
joblib.dump(xgb_model, "model/xgboost.pkl")

['model/xgboost.pkl']

In [30]:
# ============================
#  PRINT METRICS PER MODEL
# ============================
def print_metrics_block(model_name, values):
    """Print a titled, six-line metrics report for one model.

    Parameters
    ----------
    model_name
        Text shown in the ``=== ... ===`` banner.
    values
        Iterable of exactly six numbers, ordered as
        (accuracy, AUC, precision, recall, F1, MCC).

    Each metric is printed on its own line with three decimal places.
    """
    acc, auc, prec, rec, f1, mcc = values
    # Labels are pre-padded to a common width so the colons line up.
    labelled_scores = (
        ("Accuracy ", acc),
        ("AUC      ", auc),
        ("Precision", prec),
        ("Recall   ", rec),
        ("F1 Score ", f1),
        ("MCC      ", mcc),
    )
    print(f"\n=== {model_name} ===")
    for label, score in labelled_scores:
        print(f"{label}: {score:.3f}")

# Report every model, in the order the results were added to metrics_summary.
for name, scores in metrics_summary.items():
    print_metrics_block(name, scores)


=== Logistic Regression ===
Accuracy : 0.805
AUC      : 0.771
Precision: 0.586
Recall   : 0.143
F1 Score : 0.229
MCC      : 0.217

=== Decision Tree ===
Accuracy : 0.776
AUC      : 0.665
Precision: 0.453
Recall   : 0.477
F1 Score : 0.465
MCC      : 0.324

=== KNN ===
Accuracy : 0.835
AUC      : 0.772
Precision: 0.662
Recall   : 0.386
F1 Score : 0.488
MCC      : 0.418

=== Naive Bayes ===
Accuracy : 0.829
AUC      : 0.815
Precision: 0.756
Recall   : 0.236
F1 Score : 0.360
MCC      : 0.357

=== Random Forest ===
Accuracy : 0.859
AUC      : 0.839
Precision: 0.755
Recall   : 0.455
F1 Score : 0.567
MCC      : 0.512

=== XGBoost ===
Accuracy : 0.867
AUC      : 0.859
Precision: 0.778
Recall   : 0.482
F1 Score : 0.595
MCC      : 0.542


In [31]:
# ============================
#  METRICS SUMMARY TABLE
# ============================
# One record per model; the six-way unpack mirrors the tuple layout stored in
# metrics_summary (accuracy, AUC, precision, recall, F1, MCC).
rows = [
    {
        "Model": model_name,
        "Accuracy": acc,
        "AUC": auc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "MCC": mcc,
    }
    for model_name, (acc, auc, prec, rec, f1, mcc) in metrics_summary.items()
]

# Index by model name and round to three decimals for display.
metrics_df = pd.DataFrame(rows).set_index("Model").round(3)

metrics_df

Unnamed: 0_level_0,Accuracy,AUC,Precision,Recall,F1,MCC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.805,0.771,0.586,0.143,0.229,0.217
Decision Tree,0.776,0.665,0.453,0.477,0.465,0.324
KNN,0.835,0.772,0.662,0.386,0.488,0.418
Naive Bayes,0.829,0.815,0.756,0.236,0.36,0.357
Random Forest,0.859,0.839,0.755,0.455,0.567,0.512
XGBoost,0.866,0.859,0.778,0.482,0.595,0.542


In [32]:
# ============================
#  CONFUSION MATRIX + REPORT (Random Forest)
# ============================
# Wrap the raw confusion matrix in a labelled frame for readable display.
cm = confusion_matrix(y_test, y_pred_rf)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

print("\n=== Confusion Matrix (Random Forest) ===")
display(cm_df)  # rendered as a rich table by Jupyter

print("\n=== Classification Report (Random Forest) ===")
rf_report = classification_report(y_test, y_pred_rf, zero_division=0)
print(rf_report)


=== Confusion Matrix (Random Forest) ===


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1533,60
Actual 1,222,185



=== Classification Report (Random Forest) ===
              precision    recall  f1-score   support

           0       0.87      0.96      0.92      1593
           1       0.76      0.45      0.57       407

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.74      2000
weighted avg       0.85      0.86      0.84      2000



In [27]:
import os

# Sanity check of what is on disk:
#   "."      -> the listing should include the 'model' folder
#   "model"  -> the listing should include 'test_data.csv'
for folder in (".", "model"):
    print(os.listdir(folder))

['.ipynb_checkpoints', 'Churn_Modelling.csv', 'model', 'Untitled.ipynb']
['app.py', 'decision_tree.pkl', 'knn.pkl', 'label_encoder_gender.pkl', 'label_encoder_geo.pkl', 'logistic_regression.pkl', 'naive_bayes.pkl', 'random_forest.pkl', 'scaler.pkl', 'test_data.csv.csv', 'xgboost.pkl']
