**IMPORT LIBRARIES + CREATE MODEL FOLDER**

In [2]:
import numpy as np
import os

# Create folder to save models
os.makedirs('model', exist_ok=True)

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import xgboost as xgb
import joblib

**LOAD DATASET**

In [4]:
data = pd.read_csv('bank.csv', sep=',')

# Check data
print(data.head())
print(data.shape)
print(data['deposit'].value_counts())

   age         job  marital  education default  balance housing loan  contact  \
0   59      admin.  married  secondary      no     2343     yes   no  unknown   
1   56      admin.  married  secondary      no       45      no   no  unknown   
2   41  technician  married  secondary      no     1270     yes   no  unknown   
3   55    services  married  secondary      no     2476     yes   no  unknown   
4   54      admin.  married   tertiary      no      184      no   no  unknown   

   day month  duration  campaign  pdays  previous poutcome deposit  
0    5   may      1042         1     -1         0  unknown     yes  
1    5   may      1467         1     -1         0  unknown     yes  
2    5   may      1389         1     -1         0  unknown     yes  
3    5   may       579         1     -1         0  unknown     yes  
4    5   may       673         2     -1         0  unknown     yes  
(11162, 17)
deposit
no     5873
yes    5289
Name: count, dtype: int64


**ENCODE CATEGORICAL FEATURES**

In [6]:
# Encode categorical columns
cat_cols = [
    'job', 'marital', 'education', 'default',
    'housing', 'loan', 'contact', 'month', 'poutcome'
]

label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

**ENCODE TARGET VARIABLE**

In [8]:
data['deposit'] = data['deposit'].map({'yes': 1, 'no': 0})

**SPLIT FEATURES & TARGET**

In [10]:
X = data.drop('deposit', axis=1)
y = data['deposit']

**TRAINâ€“TEST SPLIT**

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

**FEATURE SCALING**

In [14]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (8929, 16)
Test shape: (2233, 16)


**SAVE TEST DATA FOR STREAMLIT**

In [16]:
test_data = pd.concat(
    [
        pd.DataFrame(X_test, columns=X.columns),
        pd.Series(y_test.values, name='deposit')
    ],
    axis=1
)

test_data.to_csv('test_data.csv', index=False)

***MODEL TRAINING***

**MODEL 1: LOGISTIC REGRESSION**

In [18]:
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

acc_lr = accuracy_score(y_test, y_pred_lr)
auc_lr = roc_auc_score(y_test, y_pred_lr)
prec_lr = precision_score(y_test, y_pred_lr)
rec_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
mcc_lr = matthews_corrcoef(y_test, y_pred_lr)

print(f"Logistic: Acc {acc_lr:.4f}, AUC {auc_lr:.4f}, Prec {prec_lr:.4f}, Rec {rec_lr:.4f}, F1 {f1_lr:.4f}, MCC {mcc_lr:.4f}")
joblib.dump(model_lr, 'model/model_lr.pkl')

Logistic: Acc 0.7900, AUC 0.7886, Prec 0.7931, Rec 0.7582, F1 0.7753, MCC 0.5788


['model/model_lr.pkl']

**MODEL 2: DECISION TREE**

In [20]:
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)

y_pred_dt = model_dt.predict(X_test)

acc_dt = accuracy_score(y_test, y_pred_dt)
auc_dt = roc_auc_score(y_test, y_pred_dt)
prec_dt = precision_score(y_test, y_pred_dt)
rec_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
mcc_dt = matthews_corrcoef(y_test, y_pred_dt)

print(f"Decision Tree: Acc {acc_dt:.4f}, AUC {auc_dt:.4f}, "
      f"Prec {prec_dt:.4f}, Rec {rec_dt:.4f}, F1 {f1_dt:.4f}, MCC {mcc_dt:.4f}")

joblib.dump(model_dt, 'model/model_dt.pkl')

Decision Tree: Acc 0.7631, AUC 0.7621, Prec 0.7582, Rec 0.7404, F1 0.7492, MCC 0.5249


['model/model_dt.pkl']

**MODEL 3: KNN**

In [22]:
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)

y_pred_knn = model_knn.predict(X_test)

acc_knn = accuracy_score(y_test, y_pred_knn)
auc_knn = roc_auc_score(y_test, y_pred_knn)
prec_knn = precision_score(y_test, y_pred_knn)
rec_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
mcc_knn = matthews_corrcoef(y_test, y_pred_knn)

print(f"KNN: Acc {acc_knn:.4f}, AUC {auc_knn:.4f}, "
      f"Prec {prec_knn:.4f}, Rec {rec_knn:.4f}, F1 {f1_knn:.4f}, MCC {mcc_knn:.4f}")

joblib.dump(model_knn, 'model/model_knn.pkl')

KNN: Acc 0.7734, AUC 0.7711, Prec 0.7877, Rec 0.7198, F1 0.7522, MCC 0.5461


['model/model_knn.pkl']

**MODEL 4: NAIVE BAYES**

In [24]:
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)

y_pred_nb = model_nb.predict(X_test)

acc_nb = accuracy_score(y_test, y_pred_nb)
auc_nb = roc_auc_score(y_test, y_pred_nb)
prec_nb = precision_score(y_test, y_pred_nb)
rec_nb = recall_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb)
mcc_nb = matthews_corrcoef(y_test, y_pred_nb)

print(f"Naive Bayes: Acc {acc_nb:.4f}, AUC {auc_nb:.4f}, "
      f"Prec {prec_nb:.4f}, Rec {rec_nb:.4f}, F1 {f1_nb:.4f}, MCC {mcc_nb:.4f}")

joblib.dump(model_nb, 'model/model_nb.pkl')

Naive Bayes: Acc 0.7465, AUC 0.7492, Prec 0.7042, Rec 0.8097, F1 0.7533, MCC 0.5004


['model/model_nb.pkl']

**MODEL 5: RANDOM FOREST**

In [26]:
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

y_pred_rf = model_rf.predict(X_test)

acc_rf = accuracy_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_pred_rf)
prec_rf = precision_score(y_test, y_pred_rf)
rec_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
mcc_rf = matthews_corrcoef(y_test, y_pred_rf)

print(f"Random Forest: Acc {acc_rf:.4f}, AUC {auc_rf:.4f}, "
      f"Prec {prec_rf:.4f}, Rec {rec_rf:.4f}, F1 {f1_rf:.4f}, MCC {mcc_rf:.4f}")

joblib.dump(model_rf, 'model/model_rf.pkl')

Random Forest: Acc 0.8334, AUC 0.8343, Prec 0.8083, Rec 0.8538, F1 0.8304, MCC 0.6679


['model/model_rf.pkl']

**MODEL 6: XGBOOST**

In [28]:
model_xgb = xgb.XGBClassifier(
    eval_metric='logloss',
    random_state=42
)

model_xgb.fit(X_train, y_train)

y_pred_xgb = model_xgb.predict(X_test)

acc_xgb = accuracy_score(y_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_test, y_pred_xgb)
prec_xgb = precision_score(y_test, y_pred_xgb)
rec_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
mcc_xgb = matthews_corrcoef(y_test, y_pred_xgb)

print(f"XGBoost: Acc {acc_xgb:.4f}, AUC {auc_xgb:.4f}, "
      f"Prec {prec_xgb:.4f}, Rec {rec_xgb:.4f}, F1 {f1_xgb:.4f}, MCC {mcc_xgb:.4f}")

joblib.dump(model_xgb, 'model/model_xgb.pkl')

XGBoost: Acc 0.8424, AUC 0.8432, Prec 0.8178, Rec 0.8622, F1 0.8394, MCC 0.6858


['model/model_xgb.pkl']