In [None]:
!pip install xgboost




In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, matthews_corrcoef,
    confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import joblib


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install gdown




In [None]:
import gdown

file_id = "1quallSe4N5wrZNXVPsC9kYbBrwmkG4uF"
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "bank-full.csv", quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1quallSe4N5wrZNXVPsC9kYbBrwmkG4uF
To: /content/bank-full.csv
100%|██████████| 4.61M/4.61M [00:00<00:00, 132MB/s]


'bank-full.csv'

In [None]:
import pandas as pd

df = pd.read_csv("bank-full.csv", sep=';')

print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
df.columns = df.columns.str.strip()


In [None]:
df["y"].value_counts()


Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
no,39922
yes,5289


In [None]:
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].replace("unknown", df[col].mode()[0])


In [None]:
df["y"] = df["y"].map({"yes": 1, "no": 0})


In [None]:
df = pd.get_dummies(df, drop_first=True)


In [None]:
print(df.shape)


(45211, 40)


In [None]:
print("Target exists:", "y" in df.columns)


Target exists: True


In [None]:
X = df.drop("y", axis=1)
y = df["y"]


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (36168, 39)
Test shape: (9043, 39)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, matthews_corrcoef
)



models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=50, max_depth=10),
    "XGBoost": XGBClassifier(n_estimators=50, max_depth=5, eval_metric='logloss')
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,Logistic Regression,0.898706,0.650742,0.629091,0.327032,0.430348,0.405446
1,Decision Tree,0.8976,0.666515,0.603125,0.364839,0.454653,0.417403
2,KNN,0.893841,0.643887,0.585366,0.31758,0.411765,0.379368
3,Naive Bayes,0.865974,0.696163,0.433506,0.47448,0.453069,0.377374
4,Random Forest,0.899701,0.602109,0.750831,0.213611,0.332597,0.36592
5,XGBoost,0.906779,0.70779,0.646658,0.448015,0.529313,0.48942


In [None]:
import os
import joblib

if not os.path.exists("model"):
    os.makedirs("model")

for name, model in models.items():
    filename = name.replace(" ", "_") + ".pkl"
    joblib.dump(model, f"model/{filename}")

joblib.dump(scaler, "model/scaler.pkl")

print("Models saved successfully.")





Models saved successfully.


In [None]:
!ls model


Decision_Tree.pkl  Logistic_Regression.pkl  Random_Forest.pkl  XGBoost.pkl
KNN.pkl		   Naive_Bayes.pkl	    scaler.pkl


In [None]:
!zip -r model.zip model


updating: model/ (stored 0%)
updating: model/Logistic_Regression.pkl (deflated 23%)
updating: model/Naive_Bayes.pkl (deflated 13%)
updating: model/KNN.pkl (deflated 91%)
updating: model/XGBoost.pkl (deflated 70%)
updating: model/scaler.pkl (deflated 24%)
updating: model/Random_Forest.pkl (deflated 74%)
updating: model/Decision_Tree.pkl (deflated 76%)


In [None]:
from google.colab import files
files.download("model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>