In [36]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [37]:
# Read the semicolon-separated CSV into a DataFrame.
df = pd.read_csv("../data/bank-additional-full.csv", sep=";")

# Sanity check: overall dimensions, then the class counts of the target column "y".
print("Shape:", df.shape)
print("\nTarget distribution:", df["y"].value_counts(), sep="\n")


Shape: (41188, 21)

Target distribution:
y
no     36548
yes     4640
Name: count, dtype: int64


In [38]:
# Encode target variable: yes -> 1, no -> 0.
# The mapping is applied only while the column is still non-numeric, so
# re-running this cell is a no-op; mapping the already-encoded 0/1 values a
# second time would replace every label with NaN.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df["y"] = df["y"].map({"yes": 1, "no": 0})

# Any label other than "yes"/"no" is mapped to NaN; fail here rather than downstream.
assert df["y"].notna().all(), "Unexpected label in target column 'y'"


In [39]:
# Target vector: the "y" column. Feature matrix: every other column.
# NOTE(review): `duration` stays in the feature set. It is presumably only known
# once a call has finished, which would make it unavailable at prediction time --
# TODO confirm against the dataset documentation before using these scores.
y = df["y"]
X = df.drop(columns="y")


In [40]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_features = list(X.select_dtypes(include="object").columns)
numerical_features = list(X.select_dtypes(include=["int64", "float64"]).columns)

print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)


Categorical Features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
Numerical Features: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']


In [41]:

# Preprocessing pipelines for both numeric and categorical data.
# Decisions applied here:
#   - Standard scaling for numerical features.
#   - One-hot encoding for categorical features.
#   - The literal value "unknown" gets no special handling, i.e. it is encoded
#     like any other category.
#   - drop="first" to avoid multicollinearity between the indicator columns.

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop="first")

# Each entry is (step name, transformer, columns the transformer is applied to).
column_steps = [
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
]

preprocessor = ColumnTransformer(transformers=column_steps)


In [42]:
# Hold out 20% of the rows as the test set. The split is stratified on y and
# seeded (random_state=42) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (32950, 20)
Test shape: (8238, 20)


In [43]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformation to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

for label, matrix in (
    ("Processed train shape:", X_train_processed),
    ("Processed test shape:", X_test_processed),
):
    print(label, matrix.shape)


Processed train shape: (32950, 53)
Processed test shape: (8238, 53)


Preprocessing is complete. The goal of the steps above was to produce the following objects:

X_train_processed

X_test_processed

y_train

y_test

preprocessor (to be reused in Streamlit)


Next: model implementation.

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import joblib
import os


In [45]:
# Candidate classifiers to compare, keyed by their display name.
# random_state is fixed on every estimator that is given one here so that
# repeated runs use the same seed.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "kNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # use_label_encoder is deliberately not passed: the installed XGBoost does
    # not use it and emits a 'Parameters: { "use_label_encoder" } are not used.'
    # warning during fit when it is given.
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
}


In [46]:
# Fit every candidate on the preprocessed training data.
# Estimators are fitted in place; trained_models collects them under the same
# display names used in `models`.
trained_models = {}

for name in models:
    print(f"Training {name}...")
    model = models[name]
    model.fit(X_train_processed, y_train)
    trained_models[name] = model

print("All models trained successfully.")


Training Logistic Regression...
Training Decision Tree...
Training kNN...
Training Naive Bayes...
Training Random Forest...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



All models trained successfully.


In [47]:
# Save the fitted models, plus the fitted preprocessor, under MODEL_DIR.
MODEL_DIR = "model"
os.makedirs(MODEL_DIR, exist_ok=True)

for name, model in trained_models.items():
    # "Logistic Regression" -> "logistic_regression.pkl"
    filename = name.lower().replace(" ", "_") + ".pkl"
    joblib.dump(model, f"{MODEL_DIR}/{filename}")

# The models were fitted on the preprocessor's output, so raw rows must go
# through this same fitted transformer before they can be passed to a loaded
# model. Persist it alongside the models.
joblib.dump(preprocessor, f"{MODEL_DIR}/preprocessor.pkl")

print("All models saved successfully.")


All models saved successfully.


The models are now trained and saved.
Next, we compute the evaluation metrics for each model.

In [48]:
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

import pandas as pd


In [49]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``. For the AUC, ``predict_proba``
        is used when available, otherwise ``decision_function``.
    X_test : array-like
        Features, already in the form the model was trained on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, auc, precision, recall, f1, mcc)``. ``auc`` is ``None``
        when the model offers neither ``predict_proba`` nor
        ``decision_function``.
    """
    y_pred = model.predict(X_test)

    # AUC is computed from a continuous score rather than hard labels.
    if hasattr(model, "predict_proba"):
        # Column 1 is taken as the score of the positive class
        # (assumes a binary problem with labels ordered [0, 1]).
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # Margin-based estimators without probabilities: roc_auc_score also
        # accepts non-thresholded decision values.
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score) if y_score is not None else None
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    return accuracy, auc, precision, recall, f1, mcc


In [50]:
# One row per trained model: [name, accuracy, auc, precision, recall, f1, mcc],
# all computed on the preprocessed test split.
results = []

for name, model in trained_models.items():
    accuracy, auc, precision, recall, f1, mcc = evaluate_model(
        model, X_test_processed, y_test
    )
    results.append([name, accuracy, auc, precision, recall, f1, mcc])


In [51]:
# Column labels, in the same order as the entries of each row in `results`.
metric_columns = [
    "ML Model Name",
    "Accuracy",
    "AUC",
    "Precision",
    "Recall",
    "F1 Score",
    "MCC",
]

comparison_df = pd.DataFrame(results, columns=metric_columns)

print(comparison_df)


         ML Model Name  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.916242  0.942476   0.709507  0.434267  0.538770   
1        Decision Tree  0.894513  0.741114   0.531085  0.543103  0.537027   
2                  kNN  0.903496  0.876845   0.598227  0.436422  0.504673   
3          Naive Bayes  0.844016  0.849316   0.383865  0.635776  0.478702   
4        Random Forest  0.915878  0.947358   0.674074  0.490302  0.567686   
5              XGBoost  0.919277  0.950366   0.662546  0.577586  0.617156   

        MCC  
0  0.513732  
1  0.477549  
2  0.459572  
3  0.410839  
4  0.530501  
5  0.573958  


In [52]:
# Print the test-set confusion matrix of every trained model
# (confusion_matrix is called with the true labels first, predictions second).
for name in trained_models:
    model = trained_models[name]
    y_pred = model.predict(X_test_processed)
    cm = confusion_matrix(y_test, y_pred)

    print(f"\nConfusion Matrix for {name}:", cm, sep="\n")



Confusion Matrix for Logistic Regression:
[[7145  165]
 [ 525  403]]

Confusion Matrix for Decision Tree:
[[6865  445]
 [ 424  504]]

Confusion Matrix for kNN:
[[7038  272]
 [ 523  405]]

Confusion Matrix for Naive Bayes:
[[6363  947]
 [ 338  590]]

Confusion Matrix for Random Forest:
[[7090  220]
 [ 473  455]]

Confusion Matrix for XGBoost:
[[7037  273]
 [ 392  536]]
