In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


In [4]:
# Load data
#
# The CSV location can be overridden with the BANK_CSV environment variable so
# the notebook is not tied to a single machine; the original local path is
# kept as the fallback so existing runs behave as before.
DATA_PATH = os.environ.get(
    "BANK_CSV",
    r"C:\Users\ryj81\OneDrive\Desktop\AI course\practical_application_lll\bank.csv",
)
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}; set the BANK_CSV environment "
        "variable to the location of bank.csv."
    )

# The file is semicolon-delimited rather than comma-delimited.
df = pd.read_csv(DATA_PATH, sep=";")
print("Shape:", df.shape)
print(df.head())

Shape: (4521, 17)
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  


In [5]:
# Cleaning and defining variables
#
# Encode the target as 0/1 ("yes" -> 1). The dtype guard keeps this cell
# idempotent: once "y" is numeric, calling `.str` on it again would raise
# AttributeError, so a second run must skip the conversion.
if not pd.api.types.is_numeric_dtype(df["y"]):
    labels = df["y"].str.strip().str.lower()
    # Fail loudly instead of silently mapping typos / missing values to 0.
    unexpected = sorted(set(labels.dropna().unique()) - {"yes", "no"})
    if unexpected or labels.isna().any():
        raise ValueError(
            f"Unexpected target labels in 'y': {unexpected or 'missing values'}"
        )
    df["y"] = (labels == "yes").astype(int)

# Features are every column except the target.
X = df.drop(columns=["y"])
y = df["y"]

# Mean of a 0/1 target is the share of positive rows.
print("\nTarget rate (y=1):", y.mean())



Target rate (y=1): 0.11523999115239991


In [6]:
# Train/test split: 20% of the rows are held out for testing. The split is
# seeded for repeatability and stratified on y so both parts keep the same
# share of positive rows.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [7]:
# Split the features into numeric and categorical groups and build the
# preprocessing step (standard scaling + one-hot encoding).
#
# Columns are partitioned (numeric vs. everything else) rather than matched
# against exact dtype names: ColumnTransformer drops any column that is not
# listed in a transformer, so a column stored as e.g. int32, category, bool or
# pandas' string dtype would otherwise silently disappear from the model.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = [col for col in X.columns if col not in numeric_features]

numeric_transformer = Pipeline([
     ("scale", StandardScaler())])

# handle_unknown="ignore": a category not seen during fit does not raise at
# transform time.
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)])


In [8]:
# Models + small hyperparameter grids
#
# Maps a display name to (estimator, param_grid). Grid keys carry the
# "model__" prefix because each estimator is searched as the "model" step of
# a Pipeline.
models = {
    "KNN": (KNeighborsClassifier(), {
        "model__n_neighbors": [3,5,11,25],
        "model__weights": ["uniform","distance"]
    }),
    "LogReg": (LogisticRegression(max_iter=2000), {
        "model__C": [0.1, 1, 10]
    }),
    "DecisionTree": (DecisionTreeClassifier(random_state=42), {
        "model__max_depth": [3,5,10,None],
        "model__min_samples_split": [2,10,50]
    }),
    # probability=True is required for predict_proba; the probability
    # estimates involve random shuffling, so the seed is fixed to keep the
    # reported ROC AUC reproducible across runs.
    "SVM": (SVC(probability=True, random_state=42), {
        "model__C": [0.5, 1, 5],
        "model__kernel": ["rbf", "linear"],
        "model__gamma": ["scale"]
    })
}

# Collects one metrics dict per fitted model.
results = []



In [9]:
# Evaluate a model's performance on the held-out test set


def evaluate(name, best_estimator, X_eval=None, y_eval=None):
    """Score a fitted binary classifier and return its metrics and predictions.

    Parameters
    ----------
    name : str
        Label stored under the "model" key of the returned dict.
    best_estimator : fitted estimator
        Must provide ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_eval, y_eval : optional
        Features and true labels to score on. When omitted, the notebook-level
        ``X_test`` / ``y_test`` are used, which matches the original
        two-argument call.

    Returns
    -------
    (dict, array-like)
        The dict holds "model", "accuracy", "precision", "recall", "f1" and
        "roc_auc"; the second element is the predicted labels.
    """
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    y_pred = best_estimator.predict(X_eval)

    # ROC AUC needs a continuous score. Prefer the positive-class probability;
    # fall back to decision_function for estimators without predict_proba.
    if hasattr(best_estimator, "predict_proba"):
        y_score = best_estimator.predict_proba(X_eval)[:, 1]
    else:
        y_score = best_estimator.decision_function(X_eval)

    out = {
        "model": name,
        "accuracy": accuracy_score(y_eval, y_pred),
        # zero_division=0: report 0 instead of warning when a class is never
        # predicted.
        "precision": precision_score(y_eval, y_pred, zero_division=0),
        "recall": recall_score(y_eval, y_pred, zero_division=0),
        "f1": f1_score(y_eval, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_eval, y_score)
    }
    return out, y_pred

In [10]:
# Finding the best model: grid-search each candidate with 5-fold CV on the
# training split, then score the refit best estimator on the test split.

# Start from an empty list so re-running this cell does not append a second
# copy of every model to the summary table.
results = []

for name, (clf, grid) in models.items():
    # Preprocessing lives inside the pipeline, so it is fitted on the training
    # folds only during cross-validation.
    pipe = Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", clf)
    ])

    # Hyperparameters are selected by cross-validated ROC AUC.
    search = GridSearchCV(
        pipe, grid, cv=5, scoring="roc_auc", n_jobs=-1
    )
    search.fit(X_train, y_train)

    metrics_dict, y_pred = evaluate(name, search.best_estimator_)
    metrics_dict["best_params"] = search.best_params_
    results.append(metrics_dict)

    print(f"\n===== {name} =====")
    print("Best params:", search.best_params_)
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, zero_division=0))

# One row per model, best test-set ROC AUC first.
results_df = pd.DataFrame(results).sort_values("roc_auc", ascending=False)
results_df


===== KNN =====
Best params: {'model__n_neighbors': 25, 'model__weights': 'distance'}
Confusion matrix:
 [[793   8]
 [ 92  12]]
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       801
           1       0.60      0.12      0.19       104

    accuracy                           0.89       905
   macro avg       0.75      0.55      0.57       905
weighted avg       0.86      0.89      0.85       905


===== LogReg =====
Best params: {'model__C': 0.1}
Confusion matrix:
 [[782  19]
 [ 79  25]]
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       801
           1       0.57      0.24      0.34       104

    accuracy                           0.89       905
   macro avg       0.74      0.61      0.64       905
weighted avg       0.87      0.89      0.87       905


===== DecisionTree =====
Best params: {'model__max_depth': 10, 'model__min_samples_split': 50}
Confusion matrix:
 [[776  25

Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc,best_params
3,SVM,0.891713,0.583333,0.201923,0.3,0.892898,"{'model__C': 0.5, 'model__gamma': 'scale', 'mo..."
1,LogReg,0.891713,0.568182,0.240385,0.337838,0.890173,{'model__C': 0.1}
0,KNN,0.889503,0.6,0.115385,0.193548,0.858572,"{'model__n_neighbors': 25, 'model__weights': '..."
2,DecisionTree,0.893923,0.568966,0.317308,0.407407,0.857414,"{'model__max_depth': 10, 'model__min_samples_s..."
