In [1]:
# Import required libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score,precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef
)
from sklearn.metrics import precision_recall_curve

import os
import joblib as jb
import warnings
warnings.filterwarnings('ignore')

print('✓ Libraries imported successfully')

✓ Libraries imported successfully


In [2]:
# Load your dataset

data = pd.read_csv('data/bank-additional-full.csv', sep=';')

# Dataset information
dataset_name = "Bank Marketing"  # e.g., "Breast Cancer Wisconsin"
dataset_source = "UCI ML Repository"  # e.g., "UCI ML Repository"
n_samples = 41188      # Total number of rows
n_features = 20     # Number of features (excluding target)
problem_type = "binary_classification"  # "regression" or "binary_classification" or "multiclass_classification"

print(f"Dataset: {dataset_name}")
print(f"Source: {dataset_source}")
print(f"Samples: {n_samples}, Features: {n_features}")
print(f"Problem Type: {problem_type}")

Dataset: Bank Marketing
Source: UCI ML Repository
Samples: 41188, Features: 20
Problem Type: binary_classification


In [3]:
# Preprocess your data
# One-hot encode categorical_cols, drop originals

categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 
                    'contact','poutcome']
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=False)

# One-hot encode 'month' and 'day_of_week' with drop_first=True
data_encoded = pd.get_dummies(data_encoded, columns=['month', 'day_of_week'], drop_first=True)

# Map 'yes'/'no' to 1/0 in target variable 'y'
y_mapping = {'yes': 1,'no': 0 }
data_encoded['y'] = data_encoded['y'].map(y_mapping)

In [4]:
#move target variable y to the end of the dataframe
target = data_encoded.pop('y')
data_encoded['y'] = target

In [5]:
#count of class 0 and class 1 data_encoded['y']
print("Overall class distribution:")
print(data_encoded['y'].value_counts())

Overall class distribution:
y
0    36548
1     4640
Name: count, dtype: int64


In [6]:
num_cols = ['age', 'duration', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx','cons.conf.idx','euribor3m','nr.employed']

In [7]:
data_encoded.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,y
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,False,True,False,False,False,True,False,False,False,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,False,True,False,False,False,True,False,False,False,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,False,True,False,False,False,True,False,False,False,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,False,True,False,False,False,True,False,False,False,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,False,True,False,False,False,True,False,False,False,0


In [8]:
# Separate features and target
X = data_encoded.drop('y', axis=1)
y = data_encoded['y']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [10]:
#definition for calculating evaluation metrics
def calc_metrics(y_true, y_pred,y_predproba=None):
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    if y_predproba is not None:
        auc_roc = roc_auc_score(y_true, y_predproba)
    else:
        auc_roc = None

    
    return {
        "Confusion Matrix": cm,
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "MCC": mcc
    }

Train all 6 Models

In [11]:
random_state = 42

In [12]:
# Build Logistic Regression Model
log_model = LogisticRegression(max_iter=1000,class_weight='balanced',solver='liblinear', random_state=random_state)
log_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [13]:
# Hyperparameter tuning for Decision Tree
min_samples_leaf = 100
dt_max_depth = 5

# Build Decision Tree Model
dt_model = DecisionTreeClassifier(max_depth=dt_max_depth,min_samples_leaf=min_samples_leaf,class_weight='balanced',random_state=random_state)
dt_model.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,100
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [14]:
#hyperparameter tuning for kNN
n_neighbors = 10
weights = "distance"      # VERY IMPORTANT for probability quality

#Build kNN Model
knn_model = KNeighborsClassifier( n_neighbors=n_neighbors,weights=weights, metric="minkowski",p=2)
knn_model.fit(X_train, y_train)

0,1,2
,n_neighbors,10
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [15]:
#Build Naive Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [16]:
#Tune hyperparameters for Random Forest Classifier
n_estimators = 300
rf_max_depth = 15
min_samples_leaf = 50

#Build Random Forest Model
rf_model = RandomForestClassifier(n_estimators=n_estimators, class_weight="balanced", 
                                  max_depth=rf_max_depth,min_samples_leaf=min_samples_leaf,random_state=random_state)
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,50
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
#Tune hyperparameters for XGBoost Classifier
n_estimators = 300
xgb_max_depth = 5
learning_rate = 0.05
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

#Build Random Forest Model
xgb_model = XGBClassifier(
    n_estimators=n_estimators,    max_depth=xgb_max_depth,    scale_pos_weight=scale_pos_weight,    learning_rate=learning_rate,    eval_metric='logloss',
    objective='binary:logistic',    random_state=random_state)
xgb_model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


Predicting the test data and calculate the all 6 metrics

In [18]:
# Predict probabilities and apply threshold
y_predlogproba = log_model.predict_proba(X_test)[:, 1]
y_predlog      = (y_predlogproba >= 0.5).astype(int)

# Call calc_metrics function to calculate evaluation metrics
metricslog = calc_metrics(y_test, y_predlog,y_predlogproba)
print(f"Logistic Regression Model Evaluation Metrics:")
for metriclog, value in metricslog.items():
    print(f"{metriclog}: {value}")

Logistic Regression Model Evaluation Metrics:
Confusion Matrix: [[6283 1027]
 [  82  846]]
Accuracy: 0.8653799465889779
AUC-ROC: 0.9438200209915563
Precision: 0.45168179391350777
Recall: 0.9116379310344828
F1 Score: 0.6040699750089253
MCC: 0.5817024422460709


In [19]:
# predict probabilities for Decision Tree
y_preddtprob = dt_model.predict_proba(X_test)[:, 1]
y_preddt     = (y_preddtprob >= 0.5).astype(int)

# Call calc_metrics function to calculate evaluation metrics
metricsdt = calc_metrics(y_test, y_preddt,y_preddtprob)
print(f"Evaluation Metrics with Optimized Decision Tree: MaxDepth as {dt_max_depth}, MinSamplesLeaf as {min_samples_leaf}")
for metricdt, value in metricsdt.items():   
    print(f"{metricdt}: {value}")

Evaluation Metrics with Optimized Decision Tree: MaxDepth as 5, MinSamplesLeaf as 50
Confusion Matrix: [[6107 1203]
 [  84  844]]
Accuracy: 0.8437727603787327
AUC-ROC: 0.9400506804566253
Precision: 0.4123106985832926
Recall: 0.9094827586206896
F1 Score: 0.5673949579831933
MCC: 0.5450035114064352


In [20]:
#predict probabilities with threshold tuning
y_predknnproba = knn_model.predict_proba(X_test)[:, 1]
y_predknn = (y_predknnproba >= 0.5).astype(int)

#Call calc_metrics function to calculate evaluation metrics
metricsknn = calc_metrics(y_test, y_predknn,y_predknnproba)
print(f"Evaluation Metrics with Optimized kNN: n_neighbors as {n_neighbors}, weights as {weights}")
for metricknn, value in metricsknn.items():   
    print(f"{metricknn}: {value}")

Evaluation Metrics with Optimized kNN: n_neighbors as 10, weights as distance
Confusion Matrix: [[7108  202]
 [ 535  393]]
Accuracy: 0.910536537994659
AUC-ROC: 0.9078061612576065
Precision: 0.6605042016806723
Recall: 0.4234913793103448
F1 Score: 0.5160866710439921
MCC: 0.48348353687288675


In [21]:
#predict probabilities with threshold tuning
y_prednbproba = nb_model.predict_proba(X_test)[:, 1]
y_prednb = (y_prednbproba >= 0.5).astype(int)

#Call calc_metrics function to calculate evaluation metrics
metricsnb = calc_metrics(y_test, y_prednb,y_prednbproba)
print(f"Evaluation Metrics of Naive Bayes Model:")
for metricnb, value in metricsnb.items():   
    print(f"{metricnb}: {value}")

Evaluation Metrics of Naive Bayes Model:
Confusion Matrix: [[6226 1084]
 [ 319  609]]
Accuracy: 0.829691672736101
AUC-ROC: 0.8406065734232747
Precision: 0.35971647962197284
Recall: 0.65625
F1 Score: 0.46470812666921024
MCC: 0.39744668126546473


In [22]:
#predict probabilities for Random Forest
y_predrfprob = rf_model.predict_proba(X_test)[:, 1]
y_predrf     = (y_predrfprob >= 0.5).astype(int)

#Call calc_metrics function to calculate evaluation metrics
metricsrf = calc_metrics(y_test, y_predrf,y_predrfprob)
print(f"Evaluation Metrics of Random Forest with max_depth={rf_max_depth}, min_samples_leaf={min_samples_leaf}, n_estimators={n_estimators}")
for metricrf, value in metricsrf.items():   
    print(f"{metricrf}: {value}")

Evaluation Metrics of Random Forest with max_depth=15, min_samples_leaf=50, n_estimators=300
Confusion Matrix: [[6038 1272]
 [  40  888]]
Accuracy: 0.8407380432143724
AUC-ROC: 0.944804000188688
Precision: 0.4111111111111111
Recall: 0.9568965517241379
F1 Score: 0.5751295336787565
MCC: 0.5627626683199373


In [23]:
#predict probabilities with threshold tuning
y_probxgb = xgb_model.predict_proba(X_test)[:, 1]
y_predxgb = (y_probxgb >= 0.5).astype(int)

#Call calc_metrics function to calculate evaluation metrics
metricsxgb = calc_metrics(y_test, y_predxgb,y_probxgb)
print(f"Evaluation Metrics of XGBoost with max_depth={xgb_max_depth},  n_estimators={n_estimators} with learning_rate={learning_rate} ")
for metricxgb, value in metricsxgb.items():   
    print(f"{metricxgb}: {value}")

Evaluation Metrics of XGBoost with max_depth=5,  n_estimators=300 with learning_rate=0.05 
Confusion Matrix: [[6307 1003]
 [  61  867]]
Accuracy: 0.8708424374848264
AUC-ROC: 0.9536341631680739
Precision: 0.4636363636363636
Recall: 0.9342672413793104
F1 Score: 0.6197283774124375
MCC: 0.6015894865421084


Saving the Model for prediction

In [24]:
# For model saving
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)

In [25]:
# store the logistic model as pkl using joblib
jb.dump(log_model, os.path.join(model_dir, 'logistic_model.pkl'))
print("Logistic model stored successfully as .pkl files using joblib.")

# store the decision tree model as pkl using joblib
jb.dump(dt_model, os.path.join(model_dir, 'decision_tree_model.pkl'))
print("Decision tree model stored successfully as .pkl files using joblib.")

# store the kNN model as pkl using joblib
jb.dump(knn_model, os.path.join(model_dir, 'knn_model.pkl'))    
print("kNN model stored successfully as .pkl files using joblib.")

# store the Naive Bayes model as pkl using joblib
jb.dump(nb_model, os.path.join(model_dir, 'nb_model.pkl'))
print("Naive Bayes model stored successfully as .pkl files using joblib.")

# store the Random Forest model as pkl using joblib
jb.dump(rf_model, os.path.join(model_dir, 'rf_model.pkl'))  
print("Random Forest model stored successfully as .pkl files using joblib.")

# store the XGBoost model as pkl using joblib
jb.dump(xgb_model, os.path.join(model_dir, 'xgboost_model.pkl'))
print("XGBoost model stored successfully as .pkl files using joblib.")


Logistic model stored successfully as .pkl files using joblib.
Decision tree model stored successfully as .pkl files using joblib.
kNN model stored successfully as .pkl files using joblib.
Naive Bayes model stored successfully as .pkl files using joblib.
Random Forest model stored successfully as .pkl files using joblib.
XGBoost model stored successfully as .pkl files using joblib.
