In [1]:
# Import required libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score,precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef
)
from sklearn.metrics import precision_recall_curve

import os
import joblib as jb
import warnings
warnings.filterwarnings('ignore')

print('✓ Libraries imported successfully')

✓ Libraries imported successfully


In [2]:
# Load your dataset with error checking
import os

# Check if data file exists
data_path = 'data/bank-additional-full.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Data file not found at {data_path}. Please ensure the file exists.")

data = pd.read_csv(data_path, sep=';')

# Dataset information
dataset_name = "Bank Marketing"
dataset_source = "UCI ML Repository"
n_samples = 41188
n_features = 20
problem_type = "binary_classification"

print(f"Dataset: {dataset_name}")
print(f"Source: {dataset_source}")
print(f"Samples: {n_samples}, Features: {n_features}")
print(f"Problem Type: {problem_type}")

Dataset: Bank Marketing
Source: UCI ML Repository
Samples: 41188, Features: 20
Problem Type: binary_classification


In [3]:
# Separate train and test data
X = data.drop('y', axis=1)
y = data['y'].map({'yes': 1, 'no': 0})  # Convert target to binary
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Store test data as single CSV files for later use in evaluation
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('data/test_data.csv', index=False)

#concat train data for preprocessing
train_data = pd.concat([X_train, y_train], axis=1)

In [5]:
train_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
12556,40,blue-collar,married,basic.9y,unknown,yes,no,telephone,jul,mon,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.96,5228.1,0
35451,31,admin.,married,university.degree,no,no,no,cellular,may,mon,...,4,999,0,nonexistent,-1.8,92.893,-46.2,1.244,5099.1,0
30592,59,retired,married,basic.4y,no,no,no,cellular,may,mon,...,6,999,1,failure,-1.8,92.893,-46.2,1.354,5099.1,0
17914,43,housemaid,divorced,basic.9y,no,yes,no,cellular,jul,tue,...,5,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,0
3315,39,admin.,single,high.school,unknown,no,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,0


In [6]:
# One-hot encode categorical_cols, drop originals
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 
                    'contact','poutcome']
data_encoded = pd.get_dummies(train_data, columns=categorical_cols, drop_first=False)

# One-hot encode 'month' and 'day_of_week' with drop_first=True
data_encoded = pd.get_dummies(data_encoded, columns=['month', 'day_of_week'], drop_first=True)

In [7]:
#move target variable y to the end of the dataframe
target = data_encoded.pop('y')
data_encoded['y'] = target

In [8]:
data_encoded.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,y
12556,40,94,2,999,0,1.4,93.918,-42.7,4.96,5228.1,...,False,False,False,False,False,True,False,False,False,0
35451,31,116,4,999,0,-1.8,92.893,-46.2,1.244,5099.1,...,False,True,False,False,False,True,False,False,False,0
30592,59,13,6,999,1,-1.8,92.893,-46.2,1.354,5099.1,...,False,True,False,False,False,True,False,False,False,0
17914,43,94,5,999,0,1.4,93.918,-42.7,4.961,5228.1,...,False,False,False,False,False,False,False,True,False,0
3315,39,344,2,999,0,1.1,93.994,-36.4,4.86,5191.0,...,False,True,False,False,False,False,True,False,False,0


In [9]:
#count of class 0 and class 1 data_encoded['y']
print("Overall class distribution:")
print(data_encoded['y'].value_counts())

Overall class distribution:
y
0    29245
1     3705
Name: count, dtype: int64


In [10]:
num_cols = ['age', 'duration', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx','cons.conf.idx','euribor3m','nr.employed']

In [11]:
data_encoded.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,y
12556,40,94,2,999,0,1.4,93.918,-42.7,4.96,5228.1,...,False,False,False,False,False,True,False,False,False,0
35451,31,116,4,999,0,-1.8,92.893,-46.2,1.244,5099.1,...,False,True,False,False,False,True,False,False,False,0
30592,59,13,6,999,1,-1.8,92.893,-46.2,1.354,5099.1,...,False,True,False,False,False,True,False,False,False,0
17914,43,94,5,999,0,1.4,93.918,-42.7,4.961,5228.1,...,False,False,False,False,False,False,False,True,False,0
3315,39,344,2,999,0,1.1,93.994,-36.4,4.86,5191.0,...,False,True,False,False,False,False,True,False,False,0


In [12]:
# Separate features and target
X_train = data_encoded.drop('y', axis=1)
y_train = data_encoded['y']


In [13]:
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

In [14]:
#definition for calculating evaluation metrics
def calc_metrics(y_true, y_pred,y_predproba=None):
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    if y_predproba is not None:
        auc_roc = roc_auc_score(y_true, y_predproba)
    else:
        auc_roc = None

    
    return {
        "Confusion Matrix": cm,
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "MCC": mcc
    }

Train all 6 Models

In [15]:
random_state = 42

In [16]:
# Build Logistic Regression Model
log_model = LogisticRegression(max_iter=1000,class_weight='balanced',solver='liblinear', random_state=random_state)
log_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [17]:
# Hyperparameter tuning for Decision Tree
min_samples_leaf = 100
dt_max_depth = 5

# Build Decision Tree Model
dt_model = DecisionTreeClassifier(max_depth=dt_max_depth,min_samples_leaf=min_samples_leaf,class_weight='balanced',random_state=random_state)
dt_model.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,100
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [18]:
#hyperparameter tuning for kNN
n_neighbors = 10
weights = "distance"      # VERY IMPORTANT for probability quality

#Build kNN Model
knn_model = KNeighborsClassifier( n_neighbors=n_neighbors,weights=weights, metric="minkowski",p=2)
knn_model.fit(X_train, y_train)

0,1,2
,n_neighbors,10
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [19]:
#Build Naive Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [20]:
#Tune hyperparameters for Random Forest Classifier
n_estimators = 300
rf_max_depth = 15
min_samples_leaf = 50

#Build Random Forest Model
rf_model = RandomForestClassifier(n_estimators=n_estimators, class_weight="balanced", 
                                  max_depth=rf_max_depth,min_samples_leaf=min_samples_leaf,random_state=random_state)
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,50
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
#Tune hyperparameters for XGBoost Classifier
n_estimators = 300
xgb_max_depth = 5
learning_rate = 0.05
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

#Build Random Forest Model
xgb_model = XGBClassifier(
    n_estimators=n_estimators,    max_depth=xgb_max_depth,    scale_pos_weight=scale_pos_weight,    learning_rate=learning_rate,    eval_metric='logloss',
    objective='binary:logistic',    random_state=random_state)
xgb_model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [22]:
# Load test data
test_data = pd.read_csv('data/test_data.csv')
X_test = test_data.drop('y', axis=1)
y_test = test_data['y']

In [23]:
#preprocess test data

# One-hot encode categorical_cols, drop originals
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)

# One-hot encode 'month' and 'day_of_week' with drop_first=True
X_test = pd.get_dummies(X_test, columns=['month', 'day_of_week'], drop_first=True)

# Ensure test data has the same columns as training data
missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[X_train.columns]

# Scale numerical features in test data using the same scaler fitted on training data
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [24]:
X_test.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed
0,1.627357,0.436505,-0.5662,0.194661,1.677559,-1.19944,-1.18076,-1.229916,-1.340304,-0.939748,...,False,False,True,False,False,False,True,False,False,False
1,1.435707,0.105042,-0.206242,0.194661,-0.350902,0.647709,0.720307,0.890097,0.713674,0.330405,...,False,False,True,False,False,False,False,True,False,False
2,-0.672445,-0.792991,-0.5662,0.194661,1.677559,-1.19944,-1.18076,-1.229916,-1.332229,-0.939748,...,False,False,True,False,False,False,False,False,False,False
3,-0.38497,0.374837,0.513676,0.194661,-0.350902,0.838794,1.53357,-0.278074,0.775391,0.843165,...,True,False,False,False,False,False,False,False,False,False
4,-1.247395,-0.264963,-0.206242,0.194661,-0.350902,0.838794,0.58908,-0.472769,0.773084,0.843165,...,False,False,False,False,False,False,False,False,False,False


Predicting the test data and calculate the all 6 metrics

In [25]:
# Predict probabilities and apply threshold
y_predlogproba = log_model.predict_proba(X_test)[:, 1]
y_predlog      = (y_predlogproba >= 0.5).astype(int)

# Call calc_metrics function to calculate evaluation metrics
metricslog = calc_metrics(y_test, y_predlog,y_predlogproba)
print(f"Logistic Regression Model Evaluation Metrics:")
for metriclog, value in metricslog.items():
    print(f"{metriclog}: {value}")

Logistic Regression Model Evaluation Metrics:
Confusion Matrix: [[6263 1040]
 [ 100  835]]
Accuracy: 0.8616168973051711
AUC-ROC: 0.9382846841199975
Precision: 0.44533333333333336
Recall: 0.893048128342246
F1 Score: 0.594306049822064
MCC: 0.5678806636906815


In [26]:
# predict probabilities for Decision Tree
y_preddtprob = dt_model.predict_proba(X_test)[:, 1]
y_preddt     = (y_preddtprob >= 0.5).astype(int)

# Call calc_metrics function to calculate evaluation metrics
metricsdt = calc_metrics(y_test, y_preddt,y_preddtprob)
print(f"Evaluation Metrics with Optimized Decision Tree: MaxDepth as {dt_max_depth}, MinSamplesLeaf as {min_samples_leaf}")
for metricdt, value in metricsdt.items():   
    print(f"{metricdt}: {value}")

Evaluation Metrics with Optimized Decision Tree: MaxDepth as 5, MinSamplesLeaf as 50
Confusion Matrix: [[6032 1271]
 [  78  857]]
Accuracy: 0.8362466618111192
AUC-ROC: 0.9355613142646674
Precision: 0.40272556390977443
Recall: 0.9165775401069519
F1 Score: 0.5595821090434214
MCC: 0.538108168393929


In [27]:
#predict probabilities with threshold tuning
y_predknnproba = knn_model.predict_proba(X_test)[:, 1]
y_predknn = (y_predknnproba >= 0.5).astype(int)

#Call calc_metrics function to calculate evaluation metrics
metricsknn = calc_metrics(y_test, y_predknn,y_predknnproba)
print(f"Evaluation Metrics with Optimized kNN: n_neighbors as {n_neighbors}, weights as {weights}")
for metricknn, value in metricsknn.items():   
    print(f"{metricknn}: {value}")

Evaluation Metrics with Optimized kNN: n_neighbors as 10, weights as distance
Confusion Matrix: [[7053  250]
 [ 550  385]]
Accuracy: 0.902889050740471
AUC-ROC: 0.9046999218693365
Precision: 0.6062992125984252
Recall: 0.4117647058823529
F1 Score: 0.49044585987261147
MCC: 0.44898423824645534


In [28]:
#predict probabilities with threshold tuning
y_prednbproba = nb_model.predict_proba(X_test)[:, 1]
y_prednb = (y_prednbproba >= 0.5).astype(int)

#Call calc_metrics function to calculate evaluation metrics
metricsnb = calc_metrics(y_test, y_prednb,y_prednbproba)
print(f"Evaluation Metrics of Naive Bayes Model:")
for metricnb, value in metricsnb.items():   
    print(f"{metricnb}: {value}")

Evaluation Metrics of Naive Bayes Model:
Confusion Matrix: [[6191 1112]
 [ 323  612]]
Accuracy: 0.8258072347657198
AUC-ROC: 0.830209253980307
Precision: 0.35498839907192575
Recall: 0.6545454545454545
F1 Score: 0.46032342986084995
MCC: 0.3916599786112249


In [29]:
#predict probabilities for Random Forest
y_predrfprob = rf_model.predict_proba(X_test)[:, 1]
y_predrf     = (y_predrfprob >= 0.5).astype(int)

#Call calc_metrics function to calculate evaluation metrics
metricsrf = calc_metrics(y_test, y_predrf,y_predrfprob)
print(f"Evaluation Metrics of Random Forest with max_depth={rf_max_depth}, min_samples_leaf={min_samples_leaf}, n_estimators={n_estimators}")
for metricrf, value in metricsrf.items():   
    print(f"{metricrf}: {value}")

Evaluation Metrics of Random Forest with max_depth=15, min_samples_leaf=50, n_estimators=300
Confusion Matrix: [[6012 1291]
 [  71  864]]
Accuracy: 0.8346686088856519
AUC-ROC: 0.9356488909033794
Precision: 0.4009280742459397
Recall: 0.9240641711229947
F1 Score: 0.5592233009708738
MCC: 0.5393387576194502


In [30]:
#predict probabilities with threshold tuning
y_probxgb = xgb_model.predict_proba(X_test)[:, 1]
y_predxgb = (y_probxgb >= 0.5).astype(int)

#Call calc_metrics function to calculate evaluation metrics
metricsxgb = calc_metrics(y_test, y_predxgb,y_probxgb)
print(f"Evaluation Metrics of XGBoost with max_depth={xgb_max_depth},  n_estimators={n_estimators} with learning_rate={learning_rate} ")
for metricxgb, value in metricsxgb.items():   
    print(f"{metricxgb}: {value}")

Evaluation Metrics of XGBoost with max_depth=5,  n_estimators=300 with learning_rate=0.05 
Confusion Matrix: [[6292 1011]
 [  82  853]]
Accuracy: 0.8673221655741685
AUC-ROC: 0.9487091598866775
Precision: 0.45761802575107297
Recall: 0.9122994652406418
F1 Score: 0.6095033940693104
MCC: 0.5866670144035899


Saving the Model for prediction

In [31]:
# For model saving
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)

In [32]:
# Save all models and preprocessing objects
jb.dump(log_model, os.path.join(model_dir, 'logistic_model.pkl'))
print("✓ Logistic model saved")

jb.dump(dt_model, os.path.join(model_dir, 'decision_tree_model.pkl'))
print("✓ Decision tree model saved")

jb.dump(knn_model, os.path.join(model_dir, 'knn_model.pkl'))
print("✓ kNN model saved")

jb.dump(nb_model, os.path.join(model_dir, 'nb_model.pkl'))
print("✓ Naive Bayes model saved")

jb.dump(rf_model, os.path.join(model_dir, 'rf_model.pkl'))
print("✓ Random Forest model saved")

jb.dump(xgb_model, os.path.join(model_dir, 'xgb_model.pkl'))
print("✓ XGBoost model saved")

# IMPORTANT: Save the scaler and feature names for future predictions
jb.dump(scaler, os.path.join(model_dir, 'scaler.pkl'))
print("✓ Scaler saved")

feature_names = X_train.columns.tolist()
jb.dump(feature_names, os.path.join(model_dir, 'feature_names.pkl'))
print("✓ Feature names saved")

print("\nAll models and preprocessing objects saved successfully!")

✓ Logistic model saved
✓ Decision tree model saved
✓ kNN model saved
✓ Naive Bayes model saved
✓ Random Forest model saved
✓ XGBoost model saved
✓ Scaler saved
✓ Feature names saved

All models and preprocessing objects saved successfully!


## Cross-Validation Evaluation

In [33]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Perform 5-fold cross-validation for all models
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = {
    'Logistic Regression': log_model,
    'Decision Tree': dt_model,
    'kNN': knn_model,
    'Naive Bayes': nb_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model
}

print("Cross-Validation Results (5-Fold):")
print("="*60)

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    print(f"{name}:")
    print(f"  F1 Score: {scores.mean():.4f} (+/- {scores.std():.4f})")
    print()

Cross-Validation Results (5-Fold):
Logistic Regression:
  F1 Score: 0.5881 (+/- 0.0081)

Decision Tree:
  F1 Score: 0.5571 (+/- 0.0129)

kNN:
  F1 Score: 0.4803 (+/- 0.0128)

Naive Bayes:
  F1 Score: 0.4494 (+/- 0.0109)

Random Forest:
  F1 Score: 0.5627 (+/- 0.0081)

XGBoost:
  F1 Score: 0.6173 (+/- 0.0101)



## Model Performance Comparison

In [34]:
# Create a comparison dataframe of all models
comparison_data = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'AUC-ROC': [],
    'MCC': []
}

all_metrics = {
    'Logistic Regression': metricslog,
    'Decision Tree': metricsdt,
    'kNN': metricsknn,
    'Naive Bayes': metricsnb,
    'Random Forest': metricsrf,
    'XGBoost': metricsxgb
}

for model_name, metrics in all_metrics.items():
    comparison_data['Model'].append(model_name)
    comparison_data['Accuracy'].append(metrics['Accuracy'])
    comparison_data['Precision'].append(metrics['Precision'])
    comparison_data['Recall'].append(metrics['Recall'])
    comparison_data['F1 Score'].append(metrics['F1 Score'])
    comparison_data['AUC-ROC'].append(metrics['AUC-ROC'])
    comparison_data['MCC'].append(metrics['MCC'])

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.round(4)

print("Model Performance Comparison:")
print("="*80)
print(comparison_df.to_string(index=False))

# Find best model for each metric
print("\n" + "="*80)
print("Best Models by Metric:")
print("="*80)
for metric in ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC', 'MCC']:
    best_idx = comparison_df[metric].idxmax()
    best_model = comparison_df.loc[best_idx, 'Model']
    best_score = comparison_df.loc[best_idx, metric]
    print(f"{metric:15s}: {best_model:20s} ({best_score:.4f})")

Model Performance Comparison:
              Model  Accuracy  Precision  Recall  F1 Score  AUC-ROC    MCC
Logistic Regression    0.8616     0.4453  0.8930    0.5943   0.9383 0.5679
      Decision Tree    0.8362     0.4027  0.9166    0.5596   0.9356 0.5381
                kNN    0.9029     0.6063  0.4118    0.4904   0.9047 0.4490
        Naive Bayes    0.8258     0.3550  0.6545    0.4603   0.8302 0.3917
      Random Forest    0.8347     0.4009  0.9241    0.5592   0.9356 0.5393
            XGBoost    0.8673     0.4576  0.9123    0.6095   0.9487 0.5867

Best Models by Metric:
Accuracy       : kNN                  (0.9029)
Precision      : kNN                  (0.6063)
Recall         : Random Forest        (0.9241)
F1 Score       : XGBoost              (0.6095)
AUC-ROC        : XGBoost              (0.9487)
MCC            : XGBoost              (0.5867)
