In [78]:
# Import required libraries
import pandas as pd
import numpy as np
import joblib as jb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, roc_auc_score,precision_score, recall_score,
    f1_score, confusion_matrix, matthews_corrcoef
)

import warnings
warnings.filterwarnings('ignore')

print('✓ Libraries imported successfully')

✓ Libraries imported successfully


In [79]:
#Load your dataset

data = pd.read_csv(r'C:\Users\Dell\Downloads\bank+marketing\bank-additional\bank-additional\bank-additional-full.csv', sep=';')

# Dataset information
dataset_name = "Bank Marketing"  # e.g., "Breast Cancer Wisconsin"
dataset_source = "UCI ML Repository"  # e.g., "UCI ML Repository"
n_samples = 41188      # Total number of rows
n_features = 20     # Number of features (excluding target)
problem_type = "binary_classification"  # "regression" or "binary_classification" or "multiclass_classification"

print(f"Dataset: {dataset_name}")
print(f"Source: {dataset_source}")
print(f"Samples: {n_samples}, Features: {n_features}")
print(f"Problem Type: {problem_type}")


Dataset: Bank Marketing
Source: UCI ML Repository
Samples: 41188, Features: 20
Problem Type: binary_classification


In [80]:
# TODO: Preprocess your data
# One-hot encode categorical_cols, drop originals
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 
                    'contact','poutcome']

data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=False)

# One-hot encode 'month' and 'day_of_week' with drop_first=True
data_encoded = pd.get_dummies(data_encoded, columns=['month', 'day_of_week'], drop_first=True)

# Map 'yes'/'no' to 1/0 in target variable 'y'
y_mapping = {'yes': 1,'no': 0 }
data_encoded['y'] = data_encoded['y'].map(y_mapping)

In [81]:
#move target variable y to the end of the dataframe
target = data_encoded.pop('y')
data_encoded['y'] = target

In [82]:
#count of class 0 and class 1 data_encoded['y']
print("Overall class distribution:")
print(data_encoded['y'].value_counts())

Overall class distribution:
y
0    36548
1     4640
Name: count, dtype: int64


In [83]:
scaler = StandardScaler()
num_cols = ['age', 'duration', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx','cons.conf.idx','euribor3m','nr.employed']
data_encoded[num_cols] = scaler.fit_transform(data_encoded[num_cols])

In [84]:
data_encoded.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,y
0,1.533034,0.010471,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,False,True,False,False,False,True,False,False,False,0
1,1.628993,-0.421501,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,False,True,False,False,False,True,False,False,False,0
2,-0.290186,-0.12452,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,False,True,False,False,False,True,False,False,False,0
3,-0.002309,-0.413787,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,False,True,False,False,False,True,False,False,False,0
4,1.533034,0.187888,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,False,True,False,False,False,True,False,False,False,0


In [85]:
# Separate features and target
X = data_encoded.drop('y', axis=1)
y = data_encoded['y']

# TODO: Train-test split
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [86]:
#definition for calculating evaluation metrics
def calc_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    accuracy = accuracy_score(y_true, y_pred)
    auc_roc = roc_auc_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    
    return {
        "Confusion Matrix": cm,
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "MCC": mcc
    }

In [87]:
#definition for training logistic regression model
def train_logistic_regression(X_train_scaled, y_train):
    log_model = LogisticRegression(max_iter=1000,class_weight='balanced',solver='liblinear', random_state=42)
    return log_model.fit(X_train_scaled, y_train)

In [88]:
#definition for predicting using logistic regression model
def predict_logistic_regression(log_model, X_test_scaled, threshold=0.5):
    y_predproba = log_model.predict_proba(X_test_scaled)[:, 1]
    y_pred = (y_predproba >= threshold).astype(int)
    return y_pred

In [89]:
#train model
log_model = train_logistic_regression(X_train_scaled, y_train)

In [None]:
# store the model as pkl using joblib
jb.dump(log_model, 'E:\D Drive\Sai King\M Tech\Semseter 1\ML\Assigment\pkl\logistic_regression_model.pkl')

['E:\\D Drive\\Sai King\\M Tech\\Semseter 1\\ML\\Assigment\\pkl\\logistic_regression_model.pkl']

In [92]:
#Predict without tuning
y_pred = predict_logistic_regression(log_model, X_test_scaled, threshold=0.5)
metrics = calc_metrics(y_test, y_pred)

print("Logistic Regression Model Evaluation Metrics before tuning:")
for metric, value in metrics.items():
    print(f"{metric}: {value}")

Logistic Regression Model Evaluation Metrics before tuning:
Confusion Matrix: [[6283 1027]
 [  82  846]]
Accuracy: 0.8653799465889779
AUC-ROC: 0.8855727274871454
Precision: 0.45168179391350777
Recall: 0.9116379310344828
F1 Score: 0.6040699750089253
MCC: 0.5817024422460709


In [93]:
#predict with tuning (example threshold 0.4)
thresholds = [0.45,0.55, 0.65]
for thresh in thresholds:
    y_pred = predict_logistic_regression(log_model, X_test_scaled, threshold=thresh)
    metrics = calc_metrics(y_test, y_pred)

    print(f"Logistic Regression Model Evaluation Metrics with threshold {thresh}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

Logistic Regression Model Evaluation Metrics with threshold 0.45:
Confusion Matrix: [[6159 1151]
 [  66  862]]
Accuracy: 0.8522699684389415
AUC-ROC: 0.8857118849945751
Precision: 0.42821659215101837
Recall: 0.9288793103448276
F1 Score: 0.5861951717103027
MCC: 0.5675892062704349
Logistic Regression Model Evaluation Metrics with threshold 0.55:
Confusion Matrix: [[6391  919]
 [ 102  826]]
Accuracy: 0.876062151007526
AUC-ROC: 0.8821840063210528
Precision: 0.47335243553008594
Recall: 0.8900862068965517
F1 Score: 0.6180321735877291
MCC: 0.5914455436367424
Logistic Regression Model Evaluation Metrics with threshold 0.65:
Confusion Matrix: [[6603  707]
 [ 166  762]]
Accuracy: 0.894027676620539
AUC-ROC: 0.8622019316948912
Precision: 0.5187202178352621
Recall: 0.8211206896551724
F1 Score: 0.6357947434292867
MCC: 0.5983293121829008
