In [25]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score, matthews_corrcoef
)
import warnings
warnings.filterwarnings('ignore')

print('✓ Libraries imported successfully')

✓ Libraries imported successfully


In [26]:
# TODO: Load your dataset
# Example: data = pd.read_csv('your_dataset.csv')

data = pd.read_csv(r'C:\Users\Dell\Downloads\bank+marketing\bank-additional\bank-additional\bank-additional-full.csv', sep=';')

# Dataset information (TODO: Fill these)
dataset_name = "Bank Marketing"  # e.g., "Breast Cancer Wisconsin"
dataset_source = "UCI ML Repository"  # e.g., "UCI ML Repository"
n_samples = 41188      # Total number of rows
n_features = 20     # Number of features (excluding target)
problem_type = "binary_classification"  # "regression" or "binary_classification" or "multiclass_classification"

# Problem statement (TODO: Write 2-3 sentences)
problem_statement = """The classification goal is to predict if the client will subscribe a term deposit or not. 
This is a binary classification problem where the target variable indicates whether the client has subscribed ('yes') or not ('no'). 
The dataset contains various features related to client demographics, bank attributes, and previous marketing campaigns."
"""

# Primary evaluation metric (TODO: Fill this)
primary_metric = "Precision"  # e.g., "recall", "accuracy", "rmse", "r2"

# Metric justification (TODO: Write 2-3 sentences)
metric_justification = """
"I chose Precision because in Banking Marketing, it is crucial to minimize false positives. 
A high precision ensures that when the model predicts a client will subscribe, it is likely to be correct. 
This is important to avoid wasting resources on clients who are unlikely to subscribe."
"""

print(f"Dataset: {dataset_name}")
print(f"Source: {dataset_source}")
print(f"Samples: {n_samples}, Features: {n_features}")
print(f"Problem Type: {problem_type}")
print(f"Primary Metric: {primary_metric}")

Dataset: Bank Marketing
Source: UCI ML Repository
Samples: 41188, Features: 20
Problem Type: binary_classification
Primary Metric: Precision


In [27]:
# TODO: Preprocess your data
# 1. Separate features (X) and target (y)
# 2. Handle missing values if any
# 3. Encode categorical variables

# Drop the specified columns from data and filter out rows with 'unknown' in certain columns
columns_to_drop = ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed','pdays']
data = data.drop(columns=columns_to_drop)
data = data[(data['education'] != 'unknown') & (data['default'] != 'unknown') & (data['housing'] != 'unknown')].reset_index(drop=True)

# One-hot encode categorical_cols, drop originals, and convert to int
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 
                    'contact','poutcome']
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=False)
for col in data_encoded.columns:
    if any(col.startswith(f"{cat}_") for cat in categorical_cols):
        data_encoded[col] = data_encoded[col].astype(int)

# Convert 'month' and 'day_of_week' columns to numeric values
month_mapping = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
day_mapping = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5}
y_mapping = {'no': 0, 'yes': 1}


data_encoded['month'] = data_encoded['month'].map(month_mapping)
data_encoded['day_of_week'] = data_encoded['day_of_week'].map(day_mapping)
data_encoded['y'] = data_encoded['y'].map(y_mapping)


In [28]:
data_encoded.head()

Unnamed: 0,age,month,day_of_week,duration,campaign,previous,y,job_admin.,job_blue-collar,job_entrepreneur,...,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,5,1,261,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,1,0
1,37,5,1,226,1,0,0,0,0,0,...,0,0,1,1,0,0,1,0,1,0
2,40,5,1,151,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,1,0
3,56,5,1,307,1,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
4,59,5,1,139,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,1,0


In [29]:
scaler = StandardScaler()

In [30]:
data_encoded["age"] = scaler.fit_transform(data_encoded[['age']])
data_encoded["duration"] = scaler.fit_transform(data_encoded[['duration']])

In [31]:
data_encoded.head()

Unnamed: 0,age,month,day_of_week,duration,campaign,previous,y,job_admin.,job_blue-collar,job_entrepreneur,...,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,poutcome_failure,poutcome_nonexistent,poutcome_success
0,1.639733,5,1,0.005026,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,1,0
1,-0.197378,5,1,-0.12854,1,0,0,0,0,0,...,0,0,1,1,0,0,1,0,1,0
2,0.092692,5,1,-0.414752,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,1,0
3,1.639733,5,1,0.18057,1,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
4,1.929803,5,1,-0.460546,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,1,0


In [32]:
#count of class 0 and class 1 data_encoded['y']
print("Overall class distribution:")
print(data_encoded['y'].value_counts())

Overall class distribution:
y
0    26781
1     3886
Name: count, dtype: int64


In [33]:
# Separate features and target
X = data_encoded.drop('y', axis=1)
y = data_encoded['y']

# TODO: Train-test split
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [34]:
#count of class 0 and class 1 in y_train and y_test
print("Training set class distribution:")
print(y_train.value_counts())
print("\nTest set class distribution:")
print(y_test.value_counts())

Training set class distribution:
y
0    21424
1     3109
Name: count, dtype: int64

Test set class distribution:
y
0    5357
1     777
Name: count, dtype: int64


In [35]:
# TODO: Feature scaling
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)

In [36]:

# Fill these after preprocessing
total_samples = data_encoded.shape[0]  # Total number of samples after preprocessing
train_samples = X_train_scaled.shape[0]       # Number of training samples
test_samples  = X_test_scaled.shape[0]        # Number of test samples
train_test_ratio = train_samples/total_samples  # e.g., 0.8 for 80-20 split


print(f"Total samples: {total_samples}")
print(f"Train samples: {train_samples}")
print(f"Test samples: {test_samples}")
print(f"Split ratio: {train_test_ratio:.1%}")

Total samples: 30667
Train samples: 24533
Test samples: 6134
Split ratio: 80.0%


In [37]:
lr_model = LogisticRegression(class_weight='balanced',max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [38]:
y_predlr = lr_model.predict(X_test_scaled)

In [39]:
lr_cm = confusion_matrix(y_test, y_predlr)
TN, FP, FN, TP = lr_cm.ravel()

In [40]:
print("Confusion Matrix:\n", lr_cm)
print("Accuracy:", accuracy_score(y_test, y_predlr))
print("AUC-ROC:", roc_auc_score(y_test, y_predlr))
print("precision:", precision_score(y_test, y_predlr))
print("recall:", recall_score(y_test, y_predlr))
print("F1 Score:", f1_score(y_test, y_predlr))
print('MCC score:', matthews_corrcoef(y_test, y_predlr))


Confusion Matrix:
 [[4460  897]
 [ 182  595]]
Accuracy: 0.8240952070427128
AUC-ROC: 0.7991606502900137
precision: 0.3987935656836461
recall: 0.7657657657657657
F1 Score: 0.5244601145879242
MCC score: 0.4638406613216309


In [41]:
dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt_model.fit(X_train_scaled, y_train)
y_preddt = dt_model.predict(X_test_scaled)
dt_cm = confusion_matrix(y_test, y_preddt)
TN, FP, FN, TP = dt_cm.ravel()

In [42]:
print("Confusion Matrix:\n", dt_cm)
print("Accuracy:", accuracy_score(y_test, y_preddt))
print("AUC-ROC:", roc_auc_score(y_test, y_preddt))
print("precision:", precision_score(y_test, y_preddt))
print("recall:", recall_score(y_test, y_preddt))
print("F1 Score:", f1_score(y_test, y_preddt))
print('MCC score:', matthews_corrcoef(y_test, y_preddt))

Confusion Matrix:
 [[4924  433]
 [ 442  335]]
Accuracy: 0.8573524616889469
AUC-ROC: 0.6751583045217543
precision: 0.4361979166666667
recall: 0.43114543114543114
F1 Score: 0.4336569579288026
MCC score: 0.3520676462238392


In [43]:
#model 3: K-Nearest Neighbors
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_scaled, y_train)
y_predknn = knn_model.predict(X_test_scaled)
knn_cm = confusion_matrix(y_test, y_predknn)
TN, FP, FN, TP = knn_cm.ravel()

In [44]:
print("Confusion Matrix:\n", knn_cm)
print("Accuracy:", accuracy_score(y_test, y_predknn))
print("AUC-ROC:", roc_auc_score(y_test, y_predknn))
print("precision:", precision_score(y_test, y_predknn))
print("recall:", recall_score(y_test, y_predknn))
print("F1 Score:", f1_score(y_test, y_predknn))
print('MCC score:', matthews_corrcoef(y_test, y_predknn))

Confusion Matrix:
 [[5192  165]
 [ 537  240]]
Accuracy: 0.885555917835018
AUC-ROC: 0.6390397437625364
precision: 0.5925925925925926
recall: 0.3088803088803089
F1 Score: 0.40609137055837563
MCC score: 0.372454618749479
