# **Bank Marketing Classification Project**

Bank Marketing Subscription Prediction

Predicting Customer Term Deposit Subscription Using Machine Learning



1)Customer Demographics:
Age, job, marital status, education

2)Financial Status:
Account balance, housing loan, personal loan

3)Campaign Information:
Number of contacts made, month of contact, previous campaign results

In [45]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler


from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix



In [46]:
df = pd.read_csv("/content/shyam_classification.csv")


In [47]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [48]:
df.tail()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,no


In [49]:
df.shape

(45211, 17)

In [50]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [51]:
df.isna().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
default,0
balance,0
housing,0
loan,0
contact,0
day,0


In [52]:
# imbalance
df["y"].value_counts(normalize=True) * 100

Unnamed: 0_level_0,proportion
y,Unnamed: 1_level_1
no,88.30152
yes,11.69848


In [53]:
# Convert target to binary
df["y"] = df["y"].map({"yes": 1, "no": 0})

In [54]:
# Drop leakage feature
df = df.drop("duration", axis=1)

In [55]:
X = df.drop("y", axis=1)
y = df["y"]


In [56]:
# Identify numeric & categorical columns
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(exclude=np.number).columns

In [57]:
# One-Hot Encoding
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

In [59]:
# Scaling (only numeric columns)
scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [60]:
# KNN
k_model = KNeighborsClassifier(n_neighbors=3)
k_model.fit(X_train, y_train)

In [61]:
# Naive Bayes
n_model = GaussianNB()
n_model.fit(X_train, y_train)

In [62]:
# SVM
s_model = SVC(probability=True)
s_model.fit(X_train, y_train)

In [63]:
# Random Forest
r_model = RandomForestClassifier(random_state=42)
r_model.fit(X_train, y_train)

In [64]:
# Logistic Regression
l_model = LogisticRegression(max_iter=5000)
l_model.fit(X_train, y_train)

In [65]:

# Evaluation
models = [k_model, n_model, s_model, r_model, l_model]

for model in models:

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("*"*50)
    print(model)
    print("Accuracy:", accuracy)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

**************************************************
KNeighborsClassifier(n_neighbors=3)
Accuracy: 0.8795340607490416
[[11575   402]
 [ 1232   355]]
              precision    recall  f1-score   support

           0       0.90      0.97      0.93     11977
           1       0.47      0.22      0.30      1587

    accuracy                           0.88     13564
   macro avg       0.69      0.60      0.62     13564
weighted avg       0.85      0.88      0.86     13564

**************************************************
GaussianNB()
Accuracy: 0.853361840165143
[[10916  1061]
 [  928   659]]
              precision    recall  f1-score   support

           0       0.92      0.91      0.92     11977
           1       0.38      0.42      0.40      1587

    accuracy                           0.85     13564
   macro avg       0.65      0.66      0.66     13564
weighted avg       0.86      0.85      0.86     13564

**************************************************
SVC(probability=True)
Acc

In [66]:
ap=pd.DataFrame({"actual value":y_test,"predicted value":y_pred})
ap

Unnamed: 0,actual value,predicted value
37735,1,0
44332,1,0
4432,0,0
38725,1,0
38581,0,0
...,...,...
15913,0,0
20622,1,0
41684,0,0
32749,0,0
