# Load Essential Python Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import cross_val_score

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Load Training/Test Dataset

In [2]:
# The file is read with ';' as the field separator.
df = pd.read_csv("bank-additional-full.csv", delimiter=";")

# About the Dataset

In [3]:
# Size of the dataset as a (rows, columns) tuple
df.shape

(41188, 21)

In [4]:
# Names of all columns in the loaded dataset
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [5]:
# Per-column non-null counts and dtypes for the full dataset
# (the train/test split has not been made at this point).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [6]:
# Peek at five randomly drawn rows of the dataset
df.sample(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
33089,32,management,single,professional.course,no,no,no,cellular,may,tue,...,2,999,1,failure,-1.8,92.893,-46.2,1.291,5099.1,no
39395,56,admin.,single,university.degree,no,no,yes,cellular,apr,thu,...,1,999,1,failure,-1.8,93.749,-34.6,0.635,5008.7,no
1223,30,housemaid,married,high.school,unknown,no,no,telephone,may,thu,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
26483,59,retired,married,professional.course,no,no,no,cellular,nov,thu,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.076,5195.8,no
6907,54,management,married,university.degree,no,yes,no,telephone,may,thu,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no


# Data Preprocessing

In [7]:
# Drop the unwanted `duration` column.
# Reassigning (instead of inplace=True) and ignoring an already-missing label
# keeps this cell safe to re-run without a KeyError.
df = df.drop(columns='duration', errors='ignore')

In [8]:
# Count the missing values in every column
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [9]:
# Encode the target as integers: 'yes' -> 1, 'no' -> 0.
# The explicit astype(int) avoids relying on `replace` to downcast the object
# column, and the statement stays a no-op if the cell is run a second time.
df['y'] = df['y'].replace({'yes': 1, 'no': 0}).astype(int)

# Hold out 20% of the rows for testing. `stratify` keeps the class ratio of
# `y` in both parts; the fixed random_state makes the split (and therefore
# every score computed from it) repeatable across kernel restarts.
# `train_test_split` is already imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('y', axis='columns'),
    df['y'],
    test_size=0.2,
    stratify=df['y'],
    random_state=42,
)

In [10]:
# Ten random rows; the target column `y` now holds 0/1 instead of 'no'/'yes'
df.sample(n=10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
2263,38,blue-collar,married,basic.6y,no,yes,no,telephone,may,tue,1,999,0,nonexistent,1.1,93.994,-36.4,4.856,5191.0,0
11656,37,admin.,married,basic.6y,no,no,no,telephone,jun,fri,14,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,0
5570,44,admin.,married,high.school,no,yes,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
8384,42,services,divorced,basic.6y,no,no,no,telephone,jun,tue,1,999,0,nonexistent,1.4,94.465,-41.8,4.864,5228.1,0
40141,29,technician,single,basic.9y,no,yes,no,telephone,jul,wed,1,6,1,success,-1.7,94.215,-40.3,0.84,4991.6,1
18527,52,blue-collar,married,basic.4y,no,yes,no,cellular,jul,thu,6,999,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,0
29244,47,management,married,professional.course,no,yes,no,cellular,apr,fri,2,999,1,failure,-1.8,93.075,-47.1,1.405,5099.1,0
1410,38,entrepreneur,married,basic.6y,no,yes,yes,telephone,may,thu,2,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,0
18391,39,technician,married,high.school,no,no,no,cellular,jul,thu,1,999,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,0
19985,48,housemaid,married,basic.6y,unknown,no,no,cellular,aug,fri,8,999,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1,0


# **Creating Pipeline**

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Names of the categorical feature columns
categorical_cols = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
]

In [12]:
# Create a ColumnTransformer that one-hot encodes the categorical features and
# passes every other column through unchanged.
# The column positions are looked up from `categorical_cols` instead of being
# typed in by hand, so they stay correct if the column layout of X_train
# changes. Positions (not names) are used so the fitted transformer also
# accepts plain arrays at prediction time.
categorical_idx = [X_train.columns.get_loc(col) for col in categorical_cols]
pre_process_cat = ColumnTransformer(
    [("ohe", OneHotEncoder(handle_unknown="ignore"), categorical_idx)],
    remainder='passthrough',
)

# ML Algorithms

# Candidate classifiers. Each one is paired with the `pre_process_cat`
# transformer in its own pipeline.
# NOTE: make_pipeline does not copy its steps, so all pipelines below hold the
# same `pre_process_cat` instance.

# One seed for every stochastic estimator so that scores and the fitted models
# are repeatable across kernel restarts.
MODEL_SEED = 42

# Decision stump (a tree of depth 1)
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=1, random_state=MODEL_SEED)
pipe_dt = make_pipeline(pre_process_cat, dt)


# Random forest (the variable keeps its original name `rfe`)
from sklearn.ensemble import RandomForestClassifier
rfe = RandomForestClassifier(random_state=MODEL_SEED)
pipe_rfe = make_pipeline(pre_process_cat, rfe)


# L2-regularised logistic regression
# NOTE(review): numeric features are passed through unscaled and warnings are
# silenced at the top of the notebook - confirm the solver actually converges.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=100, penalty='l2')
pipe_lr = make_pipeline(pre_process_cat, lr)


# k-nearest neighbours with Manhattan distance
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(metric='manhattan', n_neighbors=13, weights='uniform')
pipe_knn = make_pipeline(pre_process_cat, knn)

# Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB(var_smoothing=0.01)
pipe_naive = make_pipeline(pre_process_cat, nb)

# Gradient-boosted trees (LightGBM)
import lightgbm as lgb
lgbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    learning_rate=0.05,
    min_child_samples=10,
    num_class=1,
    num_leaves=20,
    objective='binary',
    reg_alpha=0.5,
    reg_lambda=0.1,
    random_state=MODEL_SEED,
)
pipe_lgbm = make_pipeline(pre_process_cat, lgbm)

# Stacked ensemble: LightGBM, kNN and logistic regression as base estimators,
# with the random forest as the final estimator.
from sklearn.ensemble import StackingClassifier
stc = StackingClassifier(
    estimators=[('lgbm', lgbm), ('knn', knn), ('lr', lr)],
    final_estimator=rfe,
)
pipe_stc = make_pipeline(pre_process_cat, stc)




In [58]:
def ml_score(pipe, X_train, y_train, cv=5, scoring="accuracy"):
    """Print the mean cross-validated score of ``pipe``.

    Parameters
    ----------
    pipe : estimator or pipeline
        Passed as the estimator to ``cross_val_score``.
    X_train, y_train
        Features and target handed to ``cross_val_score``.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged as the ``cv`` argument.
    scoring : str or callable, default "accuracy"
        Forwarded unchanged as the ``scoring`` argument; a string value is
        also used as the label of the printed line.

    Returns
    -------
    None
        The mean score is printed, not returned.
    """
    fold_scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring=scoring)
    # With the default scoring this prints exactly "Accuracy score: <mean>".
    label = scoring.capitalize() if isinstance(scoring, str) else "CV"
    print(f'{label} score: {fold_scores.mean()}')

In [68]:
# Cross-validated score of the random-forest pipeline
ml_score(pipe_rfe, X_train=X_train, y_train=y_train)

Accuracy score: 0.8932928679817905


In [74]:
# cross_val_score only fits clones of the pipeline it is given, so `pipe_rfe`
# itself must be fitted here before its forest exposes `feature_importances_`.
pipe_rfe.fit(X_train, y_train)
pipe_rfe.named_steps['randomforestclassifier'].feature_importances_

array([1.81138631e-02, 1.16864341e-02, 5.64798377e-03, 3.84003494e-03,
       9.20145052e-03, 7.11097098e-03, 6.18244857e-03, 8.75467178e-03,
       5.55202780e-03, 1.41147776e-02, 4.65201244e-03, 2.22454752e-03,
       1.03529372e-02, 1.74752514e-02, 1.57046359e-02, 8.65162641e-04,
       8.28339013e-03, 6.30687144e-03, 1.18668317e-02, 1.65219061e-02,
       1.91883056e-04, 1.21496049e-02, 1.67233632e-02, 6.69330909e-03,
       7.91090305e-03, 7.89121284e-03, 7.66073478e-07, 2.16146447e-02,
       2.39515122e-03, 2.14876660e-02, 1.36544598e-02, 2.49150092e-03,
       1.32161434e-02, 8.14017548e-03, 7.01316959e-03, 3.40231365e-03,
       2.51336580e-03, 8.80093135e-04, 2.69907061e-03, 3.00511973e-03,
       4.40452605e-03, 4.54216572e-03, 2.19263678e-03, 6.36471707e-03,
       3.20863827e-03, 1.41535516e-02, 1.50106445e-02, 1.50113721e-02,
       1.45849853e-02, 1.47383357e-02, 9.58759674e-03, 7.86253888e-03,
       2.52992379e-02, 1.58483244e-01, 8.08570151e-02, 3.25965249e-02,
      

In [32]:
# Cross-validated score of the logistic-regression pipeline
ml_score(pipe_lr, X_train=X_train, y_train=y_train)

Accuracy score: 0.899514415781487


In [22]:
# Cross-validated score of the Gaussian naive Bayes pipeline
ml_score(pipe_naive, X_train=X_train, y_train=y_train)

Accuracy score: 0.8949317147192716


In [23]:
# Cross-validated score of the k-nearest-neighbours pipeline
ml_score(pipe_knn, X_train=X_train, y_train=y_train)

Accuracy score: 0.8955993930197268


In [63]:
# Cross-validated score of the stacking pipeline
ml_score(pipe_stc, X_train=X_train, y_train=y_train)

Accuracy score: 0.8900455235204856


In [59]:
# Cross-validated score of the LightGBM pipeline
ml_score(pipe_lgbm, X_train=X_train, y_train=y_train)

Accuracy score: 0.9010015174506828


In [15]:
# Train the stacking pipeline on the training split
pipe_stc.fit(X_train, y=y_train)


In [14]:
import pickle

# Save the stacking pipeline to disk. Run this after the fit cell so that the
# stored object is the trained model. The `with` block closes the file handle
# even if pickling raises, which the bare open() call never did.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe_stc, model_file)

In [38]:
# Assume user input: a single record whose values are listed in the same order
# as the training columns.
# A one-row DataFrame labelled with X_train's columns keeps the feature names
# and lets each column get its own dtype (an object ndarray would erase both).
# pandas raises a ValueError if the number of values does not match the number
# of columns, which replaces the hard-coded reshape(1, 19) check.
test_input = pd.DataFrame(
    [[30, 'admin.', 'married', 'university.degree', 'no', 'yes', 'no',
      'cellular', 'jun', 'mon', 1, 999, 0, 'nonexistent',
      -2.9, 92.963, -40.8, 1.260, 5076.2]],
    columns=X_train.columns,
)


In [65]:
# Predicted class for the sample input (the target was encoded 'yes' -> 1, 'no' -> 0)
pipe_stc.predict(X=test_input)

array([1], dtype=int64)