# Load Essential Python Libraries

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import cross_val_score

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Load Training/ Test Dataset

In [46]:
df = pd.read_csv("bank-additional-full.csv", sep=';')

# About the dataset 

In [47]:
#Size of dataset
df.shape

(41188, 21)

In [48]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [49]:
#Information about train Dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [50]:
# First look at the Dataset
df.sample(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
11,25,services,single,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
39715,29,services,single,high.school,no,yes,no,cellular,may,fri,...,1,999,2,failure,-1.8,93.876,-40.0,0.695,5008.7,no
12420,52,services,married,basic.6y,no,no,yes,cellular,jul,mon,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.96,5228.1,no
8905,32,technician,single,high.school,no,yes,no,telephone,jun,thu,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.866,5228.1,no
35837,26,technician,single,university.degree,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.259,5099.1,yes


# Data Preprocessing

In [51]:
# Dropping the unwanted column:
df.drop('duration', inplace=True, axis='columns')

In [52]:
# Identify missing values:
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [114]:
from sklearn.model_selection import train_test_split
df['y']=df['y'].replace({'yes':1,'no':0})



#X_Train,X_Test,y_Train,y_Test = train_test_split(df.drop('y', axis='columns'),df['y'],test_size=0.4,stratify=df['y'])
df1=df.sample(16000)
X=df1.drop('y', axis='columns')
y=df1['y']


In [115]:
X.shape

(16000, 19)

In [116]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y)

# **Creating Pipeline**

In [117]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Define the categorical columns
categorical_cols = ["job", "marital", "education",'default', 'housing', 'loan','contact', 'month', 'day_of_week','poutcome']

In [118]:
# Create a ColumnTransformer for categorical features
pre_process_cat = ColumnTransformer([("ohe", OneHotEncoder(handle_unknown="ignore"), [1,2,3,4,5,6,7,8,9,13])],remainder='passthrough')

# ML Algos

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=1)
pipe_dt = make_pipeline(pre_process_cat,dt)


from sklearn.ensemble import RandomForestClassifier
rfe = RandomForestClassifier()
pipe_rfe = make_pipeline(pre_process_cat,rfe)


from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=100,penalty='l2')
pipe_lr = make_pipeline(pre_process_cat,lr)


from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(metric= 'manhattan', n_neighbors= 13, weights='uniform')
pipe_knn = make_pipeline(pre_process_cat,knn)

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB(var_smoothing= 0.01)
pipe_naive = make_pipeline(pre_process_cat,nb)

import lightgbm as lgb
lgbm = lgb.LGBMClassifier(boosting_type= 'gbdt', learning_rate= 0.05, min_child_samples= 10, num_class= 1, num_leaves= 20, objective= 'binary', reg_alpha= 0.5, reg_lambda= 0.1)
pipe_lgbm = make_pipeline(pre_process_cat,lgbm)

from sklearn.ensemble import StackingClassifier
stc=StackingClassifier(estimators=[('lgbm', lgbm), ('knn', knn), ('lr', lr)], final_estimator=rfe)
pipe_stc = make_pipeline(pre_process_cat,stc)




In [83]:
def ml_score(pipe,X_train,y_train):
    print(f'Accuracy score: {cross_val_score(pipe, X_train, y_train, cv=5, scoring="accuracy").mean()}')

In [84]:
ml_score(pipe_rfe,X_train,y_train)

KeyboardInterrupt: 

In [None]:
ml_score(pipe_lr,X_train,y_train)

Accuracy score: 0.899514415781487


In [None]:
ml_score(pipe_naive,X_train,y_train)

Accuracy score: 0.8949317147192716


In [None]:
ml_score(pipe_knn,X_train,y_train)

Accuracy score: 0.8955993930197268


In [100]:
ml_score(pipe_stc,X_train,y_train)

Accuracy score: 0.8946666666666665


In [119]:
ml_score(pipe_lgbm,X_train,y_train)

Accuracy score: 0.9014843749999999


In [120]:
# Fit the pipeline
pipe_stc.fit(X_train, y_train)


In [121]:
import pickle
pickle.dump(pipe_stc,open('pipe.pkl','wb'))

In [122]:
# Assume user input
test_input = np.array([30,'admin.','married',	'university.degree',	'no',	'yes',	'no',	'cellular',	'jun',	'mon',	1,	999,	0,	'nonexistent',	-2.9,	92.963,	-40.8,	1.260,	5076.2],dtype=object).reshape(1,19)


In [123]:
pipe_stc.predict(test_input)

array([1], dtype=int64)

In [None]:
# Fit the pipeline
pipe_rfe.fit(X_train, y_train)


In [None]:

# get feature importances
importances = pipe_rfe.named_steps['randomforestclassifier'].feature_importances_

# map feature importance scores to original feature names
encoded_feature_names = pipe_rfe.named_steps['ohe'].get_feature_names(input_features=X_train.columns)
feature_importances = pd.Series(importances, index=encoded_feature_names)
feature_importances = feature_importances.groupby(feature_importances.index.str.split('_').str[0]).sum()

# sort feature importances in descending order
feature_importances_sorted = feature_importances.sort_values(ascending=False)

# print feature importances
print("Feature Importances:")
for feature, importance in feature_importances_sorted.items():
    print(f"{feature}: {importance:.4f}")


In [None]:
pipe_rfe.named_steps['ohe'].get_feature_names(input_features=X_train.columns)

KeyError: 'ohe'