# Data Loading

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
# Read the raw CSV and print its first rows as plain text.
df = pd.read_csv('bank.csv')
print(df.head())

# Columns that are inspected here and one-hot encoded later in the notebook.
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]

# List the distinct values found in each of those columns.
line_template = "Column '{cat}':{values}"
for cat in categorical_columns:
    distinct_values = df[cat].unique()
    print(line_template.format(cat=cat, values=distinct_values))

   age         job  marital  education default  balance housing loan  contact  \
0   59      admin.  married  secondary      no     2343     yes   no  unknown   
1   56      admin.  married  secondary      no       45      no   no  unknown   
2   41  technician  married  secondary      no     1270     yes   no  unknown   
3   55    services  married  secondary      no     2476     yes   no  unknown   
4   54      admin.  married   tertiary      no      184      no   no  unknown   

   day month  duration  pdays  previous poutcome deposit  
0    5   may      1042     -1         0  unknown     yes  
1    5   may      1467     -1         0  unknown     yes  
2    5   may      1389     -1         0  unknown     yes  
3    5   may       579     -1         0  unknown     yes  
4    5   may       673     -1         0  unknown     yes  
Column 'job':['admin.' 'technician' 'services' 'management' 'retired' 'blue-collar'
 'unemployed' 'entrepreneur' 'housemaid' 'unknown' 'self-employed'
 'studen

In [2]:
# Binary target: 1 where the 'deposit' column equals 'yes', 0 for anything else.
df['class'] = df['deposit'].eq('yes').astype('int64')

In [3]:
# Preview the first five rows, now including the derived 'class' column.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,pdays,previous,poutcome,deposit,class
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,-1,0,unknown,yes,1
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,-1,0,unknown,yes,1
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,-1,0,unknown,yes,1
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,-1,0,unknown,yes,1
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,-1,0,unknown,yes,1


In [4]:
# Expand every categorical column into one indicator column per distinct value.
df_new = pd.get_dummies(data=df, columns=categorical_columns)
df_new.head(n=5)

Unnamed: 0,age,balance,day,duration,pdays,previous,deposit,class,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,59,2343,5,1042,-1,0,yes,1,1,0,...,0,0,1,0,0,0,0,0,0,1
1,56,45,5,1467,-1,0,yes,1,1,0,...,0,0,1,0,0,0,0,0,0,1
2,41,1270,5,1389,-1,0,yes,1,0,0,...,0,0,1,0,0,0,0,0,0,1
3,55,2476,5,579,-1,0,yes,1,0,0,...,0,0,1,0,0,0,0,0,0,1
4,54,184,5,673,-1,0,yes,1,1,0,...,0,0,1,0,0,0,0,0,0,1


In [5]:
# Features: every encoded column except the raw label and its numeric form.
x = df_new.drop(columns=['deposit', 'class'])
# Target: the 0/1 label derived from 'deposit'.
y = df['class']

# Split the Dataset into Train and Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_tr, x_test, y_tr, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Training Model - Logistic Regression

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
import warnings

# Silence only FutureWarning noise; a blanket "ignore" would also hide
# convergence and data-quality warnings raised while fitting.
warnings.filterwarnings("ignore", category=FutureWarning)

# L2-regularised logistic regression with class weights balanced by frequency.
classifier = LogisticRegression(C=0.1, penalty='l2', class_weight='balanced')
classifier.fit(x_tr, y_tr)

# Evaluate on the held-out split.
y_pred = classifier.predict(x_test)
print(classification_report(y_test, y_pred))
print('Training accuracy:', classifier.score(x_tr, y_tr))
print('Test accuracy:', classifier.score(x_test, y_test))

# Store the matrix under its own name: rebinding `confusion_matrix` would
# replace the imported function with an array and break any later call to it.
cm = confusion_matrix(y_test, y_pred)

sns.set(font_scale=1.4)
# fmt="d" prints the cell counts as integers instead of the default
# two-significant-digit format (e.g. 9.6e+02).
ax = sns.heatmap(cm, annot=True, fmt="d", annot_kws={"size": 16})
# Rows of the matrix are true labels, columns are predictions.
ax.set(xlabel="Predicted class", ylabel="True class", title="Confusion matrix");

              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1166
           1       0.81      0.80      0.80      1067

   micro avg       0.81      0.81      0.81      2233
   macro avg       0.81      0.81      0.81      2233
weighted avg       0.81      0.81      0.81      2233

Training accuracy: 0.833351999104043
Test accuracy: 0.8137035378414689


<matplotlib.axes._subplots.AxesSubplot at 0x33b77ca780>

# Saving the Model

In [8]:
import pickle

# Serialise the fitted model; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('classifier.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)