# Sharanya Manohar 

## Importing libraries

In [114]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif, VarianceThreshold
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_recall_curve,classification_report,roc_curve, confusion_matrix
import json,pickle

pd.set_option("display.max_columns",None)

In [68]:
df = pd.read_csv("bank.txt", sep=';')

In [69]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


# Data cleaning

In [70]:
for i in df.columns:
    df[i]=np.where(df[i]=='unknown',np.nan,df[i])

In [71]:
df.isnull().sum()

age             0
job            38
marital         0
education     187
default         0
balance         0
housing         0
loan            0
contact      1324
day             0
month           0
duration        0
campaign        0
pdays           0
previous        0
poutcome     3705
y               0
dtype: int64

In [72]:
#It contains lots of null values in poutcome and contact come so lets check with the percentage
print("percentage of null values of poutcome column:",(3705/df.poutcome.shape[0])*100)
print("percentage of null values of contact column:",(1324/df.contact.shape[0])*100)

percentage of null values of poutcome column: 81.95089581950896
percentage of null values of contact column: 29.285556292855563


In [73]:
#Using ffill method
df["job"].fillna(method = "ffill",inplace=True)
df["education"].fillna(method = "ffill",inplace= True)

In [74]:
#Dropping the contact,poutcome,month,day column as it will not affect the result
df.drop("poutcome", inplace = True, axis = 1)
df.drop("contact", inplace = True, axis = 1)
df.drop("month", inplace = True, axis = 1)
df.drop("day", inplace = True, axis = 1)

In [75]:
#Removed all the null values
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
duration     0
campaign     0
pdays        0
previous     0
y            0
dtype: int64

In [76]:
df.dtypes

age          float64
job           object
marital       object
education     object
default       object
balance      float64
housing       object
loan          object
duration     float64
campaign     float64
pdays        float64
previous     float64
y             object
dtype: object

In [77]:
#Lets use some encoding techniques to change categorical to numerical values
df["education"].replace({'primary':0, 'secondary':1, 'tertiary':2},inplace = True)
df["default"].replace({'no':0, 'yes':1},inplace = True)
df["marital"].replace({'single':0, 'married':1, 'divorced':2},inplace = True)
df["housing"].replace({'no':0, 'yes':1},inplace = True)
df["loan"].replace({'no':0, 'yes':1},inplace = True)

In [79]:
df['age']=df['age'].astype(int)

In [84]:
df['job'].unique()

array(['unemployed', 'services', 'management', 'blue-collar',
       'self-employed', 'technician', 'entrepreneur', 'admin.', 'student',
       'housemaid', 'retired'], dtype=object)

In [90]:
from scipy.stats import chi2_contingency
#Testing the relationship
chi_res = chi2_contingency(pd.crosstab(df['job'], df['y']))
print('Chi2 Statistic: {}, p-value: {}'.format(chi_res[0], chi_res[1]))

Chi2 Statistic: 69.93464936845214, p-value: 4.564481037856792e-11


In [91]:
#As job column is dropped due to large p value
df.drop("job", inplace = True, axis = 1)

In [88]:
label = LabelEncoder()
df["y"] =  label.fit_transform(df["y"])

In [92]:
df.dtypes

age            int32
marital        int64
education      int64
default        int64
balance      float64
housing        int64
loan           int64
duration     float64
campaign     float64
pdays        float64
previous     float64
y              int32
dtype: object

In [95]:
vif_data = pd.DataFrame()
vif_data["feature"] = df.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(df.values, i)
                          for i in range(len(df.columns))]
  
print(vif_data)

      feature       VIF
0         age  7.527786
1     marital  3.611895
2   education  3.193719
3     default  1.025896
4     balance  1.254416
5     housing  2.140307
6        loan  1.191050
7    duration  2.355792
8    campaign  1.774713
9       pdays  1.776002
10   previous  1.667780
11          y  1.407064


As vif score is less than 5 except age 

In [100]:
column= df[['marital', 'education', 'default', 'housing', 'loan',
        'campaign', 'previous']]
chi, p_val = chi2(column,df["y"])
s = pd.DataFrame({"Chi2":np.around(chi,2), "P_val": np.around(p_val,2)}, index = column.columns)
s

Unnamed: 0,Chi2,P_val
marital,0.43,0.51
education,5.14,0.02
default,0.01,0.93
housing,21.5,0.0
loan,19.05,0.0
campaign,58.5,0.0
previous,325.48,0.0


Chi2 test stats that marital and default features are not important for the model.
chi2 is low and p value is high (means variable is independent from the target feature)

In [102]:
df.drop("marital",inplace = True,axis=1)

In [103]:
df.head()

Unnamed: 0,age,education,default,balance,housing,loan,duration,campaign,pdays,previous,y
0,30,0,0,1787.0,0,0,79.0,1.0,-1.0,0.0,0
1,33,1,0,4789.0,1,1,220.0,1.0,339.0,4.0,0
2,35,2,0,1350.0,1,0,185.0,1.0,330.0,1.0,0
3,30,2,0,1476.0,1,1,199.0,4.0,-1.0,0.0,0
4,59,1,0,0.0,1,0,226.0,1.0,-1.0,0.0,0


# Model building

In [111]:
x = df.drop("y",axis = 1)
y = df["y"]

In [112]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [115]:
std = StandardScaler()
X_train = std.fit_transform(x_train)
X_test = std.fit_transform(x_test)

In [119]:
lst = [("LogisticRegression",LogisticRegression()),
       ("KNN Classifier", KNeighborsClassifier()),
       ("RandomForest",RandomForestClassifier()),
      ("AdaBoost",AdaBoostClassifier()),
      ("XGBoost",XGBClassifier())]
for i,j in lst:
    j.fit(X_train,y_train)
    y1 = j.predict(X_train)
    accuracy = accuracy_score(y_train,y1)
    y2 =j.predict(X_test)
    acc_te = accuracy_score(y_test,y2)
    print(f"For {i}::\nThe Training Accuracy is: {accuracy}\nThe Testing Accuracy is: {acc_te}")
    print("--"*40)

For LogisticRegression::
The Training Accuracy is: 0.8879977876106194
The Testing Accuracy is: 0.8928176795580111
--------------------------------------------------------------------------------
For KNN Classifier::
The Training Accuracy is: 0.9128871681415929
The Testing Accuracy is: 0.8828729281767956
--------------------------------------------------------------------------------
For RandomForest::
The Training Accuracy is: 1.0
The Testing Accuracy is: 0.887292817679558
--------------------------------------------------------------------------------
For AdaBoost::
The Training Accuracy is: 0.8932522123893806
The Testing Accuracy is: 0.8939226519337017
--------------------------------------------------------------------------------
For XGBoost::
The Training Accuracy is: 0.9883849557522124
The Testing Accuracy is: 0.8895027624309392
--------------------------------------------------------------------------------


# Model evaluation

In [124]:
model=KNeighborsClassifier()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [126]:
conmat = confusion_matrix(y_test, y_pred)
val = np.mat(conmat) 
classnames = list(set(y_train))

df_cm = pd.DataFrame(

        val, index=classnames, columns=classnames, 

    )

print(df_cm)

     0   1
0  775  32
1   74  24
