### Classification model to predict whether a bank customer took up an offer from the bank's marketing offering or not.

In [1]:
#Importing all the required packages
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from statistics import mean
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score


In [2]:
##importing the data from the csv file into a dataframe using pandas

df = pd.read_csv('bank.csv')
print(df.shape)

(4521, 17)


### Observations

In [3]:
df['y'].value_counts()

no     4000
yes     521
Name: y, dtype: int64

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [5]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [6]:
#We'll convert all the categorical data into numerical data for further processing
obj_df = df.select_dtypes(include=['object']).copy()

df["y"] = obj_df["y"].astype('category')
df["y"] = df["y"].cat.codes

y = df.pop('y').values
obj_df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,unemployed,married,primary,no,no,no,cellular,oct,unknown,no
1,services,married,secondary,no,yes,yes,cellular,may,failure,no
2,management,single,tertiary,no,yes,no,cellular,apr,failure,no
3,management,married,tertiary,no,yes,yes,unknown,jun,unknown,no
4,blue-collar,married,secondary,no,yes,no,unknown,may,unknown,no


In [7]:
#Using LabelEncoder to encode categorical data to numeric data
df["job"] = obj_df["job"].astype('category')
df["job"] = df["job"].cat.codes

df["marital"] = obj_df["marital"].astype('category')
df["marital"] = df["marital"].cat.codes

df["default"] = obj_df["default"].astype('category')
df["default"] = df["default"].cat.codes

df["housing"] = obj_df["housing"].astype('category')
df["housing"] = df["housing"].cat.codes

df["loan"] = obj_df["loan"].astype('category')
df["loan"] = df["loan"].cat.codes

df["contact"] = obj_df["contact"].astype('category')
df["contact"] = df["contact"].cat.codes

df["poutcome"] = obj_df["poutcome"].astype('category')
df["poutcome"] = df["poutcome"].cat.codes
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,10,1,primary,0,1787,0,0,0,19,oct,79,1,-1,0,3
1,33,7,1,secondary,0,4789,1,1,0,11,may,220,1,339,4,0
2,35,4,2,tertiary,0,1350,1,0,0,16,apr,185,1,330,1,0
3,30,4,1,tertiary,0,1476,1,1,2,3,jun,199,4,-1,0,3
4,59,1,1,secondary,0,0,1,0,2,5,may,226,1,-1,0,3


In [8]:
ord_enc = OrdinalEncoder()
df["education"] = ord_enc.fit_transform(df[["education"]])
df["month"] = ord_enc.fit_transform(df[["month"]])
df.head(11)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,10,1,0.0,0,1787,0,0,0,19,10.0,79,1,-1,0,3
1,33,7,1,1.0,0,4789,1,1,0,11,8.0,220,1,339,4,0
2,35,4,2,2.0,0,1350,1,0,0,16,0.0,185,1,330,1,0
3,30,4,1,2.0,0,1476,1,1,2,3,6.0,199,4,-1,0,3
4,59,1,1,1.0,0,0,1,0,2,5,8.0,226,1,-1,0,3
5,35,4,2,2.0,0,747,0,0,0,23,3.0,141,2,176,3,0
6,36,6,1,2.0,0,307,1,0,0,14,8.0,341,1,330,2,1
7,39,9,1,1.0,0,147,1,0,0,6,8.0,151,2,-1,0,3
8,41,2,1,2.0,0,221,1,0,2,14,8.0,57,2,-1,0,3
9,43,7,1,0.0,0,-88,1,1,0,17,0.0,313,1,147,2,0


* Normalization - Why?

In [9]:
scaler = preprocessing.MinMaxScaler()
names = df.columns
d = scaler.fit_transform(df)
scaled_df = pd.DataFrame(d, columns=names)
scaled_df.head()
X = scaled_df.values

feature_names = df.columns
scaled_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,0.161765,0.909091,0.5,0.0,0.0,0.068455,0.0,0.0,0.0,0.6,0.909091,0.024826,0.0,0.0,0.0,1.0
1,0.205882,0.636364,0.5,0.333333,0.0,0.10875,1.0,1.0,0.0,0.333333,0.727273,0.0715,0.0,0.389908,0.16,0.0
2,0.235294,0.363636,1.0,0.666667,0.0,0.06259,1.0,0.0,0.0,0.5,0.0,0.059914,0.0,0.379587,0.04,0.0
3,0.161765,0.363636,0.5,0.666667,0.0,0.064281,1.0,1.0,1.0,0.066667,0.545455,0.064548,0.061224,0.0,0.0,1.0
4,0.588235,0.090909,0.5,0.333333,0.0,0.044469,1.0,0.0,1.0,0.133333,0.727273,0.073486,0.0,0.0,0.0,1.0


### Observation: Data is biased. Majority class samples > minority class samples
* Why it is important to rectify bias?
* Methods - Upsampling and downsampling
* Why SMOTE?

In [10]:
#Using SMOTE upsampler to upsample the minority class in the dataset to rectify the bias.
sm = SMOTE(random_state=1)
X_res, y_res = sm.fit_sample(X, y)

**Below we can see that this strategy upsampled the minority class count to same as the majority class count.**

In [11]:
print("Before upsampling the training data using SMOTE")
print("Majority class No:",Counter(y)[0])
print("Minority class Yes:",Counter(y)[1])

print("\nAfter upsampling the training data using SMOTE")
print("Majority class No:",Counter(y_res)[0])
print("Minority class Yes:",Counter(y_res)[1])

Before upsampling the training data using SMOTE
Majority class No: 4000
Minority class Yes: 521

After upsampling the training data using SMOTE
Majority class No: 4000
Minority class Yes: 4000


In [12]:
model_d = {}

model_d['dtree'] = DecisionTreeClassifier(criterion='entropy')    
model_d['kNN'] = KNeighborsClassifier(n_neighbors=3) 
model_d['logistic'] = LogisticRegression(max_iter=10000, solver = 'sag')
model_d['gb'] = GradientBoostingClassifier()

In [13]:
for m in model_d:
    f1_scores = cross_val_score(model_d[m],X_res, y_res, cv=10,scoring='f1')
    print("F1 Score of {:22} :  {:.2f}".format(type(model_d[m]).__name__, f1_scores.mean()))

F1 Score of DecisionTreeClassifier :  0.90
F1 Score of KNeighborsClassifier   :  0.92
F1 Score of LogisticRegression     :  0.81
F1 Score of GradientBoostingClassifier :  0.91
