## Import Libraries

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier    #DecisionTreeRegressor

from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_curve, auc

## Read File

In [10]:
df = pd.read_csv('bank.csv')

In [11]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [12]:
df.shape

(11162, 17)

## Data Pre-Processing

In [13]:
def preprocessor(df):
    res_df = df.copy()
    le = preprocessing.LabelEncoder()
    
    res_df['job'] = le.fit_transform(res_df['job'])
    res_df['marital'] = le.fit_transform(res_df['marital'])
    res_df['education'] = le.fit_transform(res_df['education'])
    res_df['default'] = le.fit_transform(res_df['default'])
    res_df['housing'] = le.fit_transform(res_df['housing'])
    res_df['loan'] = le.fit_transform(res_df['loan'])
    res_df['contact'] = le.fit_transform(res_df['contact'])
    res_df['month'] = le.fit_transform(res_df['month'])
    res_df['poutcome'] = le.fit_transform(res_df['poutcome'])
    res_df['deposit'] = le.fit_transform(res_df['deposit'])
    return res_df

#get_dummies (read documentation , train and test dataset)

In [14]:
encoded_df = preprocessor(df)
encoded_df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3,1
1,56,0,1,1,0,45,0,0,2,5,8,1467,1,-1,0,3,1
2,41,9,1,1,0,1270,1,0,2,5,8,1389,1,-1,0,3,1
3,55,7,1,1,0,2476,1,0,2,5,8,579,1,-1,0,3,1
4,54,0,1,2,0,184,0,0,2,5,8,673,2,-1,0,3,1


In [15]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,primary,no,1,yes,no,cellular,20,apr,257,1,-1,0,unknown,no
11158,39,services,married,secondary,no,733,no,no,unknown,16,jun,83,4,-1,0,unknown,no
11159,32,technician,single,secondary,no,29,no,no,cellular,19,aug,156,2,-1,0,unknown,no
11160,43,technician,married,secondary,no,0,no,yes,cellular,8,may,9,2,172,5,failure,no


#### Separating x and y

In [21]:
x = encoded_df.drop('deposit',axis=1)    #creating the independent variables
y = encoded_df['deposit']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=5)

In [22]:
x.shape

(11162, 16)

In [25]:
x_train['deposit']=y_train

In [29]:
x_train2=x_train[x_train['deposit']==0]

In [30]:
x_train2.shape

(4703, 17)

In [24]:
y_train

944     1
6577    0
9128    0
1616    1
939     1
       ..
3046    1
9917    0
4079    1
2254    1
2915    1
Name: deposit, Length: 8929, dtype: int32

### Decision Tree with Cart(Gini)

###### Decision Tree with depth 4

In [12]:

#classification algorithms: instatiate the model object, it follows some random selection
#steps. In order to make these steps reproducible every single time, these models are 
#instantiated with a specific random state.

#In case of decision tree, if no max_depth is provided, a weird pretty large tree would be
#obtained. To avoid that scenario it is preferred to associate a max_depth constraint
#right at the point of instantiation of the model object

In [13]:
#Classification models are instantiated with some random_state
model_dt_4 = DecisionTreeClassifier(random_state=1,max_depth=4)
model_dt_4.fit(x_train,y_train)   #Model creation: it will learn from the train data

y_pred_4 = model_dt_4.predict(x_test)   #to test model use model.predict(x_test)
accuracy_score_4 = accuracy_score(y_test,y_pred_4)
print('Accuracy Score for model with depth 4 is: ',accuracy_score_4)

Accuracy Score for model with depth 4 is:  0.7913121361397224


###### Decision Tree with depth 6

In [14]:
model_dt_6 = DecisionTreeClassifier(random_state=1,max_depth=6)
model_dt_6.fit(x_train,y_train)

y_pred_6 = model_dt_6.predict(x_test)
accuracy_score_6 = accuracy_score(y_test,y_pred_6)
print('Accuracy Score for model with depth 6 is: ',accuracy_score_6)


Accuracy Score for model with depth 6 is:  0.8083296014330497


###### Decision Tree with depth 8

In [15]:
model_dt_8 = DecisionTreeClassifier(random_state=1,max_depth=8)
model_dt_8.fit(x_train,y_train)

y_pred_8 = model_dt_8.predict(x_test)
accuracy_score_8 = accuracy_score(y_test,y_pred_8)
print('Accuracy Score for model with depth 8 is: ',accuracy_score_8)

Accuracy Score for model with depth 8 is:  0.8172861621137483


### Decision Tree with c4.5(entropy)

In [16]:

model_dt_ent = DecisionTreeClassifier(random_state = 1, max_depth=8,criterion='entropy')
model_dt_ent.fit(x_train,y_train)

y_pred_ent = model_dt_ent.predict(x_test)

accuracy_score_4 = accuracy_score(y_test,y_pred_ent)
print('Accuracy Score for model with depth 4 is: ',accuracy_score_4)

Accuracy Score for model with depth 4 is:  0.8119122257053292


### Classification Report

In [17]:
print(classification_report(y_test,y_pred_8))

              precision    recall  f1-score   support

           0       0.85      0.80      0.82      1170
           1       0.79      0.84      0.81      1063

    accuracy                           0.82      2233
   macro avg       0.82      0.82      0.82      2233
weighted avg       0.82      0.82      0.82      2233



In [18]:
y_test

array([0, 0, 0, ..., 1, 1, 0])

In [19]:
y_pred_8

array([0, 1, 0, ..., 1, 1, 1])

## Area Under the Curve

In [21]:
#Find AUC using y_pred
fpr_dt,tpr_dt,_=roc_curve(y_test,y_pred_8)
roc_auc_dt = auc(fpr_dt,tpr_dt)
print(roc_auc_dt)
# print(accuracy_score(y_test,y_pred_8))

0.8182852111826712


In [20]:
#Find AUC using y_pred_proba
y_pred_proba = model_dt_ent.predict_proba(x_test)
fpr_dt,tpr_dt,threshold = roc_curve(y_test,y_pred_proba[:,1])    #y_pred_proba
roc_auc_dt = auc(fpr_dt,tpr_dt)
print(roc_auc_dt)
# print(accuracy_score(y_test,y_pred_8))

0.8688408069405247


## Graphical Represetation of Tree

In [22]:
from sklearn import tree
df1=pd.DataFrame(df.drop(['deposit'],axis=1))    #x or all predictor features
column_names=list(df1.columns)
column_names

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [8]:
from sklearn import tree
df1=pd.DataFrame(df.drop(['deposit'],axis=1))    #x or all predictor features
column_names=list(df1.columns)
fn=column_names    #list of features to be splitted (predictor variables)
cn=['Not deposited','deposited']   #class names or the categories of Target Variable

fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (10,8), dpi=3000)

tree.plot_tree(model_dt_4,
           feature_names = fn, 
           class_names=cn,
           filled = True);

fig.savefig('DecisionTreeComplete1.png')

NameError: name 'pd' is not defined