In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc

In [2]:
# Load the bank dataset; the relative path is resolved against the working directory.
bank_df = pd.read_csv('bank.csv')
# Column names as a NumPy array, shown as this cell's output.
temp = bank_df.columns.values
temp

array(['age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration',
       'campaign', 'pdays', 'previous', 'poutcome', 'deposit'],
      dtype=object)

In [3]:
# Preview the first rows of the raw data.
bank_df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [4]:
# List the object-dtype columns, i.e. the ones preprocessor() will label-encode.
bank_df.select_dtypes(include=object).columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'deposit'],
      dtype='object')

In [5]:
# Pre-processing
def preprocessor(df):
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    Each column selected by ``select_dtypes(include=object)`` is replaced by
    the codes produced by ``LabelEncoder.fit_transform``; all other columns
    are carried over unchanged. The input frame itself is not modified.

    Note: a single encoder is re-fitted for each column in turn, so the
    per-column mappings are not available after this function returns.
    """
    encoded = df.copy()
    encoder = preprocessing.LabelEncoder()
    categorical_cols = encoded.select_dtypes(include=object).columns
    for col in categorical_cols:
        # fit_transform re-fits the shared encoder on this column's values.
        encoded[col] = encoder.fit_transform(encoded[col])
    return encoded
    

In [6]:
# Label-encode the object-dtype columns; preprocessor works on a copy, so bank_df is untouched.
procdf = preprocessor(bank_df)

In [7]:
# Preview the encoded frame: the former string columns now hold integer codes.
procdf.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3,1
1,56,0,1,1,0,45,0,0,2,5,8,1467,1,-1,0,3,1
2,41,9,1,1,0,1270,1,0,2,5,8,1389,1,-1,0,3,1
3,55,7,1,1,0,2476,1,0,2,5,8,579,1,-1,0,3,1
4,54,0,1,2,0,184,0,0,2,5,8,673,2,-1,0,3,1


In [8]:
# Separate the encoded frame into the target vector ('deposit') and the
# feature matrix (every other column), both as NumPy arrays.
y = procdf['deposit'].values
x = procdf.drop(columns=['deposit']).values

In [9]:
# Inspect the feature matrix (every column except 'deposit').
x

array([[ 59,   0,   1, ...,  -1,   0,   3],
       [ 56,   0,   1, ...,  -1,   0,   3],
       [ 41,   9,   1, ...,  -1,   0,   3],
       ...,
       [ 32,   9,   2, ...,  -1,   0,   3],
       [ 43,   9,   1, ..., 172,   5,   0],
       [ 34,   9,   1, ...,  -1,   0,   3]])

In [10]:
# Inspect the target vector (the encoded 'deposit' column).
y

array([1, 1, 1, ..., 0, 0, 0])

In [11]:
# Split into train and test sets (80% / 20%).
# random_state pins the shuffle so the split, and every score computed from
# it, is identical on each Restart & Run All. The seed matches the one
# already used for the decision trees in this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=7
)

In [12]:
# Decision tree model - max_depth=2
# fit() returns the fitted estimator, so construction and training are chained.
model_dt_2 = DecisionTreeClassifier(max_depth=2, random_state=7).fit(x_train, y_train)

# Accuracy on the training split and on the held-out test split.
model_dt_2_score_train = model_dt_2.score(x_train, y_train)
model_dt_2_score_test = model_dt_2.score(x_test, y_test)

print('training accuracy---',model_dt_2_score_train)
print('testing accuracy---',model_dt_2_score_test)

training accuracy--- 0.7480120954194198
testing accuracy--- 0.741603224361845


In [13]:
# Decision tree model - max_depth=4
# fit() returns the fitted estimator, so construction and training are chained.
model_dt_4 = DecisionTreeClassifier(max_depth=4, random_state=7).fit(x_train, y_train)

# Accuracy on the training split and on the held-out test split.
model_dt_4_score_train = model_dt_4.score(x_train, y_train)
model_dt_4_score_test = model_dt_4.score(x_test, y_test)

print('training accuracy---',model_dt_4_score_train)
print('testing accuracy---',model_dt_4_score_test)

training accuracy--- 0.7838503751819913
testing accuracy--- 0.7792207792207793


In [14]:
# Decision tree model - max_depth=8, Gini impurity as the split criterion
# fit() returns the fitted estimator, so construction and training are chained.
model_dt_8 = DecisionTreeClassifier(
    criterion="gini", max_depth=8, random_state=7
).fit(x_train, y_train)

# Accuracy on the training split and on the held-out test split.
model_dt_8_score_train = model_dt_8.score(x_train, y_train)
model_dt_8_score_test = model_dt_8.score(x_test, y_test)

print('training accuracy---',model_dt_8_score_train)
print('testing accuracy---',model_dt_8_score_test)

training accuracy--- 0.8558629185799081
testing accuracy--- 0.8217644424540976


In [15]:
# Decision tree model - max_depth=8, entropy as the split criterion.
# random_state is pinned to the same seed as the other trees in this notebook:
# without it, ties between equally good splits are broken randomly, so the
# fitted tree and the probabilities below could differ from run to run.
model_dt = DecisionTreeClassifier(random_state=7, max_depth=8, criterion="entropy")
model_dt.fit(x_train, y_train)
# Column 1 of predict_proba is the predicted probability of class 1
# (deposit == 'yes' after label encoding). Despite the name, y_pred_dt holds
# probabilities, not hard class labels.
y_pred_dt = model_dt.predict_proba(x_test)[:, 1]
y_pred_dt

array([0.97142857, 0.94285714, 0.73333333, ..., 0.89527027, 0.41445783,
       0.        ])