In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report

In [12]:
# Loading data
df = pd.read_csv('bank.csv', sep=';')
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [13]:
# Data Pre-Processing
def balanceator(x):
    if x < 75:
        return 'Class E'
    elif x >= 75 and x < 448:
        return 'Class D'
    elif x >= 448 and x < 1428:
        return 'Class C'
    elif x >= 1428 and x < df['balance'].quantile(0.99):
        return 'Class B'
    else:
        return 'Class A'

In [16]:
def wrangler(path):
    df = pd.read_csv(path,sep=';') # To read the csv file
    df['y'] = df['y'].apply(lambda x: True if x== 'yes'  else False) # Change yes or no to True or False
    df['default'] = df['default'].apply(lambda x: True if x == 'yes' else False) 
    df['balance'] = df['balance'].apply(lambda x:balanceator(x))
    df['housing'] = df['housing'].apply(lambda x: True if x == 'yes' else False)
    df['loan'] = df['loan'].apply(lambda x: True if x == 'yes' else False)
    df['previous'] = df['previous'].apply(lambda x: True if x != 0 else False) # Change object output to bool for visualization and modeling purpuses
    
    #drop columns:
    to_drop =['day', 'poutcome', 'pdays'] 
    df.drop(columns= to_drop, inplace=True)
    
    
    return df

In [18]:
df_new = wrangler('bank.csv')
df_new.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,month,duration,campaign,previous,y
0,58,management,married,tertiary,False,Class B,True,False,unknown,may,261,1,False,False
1,44,technician,single,secondary,False,Class E,True,False,unknown,may,151,1,False,False
2,33,entrepreneur,married,secondary,False,Class E,True,True,unknown,may,76,1,False,False
3,47,blue-collar,married,unknown,False,Class B,True,False,unknown,may,92,1,False,False
4,33,unknown,single,unknown,False,Class E,False,False,unknown,may,198,1,False,False


In [19]:
X = df_new.drop(columns=['duration','y'])
y = df_new['y']

In [20]:
oe = OrdinalEncoder()
X = oe.fit_transform(X)

In [22]:
#Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

In [23]:
dt = GridSearchCV(DecisionTreeClassifier(random_state=42), {}, n_jobs=-1, cv=10, refit="recall")
dt.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,{}
,scoring,
,n_jobs,-1
,refit,'recall'
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [25]:
pred = dt.predict(X_test)

In [27]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

       False       0.90      0.90      0.90      5989
        True       0.27      0.27      0.27       793

    accuracy                           0.83      6782
   macro avg       0.58      0.59      0.59      6782
weighted avg       0.83      0.83      0.83      6782



In [28]:
params_dt = {
    "max_depth": [5, 10, 15, 20, 25, 30, None], # Maximum depth of the decision tree
    "criterion": ["gini","entropy"], # The quality criterion to measure the information gain when splitting nodes
    "min_samples_split": [2,3], # Minimum number of samples required to split an internal node
    "min_samples_leaf": [1,2] # Minimum number of samples required to be at a leaf node
}

In [29]:
model_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42), # Define the Decision Tree model
    params_dt, # Pass in the hyperparameters to be tuned from the dictionary we defined earlier
    cv=10, # Set the number of folds for cross-validation
    verbose=2
)

In [30]:
model_dt.fit(X_train, y_train)


Fitting 10 folds for each of 56 candidates, totalling 560 fits
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, ...], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 3]}"
,scoring,
,n_jobs,
,refit,True
,cv,10
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [32]:
pred_dt = model_dt.predict(X_test)

In [33]:
print (classification_report(pred_dt, y_test))

              precision    recall  f1-score   support

       False       0.99      0.89      0.94      6625
        True       0.09      0.48      0.16       157

    accuracy                           0.88      6782
   macro avg       0.54      0.68      0.55      6782
weighted avg       0.97      0.88      0.92      6782

