In [1]:
!pip install graphviz # reliable, basic tool for tree viz 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline 
import graphviz 
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz # Classifier since response variable is categorical
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score 
! pip install dmba
from dmba import classificationSummary

Collecting dmba
  Using cached dmba-0.2.4-py3-none-any.whl.metadata (1.9 kB)
Using cached dmba-0.2.4-py3-none-any.whl (11.8 MB)
Installing collected packages: dmba
Successfully installed dmba-0.2.4
no display found. Using non-interactive Agg backend


In [2]:
xlsx = pd.ExcelFile('UniversalBank.xlsx')
bank_df = pd.read_excel(xlsx, 'Data') # reading a specific worksheet from the excel file
bank_df.info()
bank_df.columns = bank_df.columns.str.replace(' ','') # replaces spaces in the column headers 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Mortgage            5000 non-null   int64  
 8   Personal Loan       5000 non-null   int64  
 9   Securities Account  5000 non-null   int64  
 10  CD Account          5000 non-null   int64  
 11  Online              5000 non-null   int64  
 12  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(12)
memory usage: 507.9 KB


In [3]:
bank_df.drop(columns = ['ID','ZIPCode'], inplace=True) #no need to overwrite df...just replace memory 
X = bank_df.drop(columns=['PersonalLoan'])
y = bank_df['PersonalLoan'] # pandas series created (only one column)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=1)

In [5]:
fullClassTree = DecisionTreeClassifier(random_state=1) # decision tree is deterministic so why random state? 
# this SHUFFLES the features...if features in a particular order then selected in that order
# so same rand_state to ensure that the split is the same 

In [6]:
fullClassTree.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [7]:
y_predicted = fullClassTree.predict(X_test)

In [8]:
accuracy_score(y_test, y_predicted)

0.96

In [9]:
classificationSummary(y_train, fullClassTree.predict(X_train)) # likely case of OVERFITTING 

Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual    0    1
     0 4064    0
     1    0  436


In [10]:
print(fullClassTree.tree_.max_depth)
print(fullClassTree.tree_.node_count) # BIG TREE
print(fullClassTree.get_n_leaves())

27
395
198


In [11]:
classificationSummary(y_test,fullClassTree.predict(X_test))

Confusion Matrix (Accuracy 0.9600)

       Prediction
Actual   0   1
     0 448   8
     1  12  32


In [12]:
fullClassTree.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 1,
 'splitter': 'best'}

In [13]:
feature_names = list(X.columns)
text_representation = tree.export_text(fullClassTree, feature_names=feature_names)
print(text_representation)

|--- Income <= 113.50
|   |--- CCAvg <= 2.95
|   |   |--- Income <= 106.50
|   |   |   |--- class: 0
|   |   |--- Income >  106.50
|   |   |   |--- Family <= 3.50
|   |   |   |   |--- Experience <= 12.50
|   |   |   |   |   |--- Age <= 34.00
|   |   |   |   |   |   |--- Age <= 28.50
|   |   |   |   |   |   |   |--- Age <= 27.50
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- Age >  27.50
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- Age >  28.50
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- Age >  34.00
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- Experience >  12.50
|   |   |   |   |   |--- CCAvg <= 1.25
|   |   |   |   |   |   |--- CCAvg <= 1.05
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- CCAvg >  1.05
|   |   |   |   |   |   |   |--- Income <= 110.00
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- Income >  110.00
|   |   |   |   | 

In [14]:
list_int = list(y.unique())
class_names = list(map(str,list_int))

In [15]:
fig = plt.figure(figsize=(25,20))
tree.plot_tree(fullClassTree, feature_names=feature_names,class_names=class_names,
              rounded=True,filled=True)
plt.show()

In [16]:
dot_data = export_graphviz(fullClassTree,out_file='fullClassTree.dot',
                          feature_names=feature_names,class_names=class_names,
                          rounded=True,filled=True)

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
dt = DecisionTreeClassifier(random_state=42)

In [20]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [21]:
grid = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', n_jobs=1)

In [22]:
grid.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'criterion': ['gini', 'entropy'], 'max_depth': [None, 3, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...]}"
,scoring,'accuracy'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [23]:
print("Best parameters:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

Best parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best cross-validation accuracy: 0.95525


In [24]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Test accuracy: 0.954
