In [1]:
## ======================================================================
#            Importing the necessary modules and tools
## ======================================================================

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


# from sklearn.metrics import necessary metrics
from sklearn.metrics import mean_squared_error as MSE


# Notebook display options
# ------------------------
# Render floats with thousands separators and three decimal places.
pd.set_option('display.float_format', '{:,.3f}'.format)
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Location of the loan dataset (CSV file served from GitHub).
url = 'https://raw.githubusercontent.com/DrSaadLa/PythonTuts/main/TreeBasedModels/loan_data.csv'

# Fetch the CSV and parse it into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Distinct values taken by the 'credit.policy' column.
credit_policy = df['credit.policy']
credit_policy.unique()

array([1, 0])

In [4]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [5]:
# Summary statistics of the numeric columns, transposed so that each row
# describes one column of df.
summary_stats = df.describe()
summary_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
credit.policy,9578.0,0.805,0.396,0.0,1.0,1.0,1.0,1.0
int.rate,9578.0,0.123,0.027,0.06,0.104,0.122,0.141,0.216
installment,9578.0,319.089,207.071,15.67,163.77,268.95,432.762,940.14
log.annual.inc,9578.0,10.932,0.615,7.548,10.558,10.929,11.291,14.528
dti,9578.0,12.607,6.884,0.0,7.212,12.665,17.95,29.96
fico,9578.0,710.846,37.971,612.0,682.0,707.0,737.0,827.0
days.with.cr.line,9578.0,4560.767,2496.93,178.958,2820.0,4139.958,5730.0,17639.958
revol.bal,9578.0,16913.964,33756.19,0.0,3187.0,8596.0,18249.5,1207359.0
revol.util,9578.0,46.799,29.014,0.0,22.6,46.3,70.9,119.0
inq.last.6mths,9578.0,1.577,2.2,0.0,0.0,1.0,2.0,33.0


In [6]:
# Checking for missing values: one boolean per column, True when the column
# contains at least one null.
missing_by_column = df.isnull().any()
print(missing_by_column)

credit.policy        False
purpose              False
int.rate             False
installment          False
log.annual.inc       False
dti                  False
fico                 False
days.with.cr.line    False
revol.bal            False
revol.util           False
inq.last.6mths       False
delinq.2yrs          False
pub.rec              False
not.fully.paid       False
dtype: bool


In [7]:
# Pairwise relationship plots for df, colored by the 'credit.policy' column.
pair_grid = sns.pairplot(df, hue='credit.policy', palette='crest')
plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [8]:
# Data preprocessing
from sklearn.preprocessing import LabelEncoder

# Replace the string-valued 'purpose' column with integer codes.
# The fitted encoder is kept so the codes can be mapped back to labels.
purpose_encoder = LabelEncoder()
purpose_codes = purpose_encoder.fit_transform(df['purpose'])
df['purpose'] = purpose_codes

In [9]:
# Column the classifiers are trained to predict.
target_column = 'credit.policy'

# Target
y = df[target_column]
# Features: every column of df except the target
X = df.drop(columns=[target_column])


In [10]:
# train_test_split() lives in sklearn.model_selection
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the split is stratified on y and
# seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [11]:

# import DecisionTreeClassifier from sklearn.tree 
from sklearn.tree import DecisionTreeClassifier

# from sklearn.metrics import necessary metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [12]:
# Baseline tree using the entropy criterion (no other hyperparameters set),
# fitted on the training split. fit() returns the estimator itself.
dtree = DecisionTreeClassifier(random_state=10, criterion='entropy').fit(X_train, y_train)

# Class predictions for the held-out test rows.
preds = dtree.predict(X_test)

In [13]:
## =======================================================
#           Tune the decision tree hyperparameters
## =======================================================

# Import roc_auc_score from sklearn.metrics
from sklearn.metrics import roc_auc_score

# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: maximum tree depth and minimum leaf size.
# Float values of min_samples_leaf are interpreted by scikit-learn as a
# fraction of the training samples.
params_dt = {
    'max_depth': [2, 3, 4, 6],
    'min_samples_leaf': [0.08, 0.1, 0.12, 0.14, 0.16, 0.18],
}

# 10-fold cross-validated grid search that ranks candidates by ROC AUC.
# n_jobs=-1 uses all available cores instead of a hard-coded worker count.
grid_dt = GridSearchCV(estimator=dtree,
                       param_grid=params_dt,
                       scoring='roc_auc',
                       cv=10,
                       n_jobs=-1)

# Fit the GridSearchCV object
grid_dt.fit(X_train, y_train)

# Extract best hyperparameters from 'grid_dt' and print them
best_hyperparams = grid_dt.best_params_

# Center the header text only; the original centered a string containing a
# newline, which emitted a whitespace-only line.
print('Best hyperparameters:'.center(50))
print("=" * 50)
print(best_hyperparams)
print("*" * 50)

# Extract best CV score from 'grid_dt'.
# best_score_ is measured with the `scoring` metric, i.e. ROC AUC here
# (not accuracy), so it is labelled accordingly.
best_CV_score = grid_dt.best_score_

print('Best CV ROC AUC {}'.format(best_CV_score))

# Extract best model from 'grid_dt'
best_model = grid_dt.best_estimator_

estimator_header = "The best estimator is:"
print("*" * 50)
print(estimator_header)
print("-" * len(estimator_header))
print(best_model)
print("*" * 50)

# Predict the test set probabilities of the positive class (column 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Compute test_roc_auc from the predicted probabilities
test_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print test_roc_auc
print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))
print("=" * 50)

         Best hyerparameters:
          
{'max_depth': 6, 'min_samples_leaf': 0.08}
**************************************************
Best CV accuracy 0.9334411584114418
**************************************************
The best estimator is:
----------------------
DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_leaf=0.08,
                       random_state=10)
**************************************************
Test set ROC AUC score: 0.936


In [14]:
## ============================================================
#            Training another model (Gini, tuned for accuracy)
## ============================================================

# Gini-based tree with default hyperparameters, fitted on the training split.
# NOTE(review): GridSearchCV clones its estimator, so this fit looks redundant
# for the search below -- kept in case `dtree2` is used elsewhere.
dtree2 = DecisionTreeClassifier(criterion='gini',
                                random_state=10)
dtree2.fit(X_train, y_train)

# Define param_grid: depths 1..14 and 20 evenly spaced leaf-size fractions
# between 0.01 and 0.2.
params_dt = {
    'max_depth': np.arange(1, 15),
    'min_samples_leaf': np.linspace(0.01, 0.2, 20),
}

# 10-fold cross-validated grid search that ranks candidates by accuracy.
# n_jobs=-1 uses all available cores instead of a hard-coded worker count.
grid_dt2 = GridSearchCV(estimator=dtree2,
                        param_grid=params_dt,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1)

# Fit the GridSearchCV object
grid_dt2.fit(X_train, y_train)

# Extract best hyperparameters from 'grid_dt2' and print them
best_hyperparams2 = grid_dt2.best_params_

# Center the header text only; the original centered a string containing a
# newline, which emitted a whitespace-only line.
print('Best hyperparameters:'.center(50))
print("=" * 50)
print(best_hyperparams2)
print("*" * 50)

# Extract best CV score from 'grid_dt2' (accuracy, per `scoring` above)
best_CV_score2 = grid_dt2.best_score_

print('Best CV accuracy {}'.format(best_CV_score2))

# Extract best model from 'grid_dt2'
best_model2 = grid_dt2.best_estimator_

estimator_header = "The best estimator is:"
print("*" * 50)
print(estimator_header)
print("-" * len(estimator_header))
print(best_model2)
print("*" * 50)

# Predict the test set class labels (not probabilities)
preds2 = best_model2.predict(X_test)

# Compute the test set accuracy. The original stored this value under the
# name `test_auc2` and printed it as a ROC AUC score; the old name is kept
# as an alias so existing references keep working.
test_accuracy2 = accuracy_score(y_test, preds2)
test_auc2 = test_accuracy2

# Print the test set accuracy
print('Test set accuracy: {:.3f}'.format(test_accuracy2))
print("=" * 50)

         Best hyerparameters:
          
{'max_depth': 5, 'min_samples_leaf': 0.01}
**************************************************
Best CV accuracy 0.9812053740240673
**************************************************
The best estimator is:
----------------------
DecisionTreeClassifier(max_depth=5, min_samples_leaf=0.01, random_state=10)
**************************************************
Test set ROC AUC score: 0.981


In [15]:
# Metrics used to evaluate the tuned model on the test set
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Per-class precision / recall / F1 for the class predictions in preds2.
report_text = classification_report(y_test, preds2)
print(report_text)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       561
           1       0.99      0.99      0.99      2313

    accuracy                           0.98      2874
   macro avg       0.97      0.97      0.97      2874
weighted avg       0.98      0.98      0.98      2874

