In [1]:
import pandas as pd
fp = "../data/sba_training_resampled.csv"
df = pd.read_csv(fp)

In [2]:
preds = df.columns.tolist()
preds.remove("LoanStatus")

In [3]:
X = df[preds]
y = df["LoanStatus"]

In [4]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
param_grid = { 'criterion':['gini','entropy'],'max_depth': np.arange(3, 15)}
    # decision tree model
dtree_model=DecisionTreeClassifier()
    #use gridsearch to test all values
dtree_gscv = GridSearchCV(dtree_model, param_grid, cv=5)
    #fit model to data
dtree_gscv.fit(X, y)
dtree_gscv.best_params_

{'criterion': 'gini', 'max_depth': 3}

In [5]:
dtree_gscv.best_score_

0.8898113207547169

In [6]:
model = dtree_gscv.best_estimator_

In [7]:
fp_test = "../data/sba_test_dataset.csv"
df_test = pd.read_csv(fp)
Xt = df_test[preds]
yt = df_test["LoanStatus"]

In [8]:
yp = model.predict(Xt)

In [9]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(yt, yp)

0.8887195121951219

In [10]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(yt, yp))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.78      1.00      0.87      0.88      0.76      1312
        1.0       0.82      1.00      0.78      0.90      0.88      0.79      1338

avg / total       0.91      0.89      0.89      0.89      0.88      0.78      2650



In [11]:
c_10 = 169040.98 # false_positive_cost 
c_01 = 199617.34 # false_negative_ cost
pstar = c_01/(c_10 + c_01)
print('Threshold value of probability is : {:.{prec}f}'.format(pstar, prec=2))

Threshold value of probability is : 0.54


In [12]:
df_test["p_1"] = model.predict_proba(Xt)[:,1]

In [13]:
df_test["yp_opt"] = df_test["p_1"].apply(lambda x: 1 if x > pstar else 0)

In [14]:
print(classification_report_imbalanced(yt, df_test["yp_opt"]))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.78      1.00      0.87      0.88      0.76      1312
        1.0       0.82      1.00      0.78      0.90      0.88      0.79      1338

avg / total       0.91      0.89      0.89      0.89      0.88      0.78      2650



In [15]:
feat_importance = model.tree_.compute_feature_importances(normalize=True)
print("feat importance = " + str(feat_importance))

feat importance = [0.         0.         0.0082509  0.         0.         0.06322006
 0.92852904]


In [16]:
[(preds[x], feat_importance[x]) for x in np.argsort(-feat_importance)]

[('D_BorrZip_ND', 0.9285290389654883),
 ('D_BankZip_ND', 0.063220063094055),
 ('TermInMonths', 0.008250897940456716),
 ('JobsSupported', 0.0),
 ('InitialInterestRate', 0.0),
 ('GrossApproval', 0.0),
 ('SBAGuaranteedApproval', 0.0)]