In [2]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
import math
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_val_score
import category_encoders as ce

In [4]:
os.chdir(r'C:\Users\salil\Desktop')

In [5]:
credit = pd.read_excel('CreditData_RareEvent.xlsx')

In [6]:
credit.head()

Unnamed: 0,good_bad,age,amount,duration,checking,coapp,depends,employed,existcr,foreign,history,housing,installp,job,marital,other,property,resident,savings,telephon
0,good,67,1169,6,1,1,1,5,2,1,4,2,4,3,3,3,1,4,5,2
1,good,67,1169,6,1,1,1,5,2,1,4,2,4,3,3,3,1,4,5,2
2,good,67,1169,6,1,1,1,5,2,1,4,2,4,3,3,3,1,4,5,2
3,good,67,1169,6,1,1,1,5,2,1,4,2,4,3,3,3,1,4,5,2
4,good,67,1169,6,1,1,1,5,2,1,4,2,4,3,3,3,1,4,5,2


In [7]:
credit.tail()

Unnamed: 0,good_bad,age,amount,duration,checking,coapp,depends,employed,existcr,foreign,history,housing,installp,job,marital,other,property,resident,savings,telephon
10495,bad,49,8386,30,2,1,1,4,1,1,4,2,2,3,3,3,2,2,1,1
10496,bad,33,4844,48,4,1,1,1,1,1,2,1,3,4,3,1,3,2,1,2
10497,bad,33,4844,48,4,1,1,1,1,1,2,1,3,4,3,1,3,2,1,2
10498,bad,26,8229,36,1,1,2,3,1,1,2,2,2,3,3,3,2,2,1,1
10499,bad,26,8229,36,1,1,2,3,1,1,2,2,2,3,3,3,2,2,1,1


In [11]:
#Let's check the missing values in each column of the data frame
credit.isnull().sum()

good_bad    0
age         0
amount      0
duration    0
checking    0
coapp       0
depends     0
employed    0
existcr     0
foreign     0
history     0
housing     0
installp    0
job         0
marital     0
other       0
property    0
resident    0
savings     0
telephon    0
dtype: int64

In [12]:
#There are no missing values in the whole data frame

In [15]:
#let us check instances of "good" & "bad" in the good_bad column
(credit['good_bad'] == "good").sum()

10000

In [16]:
(credit['good_bad'] == "bad").sum()

500

In [17]:
#500 / 10000 = 5% of the total instances are "bad"
#If we fit a regular classification model, this will lead to Python recognizing "bad" as errors
#So we need some modification to the model and thus we use Random Undersampling

In [18]:
#Before proceesing to the model, we need to clean the data
#We map and substitute "good" for 1 and "bad" for 0
mapping={'good':1,'bad':0}
credit['good_bad']=credit['good_bad'].map(mapping)

In [21]:
#Now, except for "age" & "duration", all others are categorical variables. 
#So, we encode these variables and then proceed to fit out model
ce_onehot=ce.OneHotEncoder(cols=['checking','coapp','depends',\
                                  'employed','existcr','foreign','history','housing',\
                                  'installp','job','marital','other','property',\
                                  'resident','savings','telephon'])

x=credit.drop('good_bad',axis=1)
y=np.asarray(credit['good_bad'])
X=np.asarray(ce_onehot.fit_transform(x))

In [22]:
#Let's assign cost for False Positives & False Negatives 
fp_cost = np.array(credit['amount'])
fn_cost = np.array(0.15*credit['amount'])

In [23]:
score_list = ['accuracy', 'recall', 'precision', 'f1']
search_depths = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]

In [25]:
#Let us fit a Decision Tree Classifier to the Dataframe
for d in search_depths:
    dtc1=DecisionTreeClassifier(criterion='gini',max_depth=d,min_samples_split=5,\
                                                min_samples_leaf=5)
    mean_score=[]
    std_score=[]
    print("max_depth",d)
    print("{:.<13s}{:>6s}{:>13s}".format("Metric", "Mean", "Std. Dev."))
    for s in score_list:
        dtc_10=cross_val_score(dtc1,X,y,scoring=s,cv=10)
        mean = dtc_10.mean()
        std = dtc_10.std()
        mean_score.append(mean)
        std_score.append(std)
        print("{:.<13s}{:>7.4f}{:>10.4f}".format(s, mean, std))


max_depth 2
Metric.......  Mean    Std. Dev.
accuracy..... 0.9492    0.0052
recall....... 0.9957    0.0053
precision.... 0.9531    0.0016
f1........... 0.9739    0.0027
max_depth 3
Metric.......  Mean    Std. Dev.
accuracy..... 0.9403    0.0161
recall....... 0.9890    0.0108
precision.... 0.9534    0.0020
f1........... 0.9709    0.0053
max_depth 4
Metric.......  Mean    Std. Dev.
accuracy..... 0.9382    0.0161
recall....... 0.9837    0.0123
precision.... 0.9544    0.0027
f1........... 0.9688    0.0064
max_depth 5
Metric.......  Mean    Std. Dev.
accuracy..... 0.9276    0.0214
recall....... 0.9730    0.0183
precision.... 0.9548    0.0027
f1........... 0.9622    0.0116
max_depth 6
Metric.......  Mean    Std. Dev.
accuracy..... 0.9218    0.0163
recall....... 0.9651    0.0164
precision.... 0.9557    0.0033
f1........... 0.9598    0.0089
max_depth 7
Metric.......  Mean    Std. Dev.
accuracy..... 0.9130    0.0248
recall....... 0.9548    0.0262
precision.... 0.9563    0.0033
f1........... 0.9

In [26]:
#The tree with depth 2 has the best F-1 score.

In [27]:
np.random.seed(12345)
max_seed = 2**32 - 1
rand_val = np.random.randint(1, high=max_seed, size=20,dtype=np.int64)
# Ratios of Majority:Minority Events
ratio = [ '50:50', '60:40', '70:30','75:25', '80:20', '85:15' ]
# Dictionaries contains number of minority and majority
# events in each ratio sample where n_majority = ratio x n_minority
rus_ratio = ({0:500, 1:500}, {0:500, 1:750}, {0:500, 1:1167}, \
             {0:500, 1:2000}, {0:500, 1:4500})
search_depths = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
min_loss = 1e64

In [32]:
def binary_loss(y, y_predict, fp_cost, fn_cost, display=True):
        loss     = [0, 0]       #False Neg Cost, False Pos Cost
        conf_mat = [0, 0, 0, 0] #tn, fp, fn, tp
        for j in range(len(y)):
            if y[j]==0:
                if y_predict[j]==0:
                    conf_mat[0] += 1 #True Negative
                else:
                    conf_mat[1] += 1 #False Positive
                    loss[1] += fp_cost[j]
            else:
                if y_predict[j]==1:
                    conf_mat[3] += 1 #True Positive
                else:
                    conf_mat[2] += 1 #False Negative
                    loss[0] += fn_cost[j]
        if display:
            fn_loss = loss[0]
            fp_loss = loss[1]
            total_loss = fn_loss + fp_loss
            misc    = conf_mat[1] + conf_mat[2]
            misc    = misc/len(y)
            print("{:.<23s}{:10.4f}".format("Misclassification Rate", misc))
            print("{:.<23s}{:10.0f}".format("False Negative Loss", fn_loss))
            print("{:.<23s}{:10.0f}".format("False Positive Loss", fp_loss))
            print("{:.<23s}{:10.0f}".format("Total Loss", total_loss))
        return loss, conf_mat

In [33]:
for k in range(len(rus_ratio)):
    print("\nDecision Tree Model using " + ratio[k] + " RUS")
    min_loss_c = 1e64
    for j in range(len(search_depths)):
        c=search_depths[j]
        fn_loss = np.zeros(len(rand_val))
        fp_loss = np.zeros(len(rand_val))
        misc = np.zeros(len(rand_val))
        for i in range(len(rand_val)):
            rus = RandomUnderSampler(ratio=rus_ratio[k], \
                                     random_state=rand_val[i], 
                                     return_indices=False, \
                                     replacement=False)
            X_rus, y_rus = rus.fit_sample(X, y)
            dtc2=DecisionTreeClassifier(criterion='gini',\
                                        max_depth=d,\
                                        min_samples_split = 5,\
                                        min_samples_leaf=5)
            dtc2.fit(X_rus,y_rus)
            loss, conf_mat = binary_loss(y, dtc2.predict(X), \
                                                   fp_cost, fn_cost, display=False)
            fn_loss[i] = loss[0]
            fp_loss[i] = loss[1]
            misc[i] = (conf_mat[1] + conf_mat[2])/y.shape[0]
    avg_misc = np.average(misc)
    t_loss = fp_loss+fn_loss
    avg_loss = np.average(t_loss)
    if avg_loss < min_loss_c:
        min_loss_c = avg_loss
        se_loss_c = np.std(t_loss)/math.sqrt(len(rand_val))
        best_c = c
        misc_c = avg_misc
        fn_avg_loss = np.average(fn_loss)
        fp_avg_loss = np.average(fp_loss)
    if min_loss_c < min_loss:
        min_loss = min_loss_c
        se_loss = se_loss_c
        best_ratio = k
        best_reg = best_c
    print("{:.<23s}{:12.2E}".format("Best Depth", best_c))
    print("{:.<23s}{:12.4f}".format("Misclassification Rate",misc_c))
    print("{:.<23s} ${:10,.0f}".format("False Negative Loss",fn_avg_loss))
    print("{:.<23s} ${:10,.0f}".format("False Positive Loss",fp_avg_loss))
    print("{:.<23s} ${:10,.0f}{:5s}${:<,.0f}".format("Total Loss", min_loss_c, " +/- ", se_loss_c))
print("{:.<23s}{:>12s}".format("Best RUS Ratio", ratio[best_ratio]))
print("{:.<23s}{:12.2E}".format("Best Depth", best_reg))


Decision Tree Model using 50:50 RUS
Best Depth.............    2.00E+01
Misclassification Rate.      0.2143
False Negative Loss.... $ 1,122,912
False Positive Loss.... $   121,840
Total Loss............. $ 1,244,752 +/- $26,943

Decision Tree Model using 60:40 RUS
Best Depth.............    2.00E+01
Misclassification Rate.      0.1434
False Negative Loss.... $   750,033
False Positive Loss.... $   163,814
Total Loss............. $   913,848 +/- $18,969

Decision Tree Model using 70:30 RUS
Best Depth.............    2.00E+01
Misclassification Rate.      0.0923
False Negative Loss.... $   473,840
False Positive Loss.... $   183,431
Total Loss............. $   657,271 +/- $15,534

Decision Tree Model using 75:25 RUS
Best Depth.............    2.00E+01
Misclassification Rate.      0.0546
False Negative Loss.... $   273,551
False Positive Loss.... $   222,416
Total Loss............. $   495,968 +/- $15,236

Decision Tree Model using 80:20 RUS
Best Depth.............    2.00E+01
Misclassifi

In [34]:
#Best Depth is 2 
#Lowest misclassification error is for 80:20 split model


In [35]:
#Let us try an Ensemble Model
n_obs = len(y)
n_rand = 100
predicted_prob = np.zeros((n_obs,n_rand))
avg_prob = np.zeros(n_obs)
# Setup 100 random number seeds for use in creating random samples
np.random.seed(12345)
max_seed = 2**32 - 1
rand_value = np.random.randint(1, high=max_seed, size=n_rand,dtype=np.int64)
for i in range(len(search_depths)):
    rus = RandomUnderSampler(ratio=rus_ratio[best_ratio], \
                             random_state=rand_value[i],\
                             return_indices=False,\
                             replacement=False)
    X_rus, y_rus = rus.fit_sample(X, y)
    dtc3=DecisionTreeClassifier(criterion='gini',\
                                max_depth=d,\
                                min_samples_split=5,\
                                min_samples_leaf=5)
    dtc3.fit(X_rus,y_rus)
    predicted_prob[0:n_obs, i] = dtc3.predict_proba(X)[0:n_obs, 0]
for i in range(n_obs):
    avg_prob[i] = np.mean(predicted_prob[i,0:n_rand])
    y_pred = avg_prob[0:n_obs] < 0.5
    y_pred.astype(np.int)
# Calculate loss from using the ensemble predictions
print("\nEnsemble Estimates based on averaging",len(rand_value), "Models")
loss, conf_mat = binary_loss(y, y_pred, fp_cost, fn_cost)


Ensemble Estimates based on averaging 100 Models
Misclassification Rate.    0.0476
False Negative Loss....         0
False Positive Loss....   2096550
Total Loss.............   2096550


In [36]:
#MISCLASSIFICATION RATE INCREASES BY 3% compared to the 80:20 model previously 
#THE TOTAL LOSS WAS UNDERESTIMATED WITHOUT ENSEMBLE.
#THE FP LOSS DOMINATES THE TOTAL LOSS.