In [1]:
from cart import DecisionTreeCART
import numpy as np
import pandas as pd

In [None]:
borrower_df = pd.read_csv('data/Borrower.csv')
loan_df = pd.read_csv('data/Loan.csv')

loan_feature = loan_df[['loanId', 'memberId', 'isJointApplication',
                        'loanAmount', 'term', 'interestRate',
                        'monthlyPayment', 'grade', 'loanStatus']]
loan_feature.set_index('loanId')

borrower_feature = borrower_df[['memberId', 'yearsEmployment',
                                'homeOwnership', 'annualIncome',
                                'dtiRatio', 'lengthCreditHistory',
                                'numTotalCreditLines',
                                'numOpenCreditLines',
                                'numOpenCreditLines1Year',
                                'revolvingBalance',
                                'revolvingUtilizationRate',
                                'numDerogatoryRec',
                                'numDelinquency2Years',
                                'numChargeoff1year',
                                'numInquiries6Mon']]
borrower_feature.set_index('memberId')

merged_df = pd.merge(borrower_feature, loan_feature, on='memberId')
merged_df.sort_values(['memberId', 'loanId'], ascending=True)
merged_df = merged_df.set_index(['loanId', 'memberId'])

mode_jointapp = merged_df.isJointApplication.mode()[0]
mean_loan = round(merged_df.loanAmount.mean())
mode_term = merged_df.term.mode()[0]
mean_numopen = round(merged_df.numOpenCreditLines.mean())

merged_df[["isJointApplication"]] = merged_df[[
    "isJointApplication"]].fillna(value=mode_jointapp)
merged_df[["loanAmount"]] = merged_df[[
    "loanAmount"]].fillna(value=mean_loan)
merged_df[["term"]] = merged_df[["term"]].fillna(value=mode_term)
merged_df[["numOpenCreditLines"]] = merged_df[[
    "numOpenCreditLines"]].fillna(value=mean_numopen)

In [None]:
replace_dict = {
    'yearsEmployment': {
        '< 1 year': 0.5,
        '1 year': 1,
        '2-5 years': 3,
        '6-9 years': 8,
        '10+ years': 12,
    },
    'homeOwnership': {
        'rent': 0,
        'mortgage': 1,
        'own': 2,
    },
    'loanStatus': {
        'Default': 0,
        'Current': 1,
    },
    'term': {
        '60 months': 60,
        '48 months': 48,
        '36 months': 36,
    },
    'grade': {
        'A1': 1,
        'A2': 2,
        'A3': 3,
        'B1': 4,
        'B2': 5,
        'B3': 6,
        'C1': 7,
        'C2': 8,
        'C3': 9,
        'D1': 10,
        'D2': 11,
        'D3': 12,
        'E1': 13,
        'E2': 14,
        'E3': 15,
    }
}

cleaned_df = merged_df.replace(replace_dict)
cleaned_df.to_csv('data/loan_cleaned_unbalanced.csv')

In [None]:
class_yes = cleaned_df[cleaned_df['loanStatus'] == 1]
class_no = cleaned_df[cleaned_df['loanStatus'] == 0]

n = round(len(class_no) * 1.5)
balanced_df = class_no.append(class_yes.sample(n))
balanced_df.to_csv('data/loan_cleaned_balanced_under_sampling.csv')

In [None]:
class_yes = cleaned_df[cleaned_df['loanStatus'] == 1]
class_no = cleaned_df[cleaned_df['loanStatus'] == 0]

n = round(len(class_yes) / len(class_no))
over_sampling_df = class_yes.append(class_no)

for i in range(n-2):
    over_sampling_df = over_sampling_df.append(class_no)

over_sampling_df.to_csv('data/loan_cleaned_balanced_over_sampling.csv')

# Model Training

In [2]:
dataset_df = pd.read_csv('data/loan_cleaned_unbalanced.csv').set_index(['loanId', 'memberId'])
dataset_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,yearsEmployment,homeOwnership,annualIncome,dtiRatio,lengthCreditHistory,numTotalCreditLines,numOpenCreditLines,numOpenCreditLines1Year,revolvingBalance,revolvingUtilizationRate,...,numDelinquency2Years,numChargeoff1year,numInquiries6Mon,isJointApplication,loanAmount,term,interestRate,monthlyPayment,grade,loanStatus
loanId,memberId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1888978,2305095,12.0,0,56471,16.8,6,11,9.0,6,14301,49.02,...,19,10,0,0.0,25190.0,60,6.25,490,15,1
1299695,2610493,3.0,0,55038,19.99,22,8,7.0,4,18262,72.4,...,0,0,0,0.0,21189.0,60,10.49,455,6,1
1875016,2491679,0.5,0,56610,14.33,5,8,5.0,5,10799,66.27,...,1,1,0,0.0,29908.0,60,9.11,622,5,1
1440478,2092798,8.0,2,54887,14.8,12,14,7.0,3,15272,61.05,...,0,0,3,0.0,13053.0,48,11.89,343,6,1
1124634,2633077,3.0,0,53522,10.14,4,21,19.0,10,19316,56.39,...,14,7,1,0.0,24613.0,60,15.13,587,3,1


In [3]:
dataset_np = dataset_df.sample(1500).values
train_size = round(0.9 * len(dataset_np))
np.random.shuffle(dataset_np)
train_np = np.array(dataset_np[:train_size,:])
test_np = np.array(dataset_np[train_size:,:])


# train_df = dataset_df.sample(10000)
# train_np = np.array(train_df.values)
# test_np = np.array(dataset_df.sample(1000).values)

X_train = train_np[:, :-1]
y_train = train_np[:, len(train_np[0]) - 1]

X_test = test_np[:, :-1]
y_test = test_np[:, len(train_np[0]) - 1]

In [4]:
clf = DecisionTreeCART()
clf.fit(X_train, y_train, max_depth=6, min_size=10)

In [5]:
predicted = clf.predict(X_test)

pred_true = 0
tn = 0
tp = 0
fn = 0
fp = 0
for i in range(len(predicted)):
    if predicted[i] == y_test[i]:
        pred_true += 1
        if y_test[i] == 0:
            tn += 1
        else:
            tp += 1
    else:
        if y_test[i] == 1:
            fn += 1
        else:
            fp += 1

AttributeError: 'numpy.float64' object has no attribute 'index'

In [None]:
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print('Predicted True = ' + str(pred_true) + ' out of ' + str(len(y_test)))
print('Precision      = ', precision)
print('Recall         = ', recall)
print('True Positive  = ', tp)
print('True Negative  = ', tn)
print('False Positive = ', fp)
print('False Negative = ', fn)
print('F1 Score       = ', f1)