# Preprocessing

In [1]:
from random import seed
from random import randrange
from csv import reader
import pandas as pd
import numpy as np

In [2]:
borrower_df = pd.read_csv('data/Borrower.csv')
loan_df = pd.read_csv('data/Loan.csv')

In [3]:
loan_feature = loan_df[['loanId', 'memberId', 'isJointApplication',\
                   'loanAmount', 'term', 'interestRate', 'monthlyPayment', 'grade', 'loanStatus']]
loan_feature.set_index('loanId')
loan_feature.head()

Unnamed: 0,loanId,memberId,isJointApplication,loanAmount,term,interestRate,monthlyPayment,grade,loanStatus
0,1888978,2305095,0.0,25190.0,60 months,6.25,490,E3,Current
1,1299695,2610493,0.0,21189.0,60 months,10.49,455,B3,Current
2,1875016,2491679,0.0,29908.0,60 months,9.11,622,B2,Current
3,1440478,2092798,0.0,13053.0,48 months,11.89,343,B3,Current
4,1124634,2633077,0.0,24613.0,60 months,15.13,587,A3,Current


In [4]:
borrower_feature = borrower_df[['memberId', 'yearsEmployment',\
                                'homeOwnership', 'annualIncome', \
                                'dtiRatio', 'lengthCreditHistory', 'numTotalCreditLines',\
                                'numOpenCreditLines', 'numOpenCreditLines1Year',\
                                'revolvingBalance', 'revolvingUtilizationRate',\
                                'numDerogatoryRec', 'numDelinquency2Years',\
                                'numChargeoff1year', 'numInquiries6Mon']]
borrower_feature.set_index('memberId')
borrower_feature.head()

Unnamed: 0,memberId,yearsEmployment,homeOwnership,annualIncome,dtiRatio,lengthCreditHistory,numTotalCreditLines,numOpenCreditLines,numOpenCreditLines1Year,revolvingBalance,revolvingUtilizationRate,numDerogatoryRec,numDelinquency2Years,numChargeoff1year,numInquiries6Mon
0,2305095,10+ years,rent,56471,16.8,6,11,9.0,6,14301,49.02,0,19,10,0
1,2610493,2-5 years,rent,55038,19.99,22,8,7.0,4,18262,72.4,1,0,0,0
2,2491679,< 1 year,rent,56610,14.33,5,8,5.0,5,10799,66.27,0,1,1,0
3,2092798,6-9 years,own,54887,14.8,12,14,7.0,3,15272,61.05,1,0,0,3
4,2633077,2-5 years,rent,53522,10.14,4,21,19.0,10,19316,56.39,2,14,7,1


In [5]:
merged_df = pd.merge(borrower_feature, loan_feature, on='memberId')
merged_df.sort_values(['memberId', 'loanId'], ascending=True)
merged_df = merged_df.set_index(['loanId', 'memberId'])
merged_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,yearsEmployment,homeOwnership,annualIncome,dtiRatio,lengthCreditHistory,numTotalCreditLines,numOpenCreditLines,numOpenCreditLines1Year,revolvingBalance,revolvingUtilizationRate,...,numDelinquency2Years,numChargeoff1year,numInquiries6Mon,isJointApplication,loanAmount,term,interestRate,monthlyPayment,grade,loanStatus
loanId,memberId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1888978,2305095,10+ years,rent,56471,16.8,6,11,9.0,6,14301,49.02,...,19,10,0,0.0,25190.0,60 months,6.25,490,E3,Current
1299695,2610493,2-5 years,rent,55038,19.99,22,8,7.0,4,18262,72.4,...,0,0,0,0.0,21189.0,60 months,10.49,455,B3,Current
1875016,2491679,< 1 year,rent,56610,14.33,5,8,5.0,5,10799,66.27,...,1,1,0,0.0,29908.0,60 months,9.11,622,B2,Current
1440478,2092798,6-9 years,own,54887,14.8,12,14,7.0,3,15272,61.05,...,0,0,3,0.0,13053.0,48 months,11.89,343,B3,Current
1124634,2633077,2-5 years,rent,53522,10.14,4,21,19.0,10,19316,56.39,...,14,7,1,0.0,24613.0,60 months,15.13,587,A3,Current


In [6]:
mode_jointapp = merged_df.isJointApplication.mode()[0]
mean_loan = round(merged_df.loanAmount.mean())
mode_term = merged_df.term.mode()[0]
mean_numopen = round(merged_df.numOpenCreditLines.mean())

In [7]:
merged_df[["isJointApplication"]] = merged_df[["isJointApplication"]].fillna(value=mode_jointapp)
merged_df[["loanAmount"]] = merged_df[["loanAmount"]].fillna(value=mean_loan)
merged_df[["term"]] = merged_df[["term"]].fillna(value=mode_term)
merged_df[["numOpenCreditLines"]] = merged_df[["numOpenCreditLines"]].fillna(value=mean_numopen)

In [8]:
null_columns = merged_df.columns[merged_df.isnull().any()].tolist()
null_columns

[]

In [9]:
replace_dict = {
    'yearsEmployment': {
        '< 1 year': 0.5,
        '1 year': 1,
        '2-5 years': 3,
        '6-9 years': 8,
        '10+ years': 12,
    },
    'loanStatus': {
        'Default': 0,
        'Current': 1,
    },
    'term': {
        '60 months': 60,
        '48 months': 48,
        '36 months': 36,
    },
    'grade': {
        'E3': 15,
        'E2': 14,
        'E1': 13,
        'D3': 12,
        'D2': 11,
        'D1': 10,
        'C3': 9,
        'C2': 8,
        'C1': 7,
        'B3': 6,
        'B2': 5,
        'B1': 4,
        'A3': 3,
        'A2': 2,
        'A1': 1,
    }
}

merged_df = merged_df.replace(replace_dict)

In [10]:
object_var = merged_df.select_dtypes(include=['object']).copy( )
numeric_var = merged_df.select_dtypes(include=['int64', 'float64']).copy()

for col in object_var.columns:
    object_var = pd.get_dummies(object_var, columns=[col])

cleaned_df = pd.concat([object_var, numeric_var], axis=1)
cleaned_df.dtypes

homeOwnership_mortgage        uint8
homeOwnership_own             uint8
homeOwnership_rent            uint8
yearsEmployment             float64
annualIncome                  int64
dtiRatio                    float64
lengthCreditHistory           int64
numTotalCreditLines           int64
numOpenCreditLines          float64
numOpenCreditLines1Year       int64
revolvingBalance              int64
revolvingUtilizationRate    float64
numDerogatoryRec              int64
numDelinquency2Years          int64
numChargeoff1year             int64
numInquiries6Mon              int64
isJointApplication          float64
loanAmount                  float64
term                          int64
interestRate                float64
monthlyPayment                int64
grade                         int64
loanStatus                    int64
dtype: object

In [11]:
cleaned_df.loanStatus.value_counts()

1    89996
0    10004
Name: loanStatus, dtype: int64

In [12]:
class_yes = cleaned_df[cleaned_df['loanStatus'] == 1]
class_no = cleaned_df[cleaned_df['loanStatus'] == 0]

n = round(len(class_no) * 1.5)
balanced_df = class_no.append(class_yes.sample(n))

balanced_df.loanStatus.value_counts()

1    15006
0    10004
Name: loanStatus, dtype: int64

In [13]:
balanced_df.to_csv('data/loan_cleaned.csv')
balanced_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,homeOwnership_mortgage,homeOwnership_own,homeOwnership_rent,yearsEmployment,annualIncome,dtiRatio,lengthCreditHistory,numTotalCreditLines,numOpenCreditLines,numOpenCreditLines1Year,...,numDelinquency2Years,numChargeoff1year,numInquiries6Mon,isJointApplication,loanAmount,term,interestRate,monthlyPayment,grade,loanStatus
loanId,memberId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1821279,2124878,1,0,0,1.0,47946,27.06,9,9,5.0,5,...,0,0,1,0.0,17101.0,60,18.01,434,12,0
1891878,2248022,1,0,0,0.5,51844,21.72,10,17,15.0,11,...,0,0,0,0.0,23785.0,36,13.1,803,6,0
1267133,2876501,0,0,1,0.5,54799,22.16,3,15,11.0,10,...,0,0,2,0.0,14246.0,48,14.93,396,15,0
1374863,2027352,0,1,0,8.0,52317,12.69,37,10,7.0,6,...,11,11,3,0.0,17298.0,48,7.07,415,7,0
1679646,2486530,1,0,0,1.0,53036,24.3,4,21,19.0,10,...,0,0,0,0.0,23014.0,36,9.0,732,13,0


# Gini Impurity

In [None]:
def gini_index(groups, classes):

    instances = 0
    for group in groups:
        instances += float(len(group))

    gini = 0.0
    for group in groups:
        size = float(len(group))

        if size == 0:
            continue
        
        score = 0.0
        for val in classes:
            
            p = 0
            for row in group:
                if row[-1] == val:
                    p += 1

            p = p / size
            score += p * p

        gini += (1.0 - score) * (size / instances)
    
    return gini


def make_split_group(index, value, dataset):
    less_than = list()
    greater_than = list()
    for row in dataset:
        if row[index] < value:
            less_than.append(row)
        else:
            greater_than.append(row)
    return less_than, greater_than


def get_split(dataset):
    
    b_index = 999
    b_value = 999
    b_score = 999
    b_groups = None
    
    class_values = list(set(row[-1] for row in dataset))
    feature_cols = len(dataset[0]) - 1
    
    for index in range(feature_cols):
        unique_val = np.unique(np.array(dataset)[:, index])
        for val in unique_val:
            groups = make_split_group(index, val, dataset)
            gini = gini_index(groups, class_values)
            
            if gini < b_score:
                b_index = index
                b_value = val
                b_score = gini
                b_groups = groups
    
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Build Tree

In [None]:
# Create a terminal node value
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)
        
# Build a decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

# Print a decision tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('%s[%s]' % ((depth*' ', node)))

# Prediction

In [None]:
# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Loan Credit Risk Prediction

In [None]:
# convert string attributes to integers
# for i in range(len(loan_np[0])):
#     str_column_to_float(loan_np, i)
# evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(loan_np, decision_tree, n_folds, max_depth, min_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

In [None]:
train_df = cleaned_df.sample(1000)
train_np = np.array(train_df.values)
test_np = np.array(cleaned_df.sample(100).values)
train_df.head()

In [None]:
tree_loan = build_tree(train_np, 19, 1)

In [None]:
true_pred = 0

for row in test_np:
    prediction = predict(tree_loan, row)
    print('Expected=%d, Got=%d' % (row[-1], prediction))
    
    if row[-1] == prediction:
        true_pred += 1
        
print('\nAccuracy: ' + str(true_pred) + ' out of ' + str(len(test_np)))

# Banknote Case Study

In [None]:
# Load a CSV file
def load_csv(filename):
    file = open(filename, "rt")
    lines = reader(file)
    dataset = list(lines)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
    tree = build_tree(train, max_depth, min_size)
    predictions = list()
    for row in test:
        prediction = predict(tree, row)
        predictions.append(prediction)
    return(predictions)

In [None]:
# Test CART on Bank Note dataset
seed(1)
# load and prepare data
filename = 'data/data_banknote_authentication.csv'
dataset = load_csv(filename)
# convert string attributes to integers
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
# evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

# Small Example

In [None]:
dataset = np.array([[2.771244718,1.784783929,0],
    [1.728571309,1.169761413,0],
    [3.678319846,2.81281357,0],
    [3.961043357,2.61995032,0],
    [2.999208922,2.209014212,0],
    [7.497545867,3.162953546,1],
    [9.00220326,3.339047188,1],
    [7.444542326,0.476683375,1],
    [10.12493903,3.234550982,1],
    [6.642287351,3.319983761,1]])
tree_example = build_tree(dataset, 1, 1)

for row in dataset:
    prediction = predict(tree_example, row)
    print('Expected=%d, Got=%d' % (row[-1], prediction))

In [None]:
split = get_split(dataset)
split['groups']