# Applying Decision Tree for Loan prediction

In [1]:
import graphlab
import math

A newer version of GraphLab Create (v1.8.5) is available! Your current version is v1.8.4.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.


In [2]:
loans = graphlab.SFrame.read_csv('train_u6lujuX.csv')
test_data = graphlab.SFrame.read_csv('test_Y3wMUE5.csv')
loans.head()

2016-04-01 12:03:04,688 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.4 started. Logging: C:\Users\ranjank\AppData\Local\Temp\graphlab_server_1459492382.log.0


This non-commercial license of GraphLab Create is assigned to ranjank@cdac.in and will expire on November 23, 2016. For commercial licensing options, visit https://dato.com/buy/.
------------------------------------------------------


Inferred types from first line of file as 
column_type_hints=[str,str,str,str,str,str,long,long,long,long,long,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
------------------------------------------------------


Inferred types from first line of file as 
column_type_hints=[str,str,str,str,str,str,long,long,long,long,long,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount
LP001002,Male,No,0,Graduate,No,5849,0,
LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0
LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0
LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0
LP001008,Male,No,0,Graduate,No,6000,0,141.0
LP001011,Male,Yes,2,Graduate,Yes,5417,4196,267.0
LP001013,Male,Yes,0,Not Graduate,No,2333,1516,95.0
LP001014,Male,Yes,3+,Graduate,No,3036,2504,158.0
LP001018,Male,Yes,2,Graduate,No,4006,1526,168.0
LP001020,Male,Yes,1,Graduate,No,12841,10968,349.0

Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
360,1,Urban,Y
360,1,Rural,N
360,1,Urban,Y
360,1,Urban,Y
360,1,Urban,Y
360,1,Urban,Y
360,1,Urban,Y
360,0,Semiurban,N
360,1,Urban,Y
360,1,Semiurban,N


### Features and Target

In [3]:
features = ['Gender', 
            'Married', 
            'Dependents',
            'Education',
            'Self_Employed',
            'ApplicantIncome', 
            'CoapplicantIncome',
            'LoanAmount',
            'Loan_Amount_Term',
            'Credit_History',
            'Property_Area']
target = 'Loan_Status'

loans = loans[features + [target]]

## Preparing the data

* **Is target column fully populated with proper data?**
* **Replace the target values to 1 and -1**

In [4]:
print "Is target column fully populated with proper data: ", (len(loans[loans[target] == 'Y']) + 
                                                              len(loans[loans[target] == 'N'])) == len(loans)
loans[target] = loans[target].apply(lambda x: 1 if x=='Y' else -1)

Is target column fully populated with proper data:  True


## Subsample dataset to make sure classes are balanced

In [5]:
#safe_loans_raw = loans[loans[target] == 1]
#risky_loans_raw = loans[loans[target] == -1]

# Since there are less risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
#percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
#safe_loans = safe_loans_raw.sample(percentage, seed = 1)
#risky_loans = risky_loans_raw
#loans_data = risky_loans.append(safe_loans)

#print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
#print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
#print "Total number of loans in our new dataset :", len(loans_data)
loan_data = loans

Percentage of safe loans                 : 0.5
Percentage of risky loans                : 0.5
Total number of loans in our new dataset : 384


## Transform categorical data into binary features

### Training Data

Categorize 'ApplicantIncome'

In [6]:
def fun_ApplicantIncome(x):
    if x < 1000:
        return 'range_0_1k'
    elif x < 2000:
        return 'range_1_2k'
    elif x < 3000:
        return 'range_2_3k'
    elif x < 4000:
        return 'range_3_4k'
    elif x < 5000:
        return 'range_4_5k'
    elif x < 6000:
        return 'range_5_6k'
    elif x < 7000:
        return 'range_6_7k'
    elif x < 8000:
        return 'range_7_8k'
    elif x < 9000:
        return 'range_8_9k'
    elif x < 10000:
        return 'range_9_10k'
    elif x < 11000:
        return 'range_10_11k'
    elif x < 12000:
        return 'range_11_12k'
    elif x < 13000:
        return 'range_12_13k'
    elif x < 14000:
        return 'range_13_14k'
    elif x < 15000:
        return 'range_14_15k'
    elif x < 16000:
        return 'range_15_16k'
    elif x < 17000:
        return 'range_16_17k'
    elif x < 18000:
        return 'range_17_18k'
    elif x < 19000:
        return 'range_18_19k'
    elif x < 20000:
        return 'range_19_20k'
    elif x < 30000:
        return 'range_20_21k'
    elif x >= 30000:
        return 'range_high'
loans_data['ApplicantIncome'] = loans_data['ApplicantIncome'].apply(lambda x: fun_ApplicantIncome(x))
test_data['ApplicantIncome'] = test_data['ApplicantIncome'].apply(lambda x: fun_ApplicantIncome(x))

Categorize 'CoapplicantIncome'

In [7]:
def fun_CoapplicantIncome(x):
    if x < 1000:
        return 'range_0_1k'
    elif x < 2000:
        return 'range_1_2k'
    elif x < 3000:
        return 'range_2_3k'
    elif x < 4000:
        return 'range_3_4k'
    elif x < 5000:
        return 'range_4_5k'
    elif x < 6000:
        return 'range_5_6k'
    elif x < 7000:
        return 'range_6_7k'
    elif x < 8000:
        return 'range_7_8k'
    elif x < 9000:
        return 'range_8_9k'
    elif x < 10000:
        return 'range_9_10k'
    elif x < 11000:
        return 'range_10_11k'
    elif x < 12000:
        return 'range_11_12k'
    elif x < 13000:
        return 'range_12_13k'
    elif x < 14000:
        return 'range_13_14k'
    elif x < 15000:
        return 'range_14_15k'
    elif x < 16000:
        return 'range_15_16k'
    elif x < 17000:
        return 'range_16_17k'
    elif x < 18000:
        return 'range_17_18k'
    elif x < 19000:
        return 'range_18_19k'
    elif x < 20000:
        return 'range_19_20k'
    elif x < 30000:
        return 'range_20_21k'
    elif x >= 30000:
        return 'range_high'
loans_data['CoapplicantIncome'] = loans_data['CoapplicantIncome'].apply(lambda x: fun_CoapplicantIncome(x))
test_data['CoapplicantIncome'] = test_data['CoapplicantIncome'].apply(lambda x: fun_CoapplicantIncome(x))

Categorize 'LoanAmountTerm'

In [8]:
#loan_amount_term = [12, 36, 60, 84, 120, 180, 240, 300, 360, 480]

def fun_LoanAmountTerm(x):
    if x in loans_data['Loan_Amount_Term']:
        return x
    else:
        return 0
    
loans_data['Loan_Amount_Term'] = loans_data['Loan_Amount_Term'].apply(lambda x: fun_LoanAmountTerm(x))
test_data['Loan_Amount_Term'] = test_data['Loan_Amount_Term'].apply(lambda x: fun_LoanAmountTerm(x))

Categorize 'LoanAmount'

In [9]:
def fun_LoanAmount(x):
    if x < 100:
        return 'range_0_1h'
    elif x < 200:
        return 'range_1_2h'
    elif x < 300:
        return 'range_2_3h'
    elif x < 400:
        return 'range_3_4'
    elif x < 500:
        return 'range_4_5'
    elif x < 600:
        return 'range_5_6'
    else:
        return 'range_high'
    
loans_data['LoanAmount'] = loans_data['LoanAmount'].apply(lambda x: fun_LoanAmount(x))
test_data['LoanAmount'] = test_data['LoanAmount'].apply(lambda x: fun_LoanAmount(x))

Converting features to 0 and 1 based features

In [10]:
#loans_data = risky_loans.append(safe_loans)
for feature in features:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})    
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    
    # Change None's to 0's
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)

    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)

### Test Data

In [11]:
for feature in features:
    test_data_one_hot_encoded = test_data[feature].apply(lambda x: {x: 1})    
    test_data_unpacked = test_data_one_hot_encoded.unpack(column_name_prefix=feature)
    
    # Change None's to 0's
    for column in test_data_unpacked.column_names():
        test_data_unpacked[column] = test_data_unpacked[column].fillna(0)

    test_data.remove_column(feature)
    test_data.add_columns(test_data_unpacked)

In [12]:
features = loans_data.column_names()
features.remove('Loan_Status')  # Remove the response variable
features

['Gender.',
 'Gender.Female',
 'Gender.Male',
 'Married.',
 'Married.No',
 'Married.Yes',
 'Dependents.',
 'Dependents.0',
 'Dependents.1',
 'Dependents.2',
 'Dependents.3+',
 'Education.Graduate',
 'Education.Not Graduate',
 'Self_Employed.',
 'Self_Employed.No',
 'Self_Employed.Yes',
 'ApplicantIncome.range_0_1k',
 'ApplicantIncome.range_10_11k',
 'ApplicantIncome.range_11_12k',
 'ApplicantIncome.range_12_13k',
 'ApplicantIncome.range_13_14k',
 'ApplicantIncome.range_14_15k',
 'ApplicantIncome.range_16_17k',
 'ApplicantIncome.range_17_18k',
 'ApplicantIncome.range_18_19k',
 'ApplicantIncome.range_19_20k',
 'ApplicantIncome.range_1_2k',
 'ApplicantIncome.range_20_21k',
 'ApplicantIncome.range_2_3k',
 'ApplicantIncome.range_3_4k',
 'ApplicantIncome.range_4_5k',
 'ApplicantIncome.range_5_6k',
 'ApplicantIncome.range_6_7k',
 'ApplicantIncome.range_7_8k',
 'ApplicantIncome.range_8_9k',
 'ApplicantIncome.range_9_10k',
 'ApplicantIncome.range_high',
 'CoapplicantIncome.range_0_1k',
 'Coappl

In [13]:
print "Number of features (after binarizing categorical variables) = %s" % len(features)

Number of features (after binarizing categorical variables) = 71


Let's explore what one of these columns looks like:

In [14]:
loans_data['Gender.Male']

dtype: int
Rows: 384
[1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, ... ]

#Decision tree implementation

In [15]:
def intermediate_node_num_mistakes(labels_in_node):
    # Corner case: If labels_in_node is empty, return 0
    if len(labels_in_node) == 0:
        return 0
    
    # Count the number of 1's (safe loans)
    count_of_1 = 0
    for label in labels_in_node:
        if label == 1:
            count_of_1 += 1
        
    # Count the number of -1's (risky loans)
    count_of_minus_1 = 0
    for label in labels_in_node:
        if label == -1:
            count_of_minus_1 += 1
                
    # Return the number of mistakes that the majority classifier makes.
    if count_of_1 > count_of_minus_1:
        return count_of_minus_1
    else:
        return count_of_1    

In [16]:
# Test case 1
example_labels = graphlab.SArray([-1, -1, 1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 1 failed... try again!'

# Test case 2
example_labels = graphlab.SArray([-1, -1, 1, 1, 1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 2 failed... try again!'
    
# Test case 3
example_labels = graphlab.SArray([-1, -1, -1, -1, -1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 3 failed... try again!'

Test passed!
Test passed!
Test passed!


## Function to pick best feature to split on

In [17]:
def best_splitting_feature(data, features, target):
    
    best_feature = None # Keep track of the best feature 
    best_error = 10     # Keep track of the best error so far 
    # Note: Since error is always <= 1, we should intialize it with something larger than 1.

    # Convert to float to make sure error gets computed correctly.
    num_data_points = float(len(data))  
    
    # Loop through each feature to consider splitting on that feature
    for feature in features:

        # The left split will have all data points where the feature value is 0
        left_split = data[data[feature] == 0]
        
        # The right split will have all data points where the feature value is 1
        right_split =  data[data[feature] == 1]
            
        # Calculate the number of misclassified examples in the left split.
        # Remember that we implemented a function for this! (It was called intermediate_node_num_mistakes)
        left_mistakes = intermediate_node_num_mistakes(left_split[target])   

        # Calculate the number of misclassified examples in the right split.
        right_mistakes = intermediate_node_num_mistakes(right_split[target])
            
        # Compute the classification error of this split.
        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        error = (left_mistakes + right_mistakes) / num_data_points

        # If this is the best error we have found so far, store the feature as best_feature and the error as best_error

        if error < best_error:
            best_error = error
            best_feature = feature
    
    return best_feature # Return the best feature we found

## Building the tree

In [18]:
def create_leaf(target_values):
    
    # Create a leaf node
    leaf = {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True    }   ## YOUR CODE HERE
    
    # Count the number of data points that are +1 and -1 in this node.
    num_ones = len(target_values[target_values == +1])
    num_minus_ones = len(target_values[target_values == -1])
    
    # For the leaf node, set the prediction to be the majority class.
    # Store the predicted class (1 or -1) in leaf['prediction']
    if num_ones > num_minus_ones:
        leaf['prediction'] = +1         ## YOUR CODE HERE
    else:
        leaf['prediction'] = -1        ## YOUR CODE HERE
        
    # Return the leaf node        
    return leaf 

In [19]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10):
    remaining_features = features[:] # Make a copy of the features.
    
    target_values = data[target]
    print "--------------------------------------------------------------------"
    print "Subtree, depth = %s (%s data points)." % (current_depth, len(target_values))
    

    # Stopping condition 1
    # (Check if there are mistakes at current node.
    # Recall you wrote a function intermediate_node_num_mistakes to compute this.)
    if intermediate_node_num_mistakes(target_values) == 0:  ## YOUR CODE HERE
        print "Stopping condition 1 reached."     
        # If not mistakes at current node, make current node a leaf node
        return create_leaf(target_values)
    
    # Stopping condition 2 (check if there are remaining features to consider splitting on)
    if len(remaining_features) == 0:   ## YOUR CODE HERE
        print "Stopping condition 2 reached."    
        # If there are no remaining features to consider, make current node a leaf node
        return create_leaf(target_values)    
    
    # Additional stopping condition (limit tree depth)
    if current_depth >= max_depth:  ## YOUR CODE HERE
        print "Reached maximum depth. Stopping for now."
        # If the max tree depth has been reached, make current node a leaf node
        return create_leaf(target_values)

    # Find the best splitting feature (recall the function best_splitting_feature implemented above)
    splitting_feature = best_splitting_feature(data, features, target)

    
    # Split on the best feature that we found. 
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]      ## YOUR CODE HERE
    remaining_features.remove(splitting_feature)
    print "Split on feature %s. (%s, %s)" % (\
                      splitting_feature, len(left_split), len(right_split))
    
    # Create a leaf node if the split is "perfect"
    if len(left_split) == len(data):
        print "Creating leaf node."
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print "Creating leaf node."
        return create_leaf(right_split[target]) ## YOUR CODE HERE

        
    # Repeat (recurse) on left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth)        
    ## YOUR CODE HERE
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth + 1, max_depth)

    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [20]:
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])

##Build the tree!

In [21]:
# Make sure to cap the depth at 6 by using max_depth = 6
my_decision_tree = decision_tree_create(loans_data, features, 'Loan_Status', max_depth = 3)

--------------------------------------------------------------------
Subtree, depth = 0 (384 data points).
Split on feature Credit_History.0. (298, 86)
--------------------------------------------------------------------
Subtree, depth = 1 (298 data points).
Split on feature LoanAmount.range_3_4. (290, 8)
--------------------------------------------------------------------
Subtree, depth = 2 (290 data points).
Split on feature ApplicantIncome.range_1_2k. (276, 14)
--------------------------------------------------------------------
Subtree, depth = 3 (276 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 3 (14 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (8 data points).
Split on feature CoapplicantIncome.range_0_1k. (6, 2)
--------------------------------------------------------------------
Subtree,

## Making predictions with a decision tree

In [22]:
def classify(tree, x, annotate = False):   
    # if the node is a leaf node.
    if tree['is_leaf']:
        if annotate: 
            print "At leaf, predicting %s" % tree['prediction']
        return tree['prediction'] 
    else:
        # split on feature.
        split_feature_value = x[tree['splitting_feature']]
        if annotate: 
            print "Split on %s = %s" % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
               ### YOUR CODE HERE

## Making predictions using the tree

In [23]:
classify(my_decision_tree, test_data, annotate=True)

Split on Credit_History.0 = [0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, ... ]
Split on LoanAmount.range_3_4 = [0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, ... ]
Split on ApplicantIncome.range_1_2k = [0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,

1

In [24]:
prediction = test_data.apply(lambda x: classify(my_decision_tree, x))

In [25]:
print prediction

[1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, 1L, 1L, -1L, -1L, -1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, -1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, -1L, -1L, 1L, -1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, 1L, -1L, 1L, -1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, 1L, 1L, 1L, ... ]


## Writing the submission file

In [26]:
import pandas as pd
loan_id = test_data['Loan_ID']
predictionsS = pd.Series(prediction)
loan_status_predictions = predictionsS.apply(lambda x: 'Y' if x == 1 else 'N')

In [27]:
import csv
with open('submission.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['Loan_ID', 'Loan_Status'])
    for i in xrange(len(loan_id)):
        spamwriter.writerow([loan_id[i], loan_status_predictions[i]])

In [28]:
#print len(loans)

#x = np.arange(0, len(loans['ApplicantIncome'])) 
#x = loans['CoapplicantIncome']
#y = loans['Dependents']

#ll = plt.plot(x,y)
#plt.show()