# Homework 3.1

In [15]:
import pandas as pd
import numpy as np
import json

### 1. Data Preprocessing

Read datasets

In [2]:
# NOTE(review): the leftover warning text under this cell looks like pandas'
# DtypeWarning for columns with mixed types -- confirm. low_memory=False makes
# pandas infer each column's dtype from the whole file instead of per chunk,
# which is the remedy that warning suggests.
loans = pd.read_csv('lending-club-data.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


Reassign the target to +1 and -1

In [3]:
# Encode the label as +1 when bad_loans == 0 and -1 otherwise, then discard
# the original bad_loans column.
loans['safe_loans'] = (loans['bad_loans'] == 0).map({True: +1, False: -1})
loans = loans.drop('bad_loans', axis = 1)

We use only a subset of features

In [4]:
# Columns used as model inputs.
features = [
    'grade',           # grade of the loan
    'term',            # the term of the loan
    'home_ownership',  # home_ownership status: own, mortgage or rent
    'emp_length',      # number of years of employment
]
# Name of the label column.
target = 'safe_loans'

In [5]:
# Keep only the selected input columns plus the label column.
loans = loans.loc[:, features + [target]]

In [6]:
# Display (number of rows, number of columns) of the current `loans` frame.
loans.shape

(122607, 5)

One-hot encoding

In [11]:
# One-hot encode the frame. As the column listing printed further down shows,
# each categorical level becomes its own '<column>_<value>' indicator column
# while 'safe_loans' is kept as a single column.
# NOTE(review): this overwrites `loans` in place, so re-running the cell on the
# already-encoded frame is not the same operation as the first run.
loans = pd.get_dummies(loans)

In [12]:
# Display the column labels produced by the one-hot encoding.
loans.columns

Index([u'safe_loans', u'grade_A', u'grade_B', u'grade_C', u'grade_D',
       u'grade_E', u'grade_F', u'grade_G', u'term_ 36 months',
       u'term_ 60 months', u'home_ownership_MORTGAGE', u'home_ownership_OTHER',
       u'home_ownership_OWN', u'home_ownership_RENT', u'emp_length_1 year',
       u'emp_length_10+ years', u'emp_length_2 years', u'emp_length_3 years',
       u'emp_length_4 years', u'emp_length_5 years', u'emp_length_6 years',
       u'emp_length_7 years', u'emp_length_8 years', u'emp_length_9 years',
       u'emp_length_< 1 year', u'emp_length_n/a'],
      dtype='object')

Modify the list of features after one-hot encoding

In [51]:
# Every column of the encoded frame except the label is a model input.
features = list(loans.columns)
features.remove('safe_loans')

Note: We use the train and test indices provided by the course, so the next step ('Subsample dataset to make sure classes are balanced') can be skipped.

#### Subsample dataset to make sure classes are balanced

In [20]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]

# Keep every risky loan and draw an equally sized random sample of safe loans
# (fixed random_state so the sample is reproducible).
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(len(risky_loans), random_state = 1)

# Stack the risky loans on top of the downsampled safe loans.
# pd.concat replaces DataFrame.append, which newer pandas releases no longer provide.
loans_data = pd.concat([risky_loans, safe_loans])

# Single-argument print(...) produces the same text on Python 2 and Python 3.
num_loans = float(len(loans_data))
print("Percentage of safe loans                 : %s" % (len(safe_loans) / num_loans))
print("Percentage of risky loans                : %s" % (len(risky_loans) / num_loans))
print("Total number of loans in our new dataset : %s" % len(loans_data))

Percentage of safe loans                 : 0.5
Percentage of risky loans                : 0.5
Total number of loans in our new dataset : 46300


### 2. Splitting of train and test datasets

In [75]:
# Load the train / test row positions stored as JSON next to the notebook.
with open('module-5-assignment-2-train-idx.json') as train_file, \
     open('module-5-assignment-2-test-idx.json') as test_file:
    train_idx = json.load(train_file)
    test_idx = json.load(test_file)

In [26]:
# Select the train and test rows by position (iloc), not by index label.
train_data, test_data = (loans.iloc[positions] for positions in (train_idx, test_idx))

In [76]:
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(train_data.shape)
print(test_data.shape)

(37224, 26)
(9284, 26)


### 3. Decision tree implementation from scratch

Note: Remember that since we are only dealing with binary features, we do not have to consider thresholds for real-valued features. This makes the implementation of this function much easier.

Build a function to count number of mistakes while predicting majority class

In [31]:
def intermediate_node_num_mistakes(labels_in_node):
    """Count the mistakes a majority-class prediction makes on one node.

    Parameters
    ----------
    labels_in_node : array-like
        Labels of the data points in the node, each +1 or -1. Lists, NumPy
        arrays and pandas Series are accepted.

    Returns
    -------
    int
        Size of the minority class, i.e. the number of points that predicting
        the majority class gets wrong. 0 for an empty node.
    """
    # np.asarray lets the element-wise comparisons below work for plain lists too.
    labels = np.asarray(labels_in_node)

    # Corner case: an empty node has no mistakes.
    if labels.size == 0:
        return 0

    # Vectorised counts instead of a Python-level sum() over every element.
    num_positive = np.count_nonzero(labels == 1)
    num_negative = np.count_nonzero(labels == -1)

    # Predicting the majority class misclassifies exactly the minority class.
    return min(num_positive, num_negative)

Test the above function:

In [38]:
# Each entry: (labels in the node, expected number of majority-class mistakes).
test_cases = [
    (np.array([-1, -1, 1, 1, 1]), 2),
    (np.array([-1, -1, 1, 1, 1, 1, 1]), 2),
    (np.array([-1, -1, -1, -1, -1, 1, 1]), 2),
]

# Numbering the cases from the loop keeps the failure message in sync with the
# case that failed (test case 2 used to report "Test 3 failed").
for case_number, (example_labels, expected_mistakes) in enumerate(test_cases, 1):
    if intermediate_node_num_mistakes(example_labels) == expected_mistakes:
        print('Test passed!')
    else:
        print('Test %d failed... try again!' % case_number)

Test passed!
Test passed!
Test passed!


Build a function to pick best feature to split on

In [62]:
def best_splitting_feature(data, features, target):
    """Pick the binary feature whose split gives the lowest classification error.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in the current node; must contain `target` and every name in
        `features`.
    features : iterable of str
        Candidate columns. A row goes to the left split when the feature
        equals 0 and to the right split when it equals 1.
    target : str
        Name of the label column (+1 / -1).

    Returns
    -------
    str or None
        The first feature reaching the minimum error, or None when `data` or
        `features` is empty.
    """
    # Float so the error below is a true division on Python 2 as well.
    num_data_points = float(len(data))
    if num_data_points == 0:
        # Nothing to split; this also avoids dividing by zero below.
        return None

    labels = data[target]
    best_feature = None
    best_error = float('inf')  # any real error (<= 1) beats this

    for feature in features:
        feature_values = data[feature]

        # Only the labels are needed on each side, so mask the label column
        # instead of materialising two filtered copies of the whole frame.
        left_mistakes = intermediate_node_num_mistakes(labels[feature_values == 0])
        right_mistakes = intermediate_node_num_mistakes(labels[feature_values == 1])

        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        error = (left_mistakes + right_mistakes) / num_data_points

        # Strict '<' keeps the earliest feature when several tie.
        if error < best_error:
            best_feature = feature
            best_error = error

    return best_feature

Test the above function

In [65]:
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(best_splitting_feature(train_data, features, 'safe_loans'))

term_ 36 months


#### Building the tree

First, we will write a function that creates a leaf node given a set of target values.

In [40]:
def create_leaf(target_values):
    """Build a leaf node predicting the majority class of `target_values`.

    `target_values` holds the +1 / -1 labels of the points that reached the
    leaf (NumPy array or pandas Series). The prediction is +1 only when +1
    labels strictly outnumber -1 labels; ties and empty input give -1.
    """
    positives = (target_values == +1).sum()
    negatives = (target_values == -1).sum()
    majority_class = +1 if positives > negatives else -1

    # A leaf has no split and no children.
    return {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True,
            'prediction': majority_class}

Now, implement the learning algorithm (a recursive algorithm): base case + recursive case

In [42]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10, verbose = True):
    """Recursively grow a binary decision tree by greedy error minimisation.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reached the current node.
    features : sequence of str
        Binary (0/1) feature columns still available for splitting. The
        sequence is copied, so the caller's object is never modified.
    target : str
        Name of the +1 / -1 label column.
    current_depth : int
        Depth of the current node (0 for the root).
    max_depth : int
        Nodes at this depth become leaves.
    verbose : bool
        When True (the default) progress messages are printed.

    Returns
    -------
    dict
        A node with keys 'is_leaf', 'prediction', 'splitting_feature', 'left'
        and 'right'. Internal nodes have prediction None; 'left' holds the
        subtree for feature == 0 and 'right' the subtree for feature == 1.
    """
    def log(message):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if verbose:
            print(message)

    remaining_features = list(features)  # Work on a copy of the features.

    target_values = data[target]
    log("--------------------------------------------------------------------")
    log("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))

    # Stopping condition 1: the majority class makes no mistakes at this node.
    if intermediate_node_num_mistakes(target_values) == 0:
        log("Stopping condition 1 reached.")
        return create_leaf(target_values)

    # Stopping condition 2: no features left to split on.
    if not remaining_features:
        log("Stopping condition 2 reached.")
        return create_leaf(target_values)

    # Additional stopping condition: limit the tree depth.
    if current_depth >= max_depth:
        log("Reached maximum depth. Stopping for now.")
        return create_leaf(target_values)

    # Find the best feature to split on and partition the rows by its value.
    splitting_feature = best_splitting_feature(data, remaining_features, target)
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    remaining_features.remove(splitting_feature)
    log("Split on feature %s. (%s, %s)" % (
        splitting_feature, len(left_split), len(right_split)))

    # If every row landed on one side the split separates nothing: make a leaf.
    if len(left_split) == len(data):
        log("Creating leaf node.")
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        log("Creating leaf node.")
        return create_leaf(right_split[target])

    # Recurse on both sides; the chosen feature is no longer available below.
    left_tree = decision_tree_create(left_split, remaining_features, target,
                                     current_depth + 1, max_depth, verbose)
    right_tree = decision_tree_create(right_split, remaining_features, target,
                                      current_depth + 1, max_depth, verbose)

    return {'is_leaf'          : False,
            'prediction'       : None, # only leaves (see create_leaf) carry a prediction
            'splitting_feature': splitting_feature,
            'left'             : left_tree,
            'right'            : right_tree}

Define a function to calculate the number of nodes.

In [44]:
def count_nodes(tree):
    """Return the total number of nodes (internal nodes plus leaves) in `tree`."""
    total = 0
    pending = [tree]  # nodes discovered but not yet counted
    while pending:
        node = pending.pop()
        total += 1
        if not node['is_leaf']:
            pending.append(node['left'])
            pending.append(node['right'])
    return total

Test the above function.

In [64]:
# Node count the assignment expects for a depth-3 tree on the training data.
expected_num_nodes = 13

small_data_decision_tree = decision_tree_create(train_data, features, 'safe_loans', max_depth = 3)
num_nodes = count_nodes(small_data_decision_tree)  # count once, reuse below

# Single-argument print(...) produces the same text on Python 2 and Python 3.
if num_nodes == expected_num_nodes:
    print('Test passed!')
else:
    print('Test failed... try again!')
    print('Number of nodes found                : %s' % num_nodes)
    print('Number of nodes that should be there : %s' % expected_num_nodes)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 3 (1048 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length_n/a. (96, 5)
--------------------------------------------------------------------
Subtree, depth = 3 (96 data points)

Build the tree and test the above functions

In [66]:
# Grow the tree used in the rest of the notebook, limited to depth 6.
my_decision_tree = decision_tree_create(
    data=train_data,
    features=features,
    target=target,
    max_depth=6,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

### 4. Making predictions with a decision tree

Write a function `classify` that predicts the class of a single data point by walking the tree from the root to a leaf

In [71]:
def classify(tree, x, annotate = False):
    """Predict the class of one data point by walking `tree` down to a leaf.

    Parameters
    ----------
    tree : dict
        Node with keys 'is_leaf', 'prediction', 'splitting_feature', 'left'
        and 'right', as built by decision_tree_create.
    x : mapping
        Anything indexable by feature name (e.g. a DataFrame row or a dict).
    annotate : bool
        When True, print every split taken and the final prediction.

    Returns
    -------
    The 'prediction' stored in the leaf that is reached.
    """
    node = tree
    while not node['is_leaf']:
        feature = node['splitting_feature']
        split_feature_value = x[feature]
        if annotate:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Split on %s = %s" % (feature, split_feature_value))
        # A value of 0 goes left; anything else goes right.
        node = node['left'] if split_feature_value == 0 else node['right']

    if annotate:
        print("At leaf, predicting %s" % node['prediction'])
    return node['prediction']

In [109]:
# Preview the first rows of the test set. The tree dict has no '' key, so
# my_decision_tree[''] raises KeyError on a fresh run; the table saved under
# this cell is the head of test_data (its first row is the same row 24 that
# test_data.iloc[0] prints further down).
test_data.head(10)

Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,...,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,emp_length_n/a
24,-1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,-1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60,-1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,-1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132,-1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,-1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
162,-1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175,-1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
184,-1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,-1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Test the above function

In [88]:
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(test_data.iloc[0])
print('Predicted class: %s ' % classify(my_decision_tree, test_data.iloc[0]))

safe_loans                -1.0
grade_A                    0.0
grade_B                    0.0
grade_C                    0.0
grade_D                    1.0
grade_E                    0.0
grade_F                    0.0
grade_G                    0.0
term_ 36 months            0.0
term_ 60 months            1.0
home_ownership_MORTGAGE    0.0
home_ownership_OTHER       0.0
home_ownership_OWN         0.0
home_ownership_RENT        1.0
emp_length_1 year          0.0
emp_length_10+ years       0.0
emp_length_2 years         1.0
emp_length_3 years         0.0
emp_length_4 years         0.0
emp_length_5 years         0.0
emp_length_6 years         0.0
emp_length_7 years         0.0
emp_length_8 years         0.0
emp_length_9 years         0.0
emp_length_< 1 year        0.0
emp_length_n/a             0.0
Name: 24, dtype: float64
Predicted class: -1 


In [101]:
# Trace the path the first test row takes through the tree.
classify(tree=my_decision_tree, x=test_data.iloc[0], annotate=True)

Split on term_ 36 months = 0.0
Split on grade_A = 0.0
Split on grade_B = 0.0
Split on grade_C = 0.0
Split on grade_D = 1.0
At leaf, predicting -1


-1

### 5. Evaluating the decision tree

In [112]:
def evaluate_classification_error(tree, data, target = 'safe_loans'):
    """Return the fraction of rows in `data` that `tree` misclassifies.

    Parameters
    ----------
    tree : dict
        Decision tree accepted by `classify`.
    data : pandas.DataFrame
        Rows to score; must contain the feature columns and `target`.
    target : str
        Name of the label column. It is an explicit argument so the function
        does not depend on a module-level variable; the default matches the
        label column used throughout this notebook.

    Returns
    -------
    float
        (# of mistakes) / (# of rows).
    """
    # axis = 1 applies classify to each row rather than to each column.
    predictions = data.apply(lambda row: classify(tree, row), axis = 1)

    # Vectorised count of disagreements instead of a Python-level sum().
    num_mistakes = (data[target] != predictions).sum()
    return num_mistakes / float(len(data))

In [113]:
# Classification error of the depth-6 tree on the rows it was trained on.
# NOTE(review): this is the training error; pass test_data instead to measure
# the error on the held-out rows.
evaluate_classification_error(my_decision_tree, train_data)

0.38185041908446166

### 6. Printing out a decision stump

A function to visualize the decision stump

In [116]:
def print_stump(tree, name = 'root'):
    """Print an ASCII sketch of one tree node and its two children.

    Parameters
    ----------
    tree : dict
        Node with keys 'splitting_feature', 'prediction', 'left', 'right' and
        'is_leaf'.
    name : str
        Label printed above the node.

    Returns
    -------
    None
        A node without a splitting feature prints a single
        "(leaf, label: ...)" line instead of the sketch.
    """
    split_name = tree['splitting_feature'] # e.g. 'term_ 36 months'
    if split_name is None:
        print("(leaf, label: %s)" % tree['prediction'])
        return None

    # The feature name is printed as is. It used to be unpacked with
    # split('_') into exactly two parts that were never used, which raised
    # ValueError for names with more than one underscore
    # (e.g. 'home_ownership_RENT').

    def describe(child):
        # A child is shown either as a leaf with its label or as a subtree.
        if child['is_leaf']:
            return 'leaf, label: ' + str(child['prediction'])
        return 'subtree'

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('                       %s' % name)
    print('         |---------------|----------------|')
    print('         |                                |')
    print('         |                                |')
    print('         |                                |')
    print('  [{0} == 0]               [{0} == 1]    '.format(split_name))
    print('         |                                |')
    print('         |                                |')
    print('         |                                |')
    print('    (%s)                         (%s)'
          % (describe(tree['left']), describe(tree['right'])))

We can use this function to check every step of this decision tree.

In [117]:
# Show the split made at the root of the tree.
print_stump(my_decision_tree, name='root')

                       root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [term_ 36 months == 0]               [term_ 36 months == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


#### Exploring the intermediate left subtree

Left Side

In [118]:
# Left child of the root, labelled with the feature the root split on.
print_stump(my_decision_tree['left'], name=my_decision_tree['splitting_feature'])

                       term_ 36 months
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_A == 0]               [grade_A == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


In [119]:
# One level further down the left branch, labelled with its parent's split feature.
print_stump(
    my_decision_tree['left']['left'],
    name=my_decision_tree['left']['splitting_feature'],
)

                       grade_A
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_B == 0]               [grade_B == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


Right Side

In [120]:
# Right child of the root, labelled with the feature the root split on.
print_stump(my_decision_tree['right'], name=my_decision_tree['splitting_feature'])

                       term_ 36 months
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_D == 0]               [grade_D == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (leaf, label: -1)


In [121]:
# One level further down the right branch, labelled with its parent's split feature.
print_stump(
    my_decision_tree['right']['right'],
    name=my_decision_tree['right']['splitting_feature'],
)

(leaf, label: -1)


In [1]:
# NOTE(review): leftover scratch cell -- `l` is not used anywhere in the
# visible notebook and the single-letter name is easy to misread; consider
# deleting this cell.
l = [2, 3, 2, 4, 5, 1, 2]