In [576]:
import pandas as pd
import sklearn, sklearn.tree, numpy
import graphviz

In [577]:
# Load the LendingClub loan records; 'url' and 'next_pymnt_d' are read as strings.
loandata = pd.read_csv(
    'data/lending-club-data.csv',
    dtype=dict(url='string', next_pymnt_d='string'),
)

In [578]:
# Recode the label: bad_loans == 0 becomes +1 (safe); every other value becomes -1.
loandata['safe_loan'] = loandata['bad_loans'].map(lambda bad: -1 if bad != 0 else +1)

In [579]:
# Drop the raw label column now that 'safe_loan' carries the same information.
# `axis` is passed by keyword: newer pandas releases no longer accept it positionally.
loandata = loandata.drop('bad_loans', axis=1)
loandata.columns.values

array(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'is_inc_v', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc',
       'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
       'recoveries', 'collection_recovery_fee', 'last_pymnt_d',
       'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans',
       'emp_length_num', 'grade_num', 'sub_grade_num

In [580]:
# Name of the label column, followed by the categorical columns used as inputs.
target = 'safe_loan'
features = [
    'grade',           # grade of the loan
    'term',            # the term of the loan
    'home_ownership',  # home_ownership status: own, mortgage or rent
    'emp_length',      # number of years of employment
]

In [581]:
# Keep only the selected feature columns plus the label column.
loandata = loandata[features + [target]]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(loandata.head())
print(loandata.columns.values)

  grade        term home_ownership emp_length  safe_loan
0     B   36 months           RENT  10+ years          1
1     C   60 months           RENT   < 1 year         -1
2     C   36 months           RENT  10+ years          1
3     C   36 months           RENT  10+ years          1
4     A   36 months           RENT    3 years          1
['grade' 'term' 'home_ownership' 'emp_length' 'safe_loan']


In [582]:
# One-hot encoding turns every categorical (string) column into 0/1 indicator
# columns, so the tree can split on each of them without handling strings.
loans_ohe = pd.get_dummies(loandata)
# The numeric label column is not expanded, so it is still part of this array
# (it appears first in the printed output).
ohe_features = loans_ohe.columns.values
print(ohe_features)

['safe_loan' 'grade_A' 'grade_B' 'grade_C' 'grade_D' 'grade_E' 'grade_F'
 'grade_G' 'term_ 36 months' 'term_ 60 months' 'home_ownership_MORTGAGE'
 'home_ownership_OTHER' 'home_ownership_OWN' 'home_ownership_RENT'
 'emp_length_1 year' 'emp_length_10+ years' 'emp_length_2 years'
 'emp_length_3 years' 'emp_length_4 years' 'emp_length_5 years'
 'emp_length_6 years' 'emp_length_7 years' 'emp_length_8 years'
 'emp_length_9 years' 'emp_length_< 1 year' 'emp_length_n/a']


In [583]:
# Load the downloaded index files that define the train and test splits.
import json

# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('data/module-5-assignment-2-train-idx.json', 'r') as f:
    train_idx = json.load(f)

with open('data/module-5-assignment-2-test-idx.json', 'r') as f:
    test_idx = json.load(f)

In [584]:
# Pull the train and test rows, by position, out of the one-hot encoded loans.
train_data, test_data = (
    loans_ohe.iloc[positions] for positions in (train_idx, test_idx)
)

In [585]:
def intermediate_node_num_mistakes(labels_in_node):
    """Count the mistakes made by predicting the majority class in a node.

    Parameters
    ----------
    labels_in_node : iterable of labels
        Labels of the data points in the node; +1 marks a safe loan and -1 a
        risky one. Any other value is ignored.

    Returns
    -------
    int
        The size of the minority class, i.e. min(#(+1), #(-1)). An empty node
        yields 0.
    """
    safe_loans = 0
    unsafe_loans = 0
    # One pass with plain counters: this avoids len(filter(...)), which only
    # works on Python 2 where filter() returns a list.
    for label in labels_in_node:
        if label == 1:
            safe_loans += 1
        elif label == -1:
            unsafe_loans += 1
    return min(safe_loans, unsafe_loans)
    

In [586]:
# Test case 1
example_labels = pd.DataFrame([-1, -1, 1, 1, 1]).values.flatten()
print(example_labels)
if intermediate_node_num_mistakes(example_labels) == 2:
    print('Test passed!')
else:
    print('Test 1 failed... try again!')

# Test case 2
example_labels = pd.DataFrame([-1, -1, 1, 1, 1, 1, 1]).values.flatten()
if intermediate_node_num_mistakes(example_labels) == 2:
    print('Test passed!')
else:
    # Message previously said "Test 3", which misreported which case failed.
    print('Test 2 failed... try again!')

# Test case 3
example_labels = pd.DataFrame([-1, -1, -1, -1, -1, 1, 1]).values.flatten()
if intermediate_node_num_mistakes(example_labels) == 2:
    print('Test passed!')
else:
    print('Test 3 failed... try again!')

[-1 -1  1  1  1]
Test passed!
Test passed!
Test passed!


In [587]:
def best_splitting_feature(data, features, target):
    """Pick the binary feature whose split gives the lowest classification error.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in the current node; must contain `target` and every candidate
        feature column.
    features : iterable of column names
        Candidate features. The target column may be included anywhere in the
        collection; it is skipped by name and never used as a split.
    target : column name of the +1/-1 label.

    Returns
    -------
    The name of the best feature, or None when `data` is empty or there is no
    candidate feature other than the target. Ties keep the earliest candidate.
    """
    best_feature = None
    best_error = float('inf')

    # Float so the error ratio below is a true division on Python 2 as well.
    num_data_points = float(len(data))
    if num_data_points == 0:
        # No rows: an error rate is undefined, so no split can be chosen.
        return None

    # Consider each feature to decide what to split on.
    for feature in features:
        if feature == target:
            # Exclude the label by name rather than by position, so a real
            # feature is never dropped when the label is not first.
            continue

        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]

        # Mistakes made by majority-class prediction on each side of the split.
        left_mistakes = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])

        error = (left_mistakes + right_mistakes) / num_data_points
        if error < best_error:
            print("setting best feature to %s for error %f" % (feature, error))
            best_error = error
            best_feature = feature

    print("Best Feature is %s" % best_feature)
    return best_feature
        
        

In [588]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority class of `target_values`.

    `target_values` is indexed with a boolean mask, so it is expected to be a
    numpy array or pandas Series of +1/-1 labels. The leaf predicts +1 only
    when +1 labels strictly outnumber -1 labels; ties and empty input give -1.
    """
    positives = target_values[target_values == +1]
    negatives = target_values[target_values == -1]

    # Majority vote, with ties resolved to -1.
    majority = +1 if len(positives) > len(negatives) else -1

    return {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True,
            'prediction': majority}
    

In [589]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10):
    """Recursively grow a binary decision tree on 0/1 features.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows reaching the current node; contains the feature columns and `target`.
    features : sequence of column names
        Features still available for splitting. The target column may be part
        of this collection (the notebook passes it as the first entry); it is
        never chosen as a split.
    target : column name of the +1/-1 label.
    current_depth : depth of the node being built (0 for the root).
    max_depth : nodes at this depth become leaves.

    Returns
    -------
    dict
        A node with keys 'is_leaf', 'prediction', 'splitting_feature', 'left'
        and 'right'. Leaves come from create_leaf; internal nodes hold subtrees
        under 'left' (feature == 0) and 'right' (feature == 1).
    """
    target_values = data[target]
    # numpy.array copies, so the caller's collection is never modified, and a
    # plain list is accepted as well as an array.
    remaining_features = numpy.array(features)

    print("--------------------------------------------------------------------")
    print("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))

    # Stopping condition 1: majority prediction makes no mistakes here (this
    # also covers an empty node), so the node becomes a leaf.
    if intermediate_node_num_mistakes(target_values) == 0:
        print("Stopping condition 1 reached.")
        return create_leaf(target_values)

    # Stopping condition 2: nothing left to split on. The target column does
    # not count, otherwise this check could never fire.
    if not any(feature != target for feature in remaining_features):
        print("Stopping condition 2 reached.")
        return create_leaf(target_values)

    # Additional stopping condition (limit tree depth).
    if current_depth >= max_depth:
        print("Reached maximum depth. Stopping for now.")
        return create_leaf(target_values)

    # Find the best splitting feature.
    splitting_feature = best_splitting_feature(data, remaining_features, target)
    if splitting_feature is None:
        # No usable feature was found; fall back to a leaf instead of failing
        # on a column lookup with None.
        print("Stopping condition 2 reached.")
        return create_leaf(target_values)
    print("**** Splitting on %s" % splitting_feature)

    # Split on the best feature that we found.
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]

    # Remove the chosen feature so that it is not reused further down the tree.
    itemindex = numpy.where(remaining_features == splitting_feature)[0]
    remaining_features = numpy.delete(remaining_features, itemindex[0])
    print("***")
    print("splitting feature %s at index %d should be removed" % (splitting_feature, itemindex[0]))
    print(remaining_features)
    print("***")

    print("Split on feature %s. (%s, %s)" % (
        splitting_feature, len(left_split), len(right_split)))

    # Create a leaf node if the split is "perfect", i.e. one side got every row.
    if len(left_split) == len(data):
        print("Creating leaf node.")
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print("Creating leaf node.")
        return create_leaf(right_split[target])

    # Repeat (recurse) on left and right subtrees.
    left_tree = decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth)
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth + 1, max_depth)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree,
            'right'            : right_tree}

In [590]:
# Grow the tree on the training split, capped at depth 6.
tree_model = decision_tree_create(
    data=train_data,
    features=ohe_features,
    target=target,
    max_depth=6,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
setting best feature to grade_A for error 0.433430
setting best feature to term_ 36 months for error 0.421637
Best Feature is term_ 36 months
**** Splitting on term_ 36 months
***
splitting feature term_ 36 months at index 8 should be removed
['safe_loan' 'grade_A' 'grade_B' 'grade_C' 'grade_D' 'grade_E' 'grade_F'
 'grade_G' 'term_ 60 months' 'home_ownership_MORTGAGE'
 'home_ownership_OTHER' 'home_ownership_OWN' 'home_ownership_RENT'
 'emp_length_1 year' 'emp_length_10+ years' 'emp_length_2 years'
 'emp_length_3 years' 'emp_length_4 years' 'emp_length_5 years'
 'emp_length_6 years' 'emp_length_7 years' 'emp_length_8 years'
 'emp_length_9 years' 'emp_length_< 1 year' 'emp_length_n/a']
***
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
setting best feature to grade_A for er

In [591]:
#left_split = train_data[train_data[ohe_features[1]] == 0]
#print left_split
#itemindex = numpy.where(ohe_features=='emp_length_n/a')[0]
#print itemindex[0]
##foo = ohe_features
#print foo
#foo = numpy.delete(foo,itemindex[0])
#print foo
#print test_data[24]

In [592]:
# Show the first test example (selected by position, all columns).
test_data.iloc[0, :]

safe_loan                 -1
grade_A                    0
grade_B                    0
grade_C                    0
grade_D                    1
grade_E                    0
grade_F                    0
grade_G                    0
term_ 36 months            0
term_ 60 months            1
home_ownership_MORTGAGE    0
home_ownership_OTHER       0
home_ownership_OWN         0
home_ownership_RENT        1
emp_length_1 year          0
emp_length_10+ years       0
emp_length_2 years         1
emp_length_3 years         0
emp_length_4 years         0
emp_length_5 years         0
emp_length_6 years         0
emp_length_7 years         0
emp_length_8 years         0
emp_length_9 years         0
emp_length_< 1 year        0
emp_length_n/a             0
Name: 24, dtype: float64

In [593]:
def classify(tree, x, annotate = False):
    """Predict the label of one example by walking the tree down to a leaf.

    Parameters
    ----------
    tree : dict
        Node dict with 'is_leaf', 'prediction', 'splitting_feature', 'left'
        and 'right' keys.
    x : mapping indexable by feature name (e.g. a DataFrame row or a dict).
    annotate : when True, print every split taken and the final prediction.

    Returns
    -------
    The 'prediction' stored in the leaf that is reached.
    """
    node = tree
    # Iterative descent: the left branch is taken when the feature value is 0,
    # the right branch for any other value.
    while not node['is_leaf']:
        split_feature_value = x[node['splitting_feature']]
        if annotate:
            print("Split on %s=%s" % (node['splitting_feature'], split_feature_value))
        node = node['left'] if split_feature_value == 0 else node['right']

    if annotate:
        print("at leaf, predicting %s" % node['prediction'])
    return node['prediction']
        

In [594]:
# Show the learned tree, then trace the prediction for the first test example.
print(tree_model)
print('predicted class: %s' % classify(tree_model, test_data.iloc[0], annotate=True))

{'is_leaf': False, 'splitting_feature': 'term_ 36 months', 'right': {'is_leaf': False, 'splitting_feature': 'grade_D', 'right': {'is_leaf': True, 'splitting_feature': None, 'right': None, 'prediction': -1, 'left': None}, 'prediction': None, 'left': {'is_leaf': False, 'splitting_feature': 'grade_E', 'right': {'is_leaf': True, 'splitting_feature': None, 'right': None, 'prediction': -1, 'left': None}, 'prediction': None, 'left': {'is_leaf': False, 'splitting_feature': 'grade_F', 'right': {'is_leaf': False, 'splitting_feature': 'emp_length_8 years', 'right': {'is_leaf': False, 'splitting_feature': 'home_ownership_OWN', 'right': {'is_leaf': True, 'splitting_feature': None, 'right': None, 'prediction': -1, 'left': None}, 'prediction': None, 'left': {'is_leaf': True, 'splitting_feature': None, 'right': None, 'prediction': 1, 'left': None}}, 'prediction': None, 'left': {'is_leaf': True, 'splitting_feature': None, 'right': None, 'prediction': -1, 'left': None}}, 'prediction': None, 'left': {'is

In [595]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(test_data.columns.values)

['safe_loan' 'grade_A' 'grade_B' 'grade_C' 'grade_D' 'grade_E' 'grade_F'
 'grade_G' 'term_ 36 months' 'term_ 60 months' 'home_ownership_MORTGAGE'
 'home_ownership_OTHER' 'home_ownership_OWN' 'home_ownership_RENT'
 'emp_length_1 year' 'emp_length_10+ years' 'emp_length_2 years'
 'emp_length_3 years' 'emp_length_4 years' 'emp_length_5 years'
 'emp_length_6 years' 'emp_length_7 years' 'emp_length_8 years'
 'emp_length_9 years' 'emp_length_< 1 year' 'emp_length_n/a']


In [596]:
def report_diff(x):
    """Return x[0] when the first two items of `x` match, else 'x[0] | x[1]' as text."""
    if x[0] == x[1]:
        return x[0]
    return '{} | {}'.format(*x)

def evaluate_classification_error(tree, data, target='safe_loan'):
    """Compute the fraction of rows in `data` that `tree` misclassifies.

    Parameters
    ----------
    tree : dict
        Decision tree node as accepted by classify.
    data : pandas.DataFrame
        Rows to evaluate; must contain the feature columns and `target`.
    target : name of the true-label column (defaults to 'safe_loan').

    Prints the number of misclassified rows followed by the error rate.

    Returns
    -------
    float
        Number of mistakes divided by the number of rows.
    """
    # Apply classify(tree, x) to each row of the data.
    prediction = data.apply(lambda x: classify(tree, x), axis=1)

    # A mistake is any row whose prediction differs from the true label.
    num_mistakes = int((prediction != data[target]).sum())
    print(num_mistakes)

    error = float(num_mistakes) / float(len(data))
    print(error)
    # Also return the value so callers can use it instead of reading stdout.
    return error

In [597]:

# Classification error of the learned tree on the held-out test split.
evaluate_classification_error(tree=tree_model, data=test_data)


3563
0.383778543731


**What is the classification error?** 0.38 (0.3838 on the test data, as printed above)

In [598]:
def print_stump(tree, name = 'root'):
    """Print an ASCII sketch of one tree node and its two children.

    Parameters
    ----------
    tree : dict
        Node dict with 'splitting_feature' and 'prediction' keys; a non-leaf
        node also needs 'left' and 'right' child dicts that carry 'is_leaf'
        and 'prediction'.
    name : label printed above the node (defaults to 'root').

    The splitting feature is always printed first. For a node without a
    splitting feature only its label is printed. Returns None.
    """
    split_name = tree['splitting_feature'] # split_name is something like 'term_ 36 months'
    print(split_name)
    if split_name is None:
        print("(leaf, label: %s)" % tree['prediction'])
        return None

    # Each child is described either by its leaf label or as a subtree.
    left_desc = ('leaf, label: ' + str(tree['left']['prediction'])
                 if tree['left']['is_leaf'] else 'subtree')
    right_desc = ('leaf, label: ' + str(tree['right']['prediction'])
                  if tree['right']['is_leaf'] else 'subtree')

    print('                       %s' % name)
    print('         |---------------|----------------|')
    print('         |                                |')
    print('         |                                |')
    print('         |                                |')
    print('  [{0} == 0]               [{0} == 1]    '.format(split_name))
    print('         |                                |')
    print('         |                                |')
    print('         |                                |')
    print('    (%s)                         (%s)' % (left_desc, right_desc))

In [599]:
# Draw the root node of the learned tree.
print_stump(tree_model, name='root')

term_ 36 months
                       root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [term_ 36 months == 0]               [term_ 36 months == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


In [600]:
# Draw the root's left child, labelled with the feature the root splits on.
print_stump(tree_model['left'], name=tree_model['splitting_feature'])

grade_A
                       term_ 36 months
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_A == 0]               [grade_A == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


In [601]:
# Draw the left-left grandchild, labelled with the feature its parent splits on.
print_stump(tree_model['left']['left'], name=tree_model['left']['splitting_feature'])

grade_B
                       grade_A
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_B == 0]               [grade_B == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)
