In [7]:
import graphlab
from graphlab import SFrame
graphlab.canvas.set_target('ipynb')
import numpy as np

In [8]:
# Load the loan records from a CSV file in the working directory into an SFrame.
loans = SFrame('lending-club-data.csv')

This non-commercial license of GraphLab Create for academic use is assigned to mukesh.mithrakumar@jacks.sdstate.edu and will expire on June 17, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1500544058.log


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,int,int,int,int,str,float,float,str,str,str,str,str,int,str,str,str,str,str,str,str,str,str,str,float,int,str,int,int,int,int,int,int,float,int,str,float,float,float,float,float,float,float,float,float,str,float,str,str,int,str,int,int,str,int,int,int,int,float,int,int,int,int,float,str,int,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [51]:
# Keep only the loans whose stated purpose is 'house' and print a preview.
is_house_loan = loans['purpose'] == 'house'
sf_filter = loans[is_house_loan]

# Positional arguments kept as in the original call (presumably row and column
# limits for the printed table -- confirm against SFrame.print_rows).
sf_filter.print_rows(20, 68)

+---------+-----------+-----------+-------------+-----------------+-----------+
|    id   | member_id | loan_amnt | funded_amnt | funded_amnt_inv |    term   |
+---------+-----------+-----------+-------------+-----------------+-----------+
| 1062177 |  1294027  |   15000   |    15000    |      15000      | 36 months |
| 1049352 |  1280767  |    6600   |     6600    |       6600      | 36 months |
| 1062976 |  1295062  |    4000   |     4000    |       4000      | 60 months |
| 1058564 |  1290157  |    8000   |     8000    |       8000      | 36 months |
| 1061837 |  1293455  |   15000   |    15000    |      14975      | 36 months |
| 1058892 |  1290489  |    8000   |     8000    |       8000      | 36 months |
| 1047771 |  1278668  |   15000   |    15000    |      15000      | 36 months |
| 1047014 |  1277879  |    6000   |     6000    |       6000      | 36 months |
| 1043408 |  1273730  |   35000   |    35000    |      35000      | 36 months |
| 1045841 |  1276260  |   35000   |    3

In [53]:
# Visualise the 'purpose' column (the canvas target was set to 'ipynb' in the setup cell).
loans['purpose'].show()

In [54]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
#
# The conversion is guarded so this cell can be re-run safely: once 'bad_loans'
# has been replaced by 'safe_loans', running the conversion again would fail
# because the 'bad_loans' column no longer exists.
if 'safe_loans' not in loans.column_names():
    # bad_loans == 0 maps to +1 (safe); any other value maps to -1 (risky).
    loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
    loans = loans.remove_column('bad_loans')

In [55]:
# Visualise the +1 / -1 label column using the 'Categorical' view.
loans['safe_loans'].show(view = 'Categorical')

In [56]:
# Input columns used by the models trained in this notebook.
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home_ownership status: own, mortgage or rent
    'dti',                    # debt to income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has borrower had a delinquency
    'last_major_derog_none',  # has borrower had 90 day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 means risky.
target = 'safe_loans'

# Narrow the SFrame down to the feature columns plus the target column.
selected_columns = features + [target]
loans = loans[selected_columns]

In [57]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 27962
Number of risky loans : 4921


In [67]:
total = (len(safe_loans_raw)+len(risky_loans_raw))
print "Percentage of safe loans  :", len(safe_loans_raw) / float(total)
print "Percentage of risky loans :", len(risky_loans_raw) / float(total)

Percentage of safe loans  : 0.850348204239
Percentage of risky loans : 0.149651795761


In [68]:
# The risky class is the smaller one, so all of it is kept and the safe class
# is undersampled by the risky-to-safe size ratio. Despite its name,
# `percentage` holds that ratio as a fraction.
num_risky = len(risky_loans_raw)
num_safe = len(safe_loans_raw)
percentage = num_risky / float(num_safe)

risky_loans = risky_loans_raw
# A fixed seed is passed to the sampler.
safe_loans = safe_loans_raw.sample(percentage, seed=1)

# Stack the risky loans and the downsampled safe loans into one dataset.
loans_data = risky_loans.append(safe_loans)

In [69]:
print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

Percentage of safe loans                 : 0.500912778905
Percentage of risky loans                : 0.499087221095
Total number of loans in our new dataset : 9860


In [70]:
# Randomly split the balanced data into two parts using a 0.8 fraction and a
# fixed seed; the first part is used for training, the second for validation.
train_data, validation_data = loans_data.random_split(.8, seed=1)

In [71]:
# Decision tree trained on the training split; no max_depth is given, so the
# library's own setting applies. No validation set is passed to the trainer.
decision_tree_model = graphlab.decision_tree_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None)

In [72]:
# Same training call as the main model, but with the tree limited to depth 2.
small_model = graphlab.decision_tree_classifier.create(
    train_data,
    target=target,
    features=features,
    max_depth=2,
    validation_set=None)

In [73]:
# Display the depth-2 model using the "Tree" view.
small_model.show(view="Tree")

In [74]:
# Split the validation set by label (+1 safe, -1 risky).
validation_labels = validation_data[target]
validation_safe_loans = validation_data[validation_labels == 1]
validation_risky_loans = validation_data[validation_labels == -1]

# Take the first two rows of each class ...
sample_validation_data_safe = validation_safe_loans[0:2]
sample_validation_data_risky = validation_risky_loans[0:2]

# ... and stack them, safe rows first, into a four-row sample.
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)

# Bare expression kept last so the notebook displays the sample.
sample_validation_data

grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none
A,A4,0,6,MORTGAGE,14.03,debt_consolidation,36 months,1
C,C5,0,8,RENT,6.35,credit_card,36 months,1
D,D2,0,3,RENT,13.97,other,60 months,0
A,A5,0,11,MORTGAGE,16.33,debt_consolidation,36 months,1

last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,27.4,0.0,1
1,60.5,0.0,1
1,59.5,0.0,-1
1,62.1,0.0,-1


In [75]:
# Depth-2 tree fitted directly on the four-row validation sample.
# NOTE(review): this model is trained on the same rows it is later scored on.
test_model = graphlab.decision_tree_classifier.create(
    sample_validation_data,
    target=target,
    features=features,
    max_depth=2,
    validation_set=None)

In [79]:
prob = test_model.predict(sample_validation_data, output_type='probability')
score = test_model.predict(sample_validation_data)

print prob
print score

[0.529964029788971, 0.529964029788971, 0.4501660168170929, 0.4501660168170929]
[1, 1, -1, -1]


In [80]:
# Fraction of sample rows whose predicted label equals the true label.
sample_predictions = test_model.predict(sample_validation_data)
sample_matches = sample_validation_data['safe_loans'] == sample_predictions

# Bare expression kept last so the notebook displays the accuracy.
sample_matches.sum() / float(len(sample_validation_data))

1.0

In [82]:
# Probability output of the depth-2 model trained on train_data, for the sample rows.
prob_small = small_model.predict(sample_validation_data, output_type='probability')
print prob_small

[0.540485680103302, 0.540485680103302, 0.4412866234779358, 0.540485680103302]


In [83]:
# Display the row at index 1 of the sample (the second row).
sample_validation_data[1]

{'dti': 6.35,
 'emp_length_num': 8,
 'grade': 'C',
 'home_ownership': 'RENT',
 'last_delinq_none': 1,
 'last_major_derog_none': 1,
 'purpose': 'credit_card',
 'revol_util': 60.5,
 'safe_loans': 1,
 'short_emp': 0,
 'sub_grade': 'C5',
 'term': '36 months',
 'total_rec_late_fee': 0.0}

In [84]:
# Display the depth-2 model's "Tree" view again, alongside the row shown above it.
small_model.show(view="Tree")

In [85]:
# Predict the class label for the single row at index 1 of the sample.
small_model.predict(sample_validation_data[1])

dtype: int
Rows: 1
[1]

In [87]:
print "small model on the train data: ", small_model.evaluate(train_data)['accuracy']
print "decision model on the train data: ", decision_tree_model.evaluate(train_data)['accuracy']

small model on the train data:  0.668985580572
decision model on the train data:  0.682772577789


In [88]:
print "small model on the validation data: ", small_model.evaluate(validation_data)['accuracy']
print "decision model on the validation data: ", decision_tree_model.evaluate(validation_data)['accuracy']

small model on the validation data:  0.664278403275
decision model on the validation data:  0.673490276356


In [89]:
# Same training call as the other models, but with max_depth raised to 10.
big_model = graphlab.decision_tree_classifier.create(
    train_data,
    target=target,
    features=features,
    max_depth=10,
    validation_set=None)

In [90]:
print "big model on the train data: ", big_model.evaluate(train_data)['accuracy']
print "big model on the validation data: ", big_model.evaluate(validation_data)['accuracy']

big model on the train data:  0.734758411333
big model on the validation data:  0.646878198567


* **False negatives**: Loans that were actually safe but were predicted to be risky.
* **False positives**: Loans that were actually risky but were predicted to be safe.

In [91]:
# Predictions of the main decision tree model for every row of the validation split.
predictions = decision_tree_model.predict(validation_data)

In [96]:
sum_false_positive = (validation_data[validation_data['safe_loans'] != predictions]['safe_loans'] == -1).sum()
sum_false_negative = (validation_data[validation_data['safe_loans'] != predictions]['safe_loans'] == +1).sum()

print "false positive: ", sum_false_positive
print "false negative: ", sum_false_negative

false positive:  459
false negative:  179


In [99]:
total_cost = sum_false_positive*20000 + sum_false_negative*10000
print total_cost

10970000
