# Decision Trees

## Import

In [1]:
import graphlab
graphlab.canvas.set_target('ipynb')

In [None]:
loans = graphlab.SFrame('lending-club-data.gl/')

In [10]:
loans[0: 1]

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade
1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2

emp_title,emp_length,home_ownership,annual_inc,is_inc_v,issue_d,loan_status,pymnt_plan
,10+ years,RENT,24000,Verified,20111201T000000,Fully Paid,n

url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs
https://www.lendingclub.c om/browse/loanDetail. ...,Borrower added on 12/22/11 > I need to ...,credit_card,Computer,860xx,AZ,27.65,0

earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal
19850101T000000,1,,,3,0,13648

revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,...
83.7,9,f,0.0,0.0,5861.07,5831.78,...


## Data exploration

In [3]:
loans.column_names()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'is_inc_v',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'not_compliant',
 'status',
 'inactive_loans',
 'bad_loans',
 'emp_length_num',
 'grade_num',
 'sub_grade_num',
 'delinq_2yrs_zero',
 'pub_rec

In [4]:
loans['grade'].show()

In [5]:
loans['home_ownership'].show()

### Transform target

In [11]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x == 0 else -1)
loans = loans.remove_column('bad_loans')

In [12]:
loans['safe_loans'].show(view = 'Categorical')

### Features for the classification

In [13]:
features = ['grade',
            'sub_grade',
            'short_emp',
            'emp_length_num',
            'home_ownership',
            'dti',
            'purpose',
            'term',
            'last_delinq_none',
            'last_major_derog_none',
            'revol_util',
            'total_rec_late_fee',
           ]
target = 'safe_loans'
loans = loans[features + [target]]

In [15]:
loans[0:1]

grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none
B,B2,0,11,RENT,27.65,credit_card,36 months,1

last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,83.7,0.0,1


## Balance classes

In [16]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 99457
Number of risky loans : 23150


In [21]:
print "Percentage of safe loans  :", (len(safe_loans_raw) * 100) / (len(safe_loans_raw) + len(risky_loans_raw))
print "Percentage of risky loans :", (len(risky_loans_raw) * 100) / (len(safe_loans_raw) + len(risky_loans_raw))

Percentage of safe loans  : 81
Percentage of risky loans : 18


In [22]:
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)

In [24]:
print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

Percentage of safe loans                 : 0.502236174422
Percentage of risky loans                : 0.497763825578
Total number of loans in our new dataset : 46508


## Split data

In [25]:
train_data, validation_data = loans_data.random_split(.8, seed = 1)

## Build model DTC

In [26]:
decision_tree_model = graphlab.decision_tree_classifier.create(train_data, validation_set = None, target = target, features = features)

## Visualize model

In [27]:
small_model = graphlab.decision_tree_classifier.create(train_data, validation_set = None, target = target, features = features, max_depth = 2)

In [28]:
small_model.show(view = "Tree")

## Make predictions

In [29]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none
B,B3,0,11,OWN,11.18,credit_card,36 months,1
D,D1,0,10,RENT,16.85,debt_consolidation,36 months,1
D,D2,0,3,RENT,13.97,other,60 months,0
A,A5,0,11,MORTGAGE,16.33,debt_consolidation,36 months,1

last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,82.4,0.0,1
1,96.4,0.0,1
1,59.5,0.0,-1
1,62.1,0.0,-1


### Explore predictions

In [46]:
predictions = decision_tree_model.predict(sample_validation_data)
predictions

dtype: int
Rows: 4
[1L, -1L, -1L, 1L]

In [51]:
print sum(sample_validation_data['safe_loans'] == predictions) * 100 / len(sample_validation_data)

50


## Explore probability predictions

In [53]:
decision_tree_model.predict(sample_validation_data, output_type = 'probability')

dtype: float
Rows: 4
[0.5473502278327942, 0.4891221821308136, 0.4559234082698822, 0.5864479541778564]

In [54]:
small_model.predict(sample_validation_data, output_type = 'probability')

dtype: float
Rows: 4
[0.5242817997932434, 0.47226759791374207, 0.47226759791374207, 0.5798847675323486]

## Visualize the prediction on a tree

In [55]:
sample_validation_data[1]

{'dti': 16.85,
 'emp_length_num': 10L,
 'grade': 'D',
 'home_ownership': 'RENT',
 'last_delinq_none': 1L,
 'last_major_derog_none': 1L,
 'purpose': 'debt_consolidation',
 'revol_util': 96.4,
 'safe_loans': 1L,
 'short_emp': 0L,
 'sub_grade': 'D1',
 'term': ' 36 months',
 'total_rec_late_fee': 0.0}

In [56]:
small_model.show(view = "Tree")

In [58]:
small_model.predict(sample_validation_data[1])

dtype: int
Rows: 1
[-1L]

## Model evaluation

In [59]:
print small_model.evaluate(train_data)['accuracy']
print decision_tree_model.evaluate(train_data)['accuracy']

0.613502041694
0.640581345369


In [60]:
print small_model.evaluate(validation_data)['accuracy']
print decision_tree_model.evaluate(validation_data)['accuracy']

0.619345109866
0.636794485136


## Complex DTC

In [61]:
big_model = graphlab.decision_tree_classifier.create(train_data, validation_set = None, target = target, features = features, max_depth = 10)

In [62]:
print big_model.evaluate(train_data)['accuracy']
print big_model.evaluate(validation_data)['accuracy']

0.665538362347
0.627423524343


## Evaluation

In [63]:
predictions = decision_tree_model.predict(validation_data)

In [87]:
false_positives = sum(validation_data[validation_data['safe_loans'] != predictions]['safe_loans'] == -1)
print false_positives

1656


In [89]:
false_negatives = sum(validation_data[validation_data['safe_loans'] != predictions]['safe_loans'] == 1)
print false_negatives

1716
