# Random Forest - Guided Example 

In [1]:
%%HTML
<script>
  // Tracks whether the notebook's code-input cells are currently visible.
  // Declared explicitly so the script does not depend on an implicit global.
  var code_shown = false;

  // Toggle the visibility of every code-input cell and relabel the button
  // with the action the next click will perform.
  function code_toggle() {
    if (code_shown) {
      // Numeric duration in milliseconds: jQuery only honours numbers or the
      // names 'fast'/'slow', so the string '500' fell back to the default speed.
      $('div.input').hide(500);
      $('#toggleButton').val('Show Code');
    } else {
      $('div.input').show(500);
      $('#toggleButton').val('Hide Code');
    }
    code_shown = !code_shown;
  }

  // Once the page is ready, start with the code cells hidden.
  $(document).ready(function () {
    code_shown = false;
    $('div.input').hide();
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton"
value="Show Code"></form>

### Imports 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

### Data

In [3]:
# Load the 2015 LendingClub export. header=1 takes the column names from the
# second line of the file, and whitespace following each delimiter is skipped.
# Note: this read is accompanied by a warning about dtypes.
y2015 = pd.read_csv(
    'LendingClub2015.csv',
    header=1,
    skipinitialspace=True,
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
y2015.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,23850.0,23850.0,23850.0,60 months,17.27%,596.21,D,D3,...,,,Cash,N,,,,,,
1,,,15000.0,15000.0,15000.0,60 months,10.78%,324.5,B,B4,...,,,Cash,N,,,,,,
2,,,7200.0,7200.0,7200.0,36 months,17.27%,257.67,D,D3,...,,,Cash,Y,May-2018,BROKEN,Nov-2017,2761.0,50.01,10.0
3,,,12000.0,12000.0,12000.0,36 months,9.17%,382.55,B,B2,...,,,Cash,N,,,,,,
4,,,11950.0,11950.0,11950.0,36 months,13.44%,405.18,C,C3,...,,,Cash,N,,,,,,


### Random Forest Classifier

In [5]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

In [None]:
# First attempt: one-hot encode the categorical columns of the full frame and
# cross-validate a default random forest against the loan_status label.
rfc = ensemble.RandomForestClassifier()

# `axis` is passed by keyword: the positional form drop(label, 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
X = y2015.drop('loan_status', axis=1)
Y = y2015['loan_status']
X = pd.get_dummies(X)

cross_val_score(rfc, X, Y, cv=5)

The kernel died while attempting `cross_val_score`.<br><br>

Find all the categorical features that have more than **20** unique values and remove them from the dataset.

In [6]:
# Scan the object-dtype (text) columns and record those with more than 20
# distinct values, printing each recorded name followed by its count.
# (The list is called `under30`, but it holds the columns ABOVE the 20 cutoff.)
categorical = y2015.select_dtypes(include=['object'])
under30 = []
for name, column in categorical.items():
    distinct = column.nunique()
    if distinct <= 20:
        continue
    under30.append(name)
    print(name)
    print(distinct)
print(under30)

int_rate
111
sub_grade
35
emp_title
120812
desc
34
title
27
zip_code
914
addr_state
49
earliest_cr_line
668
revol_util
1211
last_pymnt_d
48
last_credit_pull_d
49
hardship_start_date
23
hardship_end_date
24
payment_plan_start_date
24
debt_settlement_flag_date
41
settlement_date
44
['int_rate', 'sub_grade', 'emp_title', 'desc', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'debt_settlement_flag_date', 'settlement_date']


Convert certain columns to numeric

In [7]:
# Coerce the id and interest-rate columns to numbers; any value that cannot
# be parsed becomes NaN (errors='coerce').
loan_ids = y2015['id']
rate_text = y2015['int_rate'].str.strip('%')  # remove the percent sign before parsing
y2015['id'] = pd.to_numeric(loan_ids, errors='coerce')
y2015['int_rate'] = pd.to_numeric(rate_text, errors='coerce')

Drop other unnecessary columns

In [8]:
# Discard the text columns that have more than 20 distinct values.
# NOTE(review): 'int_rate' is in this list even though the preceding cell
# converts it to numeric - confirm that dropping it here is intended.
high_cardinality_cols = [
    'int_rate', 'sub_grade', 'emp_title', 'desc', 'title', 'zip_code',
    'addr_state', 'earliest_cr_line', 'revol_util', 'last_pymnt_d',
    'last_credit_pull_d', 'hardship_start_date', 'hardship_end_date',
    'payment_plan_start_date', 'debt_settlement_flag_date', 'settlement_date',
]
y2015.drop(columns=high_cardinality_cols, inplace=True)

Remove summary rows

In [9]:
# The last two rows are summary lines that don't actually contain data; slice them off.
# Caution: every re-run of this cell trims two more rows from y2015.
y2015 = y2015.iloc[:-2]

Get dummies

In [10]:
# One-hot encode the categorical columns and preview only the first ten rows,
# rather than rendering the entire encoded frame as cell output.
pd.get_dummies(y2015).head(10)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,url,dti,delinq_2yrs,...,hardship_loan_status_Current,hardship_loan_status_In Grace Period,hardship_loan_status_Late (16-30 days),hardship_loan_status_Late (31-120 days),disbursement_method_Cash,debt_settlement_flag_N,debt_settlement_flag_Y,settlement_status_ACTIVE,settlement_status_BROKEN,settlement_status_COMPLETE
0,,,23850.0,23850.0,23850.0,596.21,68046.0,,24.71,1.0,...,0,0,0,0,1,1,0,0,0,0
1,,,15000.0,15000.0,15000.0,324.50,65000.0,,29.28,0.0,...,0,0,0,0,1,1,0,0,0,0
2,,,7200.0,7200.0,7200.0,257.67,20000.0,,13.21,1.0,...,0,0,0,0,1,0,1,0,1,0
3,,,12000.0,12000.0,12000.0,382.55,39400.0,,26.32,0.0,...,0,0,0,0,1,1,0,0,0,0
4,,,11950.0,11950.0,11950.0,405.18,34000.0,,10.20,0.0,...,0,0,0,0,1,1,0,0,0,0
5,,,20000.0,20000.0,20000.0,691.84,110000.0,,12.45,0.0,...,0,0,0,0,1,1,0,0,0,0
6,,,24700.0,24700.0,24700.0,820.28,65000.0,,16.06,1.0,...,0,0,0,0,1,1,0,0,0,0
7,,,17475.0,17475.0,17475.0,597.17,39750.0,,23.58,0.0,...,0,0,0,0,1,0,1,1,0,0
8,,,18000.0,18000.0,18000.0,559.83,75000.0,,25.65,0.0,...,0,0,0,0,1,1,0,0,0,0
9,,,16000.0,16000.0,16000.0,363.07,90000.0,,6.39,0.0,...,0,0,0,0,1,1,0,0,0,0


Attempt Random Forest Classifier again

In [11]:
# Second attempt: cross-validate a default random forest on the trimmed frame.
rfc = ensemble.RandomForestClassifier()

# `axis` is passed by keyword: the positional form drop(label, 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
X = y2015.drop('loan_status', axis=1)
Y = y2015['loan_status']
X = pd.get_dummies(X)
# Remove every feature column that still contains a missing value.
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)

array([0.98325893, 0.98620346, 0.98798442, 0.98843533, 0.98748516,
       0.98750891, 0.98560809, 0.98769801, 0.97382858, 0.98767426])

Cross-validation shows an average accuracy of about 98%. Next, try removing as many features as possible while still keeping the average accuracy at 90% or higher.

In [12]:
# Keep only the columns that hold at least 10 non-missing values.
y2015.dropna(axis='columns', thresh=10, inplace=True)

In [13]:
# Show the names of the columns that remain.
y2015.columns.tolist()

['loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'installment',
 'grade',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'purpose',
 'dti',
 'delinq_2yrs',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'application_type',
 'annual_inc_joint',
 'dti_joint',
 'verification_status_joint',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'il_util',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'total_rev

In [14]:
# Features removed for the reduced model; every column not named here
# (including the loan_status target) is carried over into y2015dropped.
pruned_features = [
    'funded_amnt_inv', 'emp_length', 'issue_d', 'pymnt_plan', 'purpose', 'dti',
    'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
    'open_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
    'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
    'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
    'last_pymnt_amnt', 'next_pymnt_d', 'collections_12_mths_ex_med',
    'mths_since_last_major_derog', 'policy_code', 'application_type',
    'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
    'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
    'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
    'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
    'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths',
    'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op',
    'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',
    'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
    'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
    'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
    'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts',
    'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd',
    'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq',
    'percent_bc_gt_75', 'total_bal_ex_mort', 'deferral_term',
    'hardship_amount', 'hardship_length', 'hardship_dpd',
    'hardship_loan_status', 'orig_projected_additional_accrued_interest',
    'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
    'disbursement_method', 'debt_settlement_flag', 'settlement_status',
    'settlement_amount', 'settlement_percentage', 'settlement_term',
]
y2015dropped = y2015.drop(columns=pruned_features)

In [15]:
# Cross-validate a default random forest on the reduced feature set and
# report the mean accuracy as a percentage.
rfc = ensemble.RandomForestClassifier()

# `axis` is passed by keyword: the positional form drop(label, 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
X = y2015dropped.drop('loan_status', axis=1)
Y = y2015dropped['loan_status']
X = pd.get_dummies(X)
# Remove every feature column that still contains a missing value.
X = X.dropna(axis=1)

crossval = cross_val_score(rfc, X, Y, cv=10)
# cross_val_score returns a NumPy array, so its mean is taken directly
# instead of being accumulated in a manual loop.
cvscore = crossval.mean()

print(crossval)
print("Average Score: ", (cvscore * 100).round(3))

[0.88611322 0.91902546 0.92840521 0.93690485 0.93129898 0.92880551
 0.9174959  0.91029995 0.89723799 0.86275441]
Average Score:  91.183


With only 24 of the original 145 columns, we still reach an average cross-validation score of 91%.