In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as plt
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Replace the path with the correct path for your data.
y2015 = pd.read_csv(
    'https://www.dropbox.com/s/0so14yudedjmm5m/LoanStats3d.csv?dl=1',
    skipinitialspace=True,
    header=1
)
# Note the warning about dtypes.

  interactivity=interactivity, compiler=compiler, result=result)


## Data cleaning

In [3]:
categorical = y2015.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    print(column.nunique())

id
421097
term
2
int_rate
110
grade
7
sub_grade
35
emp_title
120812
emp_length
11
home_ownership
4
verification_status
3
issue_d
12
loan_status
7
pymnt_plan
1
url
421095
desc
34
purpose
14
title
27
zip_code
914
addr_state
49
earliest_cr_line
668
revol_util
1211
initial_list_status
2
last_pymnt_d
25
next_pymnt_d
4
last_credit_pull_d
26
application_type
2
verification_status_joint
3


In [4]:
# Convert ID and Interest Rate to numeric.
y2015['id'] = pd.to_numeric(y2015['id'], errors='coerce')
y2015['int_rate'] = pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique variables
y2015.drop(['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
            'sub_grade', 'addr_state', 'desc'], 1, inplace=True)
y2015 = y2015[:-2]

## Full Dataset model - Speed

In [5]:
%%time
rfc = ensemble.RandomForestClassifier()
X = y2015.drop(['loan_status'], 1)
Y = y2015['loan_status']
X = pd.get_dummies(X)
#There are 400,000+ rows so its ok if we don't impute the columns for missingness. We can just drop them and still have
#rich enough information
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)

Wall time: 4min 9s


## Improved/Simpler Model

In [6]:
#variables related to time, but have the tendency to contribute little information
#Too many dummy variables created
y2015.drop(
['last_pymnt_d', 'next_pymnt_d', 'title', 'issue_d', 'last_pymnt_amnt', 'last_credit_pull_d', 'member_id', 'id', 'emp_length'],
axis=1, inplace=True)


In [7]:
print(len(y2015.columns))
y2015.columns

94


Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'home_ownership', 'annual_inc',
       'verification_status', 'loan_status', 'pymnt_plan', 'purpose', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint',
       'verification_status_joint', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'open_acc_6m', 'open_il_6m', 'open_il_12m',
       'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util',
       'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
       'total_rev_

### Fitting model

In [8]:
rfc = ensemble.RandomForestClassifier()
X = y2015.drop(['loan_status'], 1)
Y = y2015['loan_status']
X = pd.get_dummies(X)
#There are 400,000+ rows so its ok if we don't impute the columns for missingness. We can just drop them and still have
#rich enough information
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)

array([0.95402626, 0.96380993, 0.96202892, 0.96103156, 0.96060318,
       0.96060318, 0.96050725, 0.96050725, 0.96022039, 0.96026695])

In [9]:
rfc.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
feat_labels = X.columns
clean_col = []
for feature in zip(feat_labels, rfc.feature_importances_):
    if (feature[1]>.009):
        print(feature)
        clean_col.append(feature[0])
remove_loan = ['loan_amnt', 'funded_amnt_inv', 'funded_amnt']
clean_col = [x for x in clean_col if x not in remove_loan]

('loan_amnt', 0.013337085689145836)
('funded_amnt', 0.014375654766305923)
('funded_amnt_inv', 0.013981163958360718)
('installment', 0.017154757786885196)
('out_prncp', 0.20026302160138093)
('out_prncp_inv', 0.29968508605666405)
('total_pymnt', 0.07776455984917977)
('total_pymnt_inv', 0.04571895671044777)
('total_rec_prncp', 0.10992021476989758)
('total_rec_int', 0.03338422546180047)
('recoveries', 0.017379432909123686)
('collection_recovery_fee', 0.0265874404442304)


In [11]:
#No loan_amnt', 'funded_amnt_inv', 'funded_amnt'
print(clean_col)
X_new = y2015.loc[:,clean_col]
X_new.dropna(axis=1, inplace=True)

['installment', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'recoveries', 'collection_recovery_fee']


In [12]:
cross_val_score(rfc, X_new, Y, cv=10)

array([0.79290447, 0.96062786, 0.94621358, 0.9537888 , 0.95089052,
       0.94208027, 0.91324895, 0.93388587, 0.94497352, 0.9602432 ])

In [13]:
feat_labels = X_new.columns
v3_col = []
for feature in zip(feat_labels, rfc.feature_importances_):
    if (feature[1]>.005):
        print(feature)
        v3_col.append(feature[0])


('installment', 0.013337085689145836)
('out_prncp', 0.014375654766305923)
('out_prncp_inv', 0.013981163958360718)
('total_pymnt', 0.007890454342431321)
('total_pymnt_inv', 0.017154757786885196)


In [15]:
%%time
rfc = ensemble.RandomForestClassifier()
X_v3 = y2015.loc[:, v3_col]
X_v3 = pd.get_dummies(X_v3)
X_v3 = pd.concat([X_v3, y2015['out_prncp']], axis=1 )
X_v3.dropna(axis=1, inplace=True)
rfc.fit(X, Y)
cross_val_score(rfc, X_v3, Y, cv=10)

Wall time: 4min 35s
