In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [51]:
# Point this at wherever your copy of the data file lives.
# header=1 reads the column names from the second line of the file;
# skipinitialspace=True discards spaces that follow a delimiter.
y2015 = pd.read_csv(
    'LoanStats3d.csv',
    header=1,
    skipinitialspace=True,
)

# Note the warning about dtypes.

  interactivity=interactivity, compiler=compiler, result=result)


In [52]:
# Peek at the frame, then list every distinct loan_status label.
# (head() is not the cell's last expression, so nothing is rendered for it.)
y2015.head()
print(y2015.loan_status.unique())
# list(y2015)  # uncomment to list every column name

['Current' 'Fully Paid' 'Charged Off' 'Late (31-120 days)'
 'In Grace Period' 'Default' 'Late (16-30 days)' nan]


### The Blind Approach
Now, as we've seen before, creating a model is the easy part. Let's try just using everything we've got and throwing it, without much thought, into a Random Forest. scikit-learn requires the independent variables to be numeric, and all we want is dummy variables, so let's use get_dummies from pandas and see what happens with this kind of naive approach.

In [53]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

# Naive "use everything" attempt, intentionally left commented out:
# one-hot encoding every raw column with get_dummies is very memory
# intensive when columns are poorly typed.
# rfc = ensemble.RandomForestClassifier()
# X = y2015.drop('loan_status', 1)
# Y = y2015['loan_status']
# X = pd.get_dummies(X)

# cross_val_score(rfc, X, Y, cv=5)

### Data Cleaning
Well, get_dummies can be a very memory-intensive thing, particularly if data are typed poorly. We got a warning about that earlier. Mixed data types get converted to objects, and that could create huge problems. Our dataset is about 400,000 rows. If there's a bad type there, it's going to see 400,000 distinct values and try to create dummies for all of them. That's bad. Let's look at all our categorical variables and see how many distinct counts there are...

In [54]:
# Disabled diagnostic: for every object-typed column, print its name
# followed by its number of distinct values.
# categorical = y2015.select_dtypes(include=['object'])
# for i in categorical:
#     column = categorical[i]
#     print(i)
#     print(column.nunique())

Well, that right there is what's called a problem. Some of these have over a hundred thousand distinct values. Let's drop the ones with over 30 unique values, converting to numeric where it makes sense. In doing this, a lot of code gets written just to see if the numeric conversion makes sense. It's a manual process that we'll abstract away, and we'll just include the conversion.
You could extract numeric features from the dates, but here we'll just drop them. There's a lot of data, so it shouldn't be a huge problem.

In [55]:
# Convert ID and Interest Rate to numeric.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
y2015['id'] = pd.to_numeric(y2015['id'], errors='coerce')
# Strip the '%' sign so the interest rate parses as a plain number.
y2015['int_rate'] = pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique values.
# The axis is passed by keyword: the positional form `drop(labels, 1)` is
# deprecated and is rejected by pandas 2.x. Reassigning instead of using
# inplace=True keeps the cell's data lineage explicit.
y2015 = y2015.drop(
    ['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
     'sub_grade', 'addr_state', 'desc'],
    axis=1,
)

In [56]:
#y2015.tail()

In [57]:
# The file ends with two summary rows that hold no actual loan data;
# keep everything except those last two rows.
y2015 = y2015.iloc[:-2]

In [40]:
# Sanity-check the cleaned frame: its dimensions, then the distinct
# values found in policy_code.
#pd.get_dummies(y2015)
print(y2015.shape)
y2015.head()  # not the cell's last expression, so nothing is displayed for it
print(y2015.policy_code.unique())


(421095, 103)
[ 1.]


In [42]:
# Build the model inputs from the cleaned frame.
# A fixed random_state makes the forest (and every cross-validation score
# computed with it) reproducible from run to run.
rfc = ensemble.RandomForestClassifier(random_state=0)

# Features are every column except the target. The axis is passed by keyword
# because the positional form `drop(label, 1)` is deprecated and is rejected
# by pandas 2.x.
X = y2015.drop('loan_status', axis=1)
Y = y2015['loan_status']

# One-hot encode the remaining categorical columns, then discard every
# column that still contains a missing value.
X_dummies = pd.get_dummies(X)
X_dummies = X_dummies.dropna(axis=1)
print(X_dummies.shape)

#cross_val_score(rfc, X_dummies, Y, cv=10)

(421095, 202)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,...,last_credit_pull_d_Nov-2016,last_credit_pull_d_Oct-2015,last_credit_pull_d_Oct-2016,last_credit_pull_d_Sep-2015,last_credit_pull_d_Sep-2016,application_type_INDIVIDUAL,application_type_JOINT,verification_status_joint_Not Verified,verification_status_joint_Source Verified,verification_status_joint_Verified
0,68009401.0,72868139.0,16000.0,16000.0,16000.0,14.85,379.39,48000.0,33.18,0.0,...,0,0,0,0,0,1,0,0,0,0
1,68354783.0,73244544.0,9600.0,9600.0,9600.0,7.49,298.58,60000.0,22.44,0.0,...,0,0,0,0,0,1,0,0,0,0
2,68466916.0,73356753.0,25000.0,25000.0,25000.0,7.49,777.55,109000.0,26.02,0.0,...,0,0,0,0,0,1,0,0,0,0
3,68466961.0,73356799.0,28000.0,28000.0,28000.0,6.49,858.05,92000.0,21.6,0.0,...,0,0,0,0,0,1,0,0,0,0
4,68495092.0,73384866.0,8650.0,8650.0,8650.0,19.89,320.99,55000.0,25.49,0.0,...,0,0,0,0,0,1,0,0,0,0


### DRILL: Third Attempt
So here's your task. Get rid of as much data as possible without dropping below an average of 90% accuracy in a 10-fold cross validation.
You'll want to do a few things in this process. First, dive into the data that we have and see which features are most important. This can be the raw features or the generated dummies. You may want to use PCA or correlation matrices.
Can you do it without using anything related to payment amount or outstanding principal? How do you know?
Once you've taken a stab at this, review this example solution.

In [47]:
# Run PCA on the dummy-encoded features and score the random forest on the
# reduced representation with 10-fold cross-validation.
# PCA's n_components accepts either an integer (the number of components to
# keep) or a float strictly between 0 and 1 (the fraction of variance to
# retain, computed with the full SVD solver).
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# The unused `matplotlib.mlab.PCA` import was removed from this cell: that
# class no longer exists as of Matplotlib 3.1, so importing it raises
# ImportError.

sklearn_pca = PCA(n_components=10)
Y_sklearn = sklearn_pca.fit_transform(X_dummies)

# NOTE(review): the features are not standardized before PCA (StandardScaler
# is imported but never applied), so large-magnitude columns presumably
# dominate the components. Also, PCA is fitted on all rows before the
# folds are split. Confirm both are intended.
cross_val_score(rfc, Y_sklearn, Y, cv=10)


array([ 0.70057705,  0.75669065,  0.75175132,  0.75357982,  0.64419378,
        0.63436238,  0.52917429,  0.54211689,  0.53962524,  0.57654491])

In [48]:
# Same experiment as before, now keeping 20 principal components.
sklearn_pca = PCA(n_components=20)
Y_sklearn = sklearn_pca.fit_transform(X_dummies)

# Ten-fold cross-validated accuracy of the forest on the projected data.
cross_val_score(
    estimator=rfc,
    X=Y_sklearn,
    y=Y,
    cv=10,
)

array([ 0.8300919 ,  0.95575978,  0.95799197,  0.95594975,  0.95818095,
        0.95395393,  0.95426156,  0.95067563,  0.95701427,  0.91447775])

In [49]:
# Same experiment once more, this time keeping 25 principal components.
sklearn_pca = PCA(n_components=25)
Y_sklearn = sklearn_pca.fit_transform(X_dummies)

# Ten-fold cross-validated accuracy of the forest on the projected data.
cross_val_score(
    estimator=rfc,
    X=Y_sklearn,
    y=Y,
    cv=10,
)

array([ 0.80171452,  0.95860939,  0.95808696,  0.95958301,  0.96338162,
        0.96143434,  0.96157591,  0.95910613,  0.95701427,  0.93668361])

### By using PCA, I was able to reduce the 202 dummy-encoded features to 20 principal components and still maintain an average of over 90% accuracy (about 93.9%) in 10-fold cross-validation.