# Random Forest Classifier

In [7]:
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from print_scores import print_scores

In [8]:
def random_forest_cv(df, n_iter, n_estimators, random_state=None):
    """Cross-validate a random forest on ``df`` and print accuracy and ROC AUC.

    Every column except ``SeriousDlqin2yrs`` is used as a feature;
    ``SeriousDlqin2yrs`` is the target.  Standardisation happens inside a
    pipeline, so the scaler is fitted only on the training portion of each
    fold and never sees the held-out rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing a ``SeriousDlqin2yrs`` column.
    n_iter : int or cross-validation generator
        Forwarded unchanged to ``cross_val_score`` as its ``cv`` argument
        (for an integer, the number of folds).
    n_estimators : int
        Number of trees in the forest.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier``.  The default ``None``
        leaves the forest unseeded, as before.

    Returns
    -------
    None
        The scores are reported by printing the result of ``print_scores``.
    """
    target = 'SeriousDlqin2yrs'
    # Plain array for X, as the estimator received before; y stays a Series.
    X = df.drop(target, axis=1).values
    y = df[target]

    # cross_val_score clones the estimator for every fold, so one pipeline
    # instance can safely be reused for both metrics.
    model = make_pipeline(
        StandardScaler(),
        RandomForestClassifier(n_estimators=n_estimators,
                               random_state=random_state),
    )

    for label, scoring in (("Accuracy: ", 'accuracy'), ("ROC AUC: ", 'roc_auc')):
        scores = cross_val_score(model, X, y, scoring=scoring, cv=n_iter,
                                 n_jobs=-1)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(print_scores(label, scores))

In [9]:
# Load the cleaned training set, then discard the 'Unnamed: 0' column
# (presumably a saved row index -- confirm against the CSV).
df = pd.read_csv('./data/cs-train_clean.csv')
df = df.drop(['Unnamed: 0'], axis=1)

# n_iter is handed to cross_val_score as ``cv``; n_estimators is the tree count.
random_forest_cv(df, n_iter=10, n_estimators=100)

Accuracy:  0.93570641 (+/-0.00127)
ROC AUC:  0.84033864 (+/-0.00531)


The Random Forest Classifier also gives us feature importances. The features, listed in order of importance:

In [11]:
# Positions in ``df.columns``, in the order the features are reported.
# NOTE(review): the ranking is hard-coded here rather than read from a fitted
# model (presumably copied from an earlier ``feature_importances_`` run), so it
# has to be updated by hand if the data or the model changes.
importance_order = (1, 4, 5, 2, 6, 7, 3, 9, 8, 10)

for column_index in importance_order:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(df.columns[column_index])

RevolvingUtilizationOfUnsecuredLines
DebtRatio
MonthlyIncome
age
NumberOfOpenCreditLinesAndLoans
NumberOfTimes90DaysLate
NumberOfTime30-59DaysPastDueNotWorse
NumberOfTime60-89DaysPastDueNotWorse
NumberRealEstateLoansOrLines
NumberOfDependents


## Cropped Data

In [10]:
# Same evaluation on the cropped training set.
cropped_csv = './data/cs-train_cropped.csv'
df_cropped = pd.read_csv(cropped_csv)
random_forest_cv(df_cropped, n_iter=10, n_estimators=100)

Accuracy:  0.84794846 (+/-0.25748)
ROC AUC:  0.77564735 (+/-0.17118)


## Oversampled Data

In [12]:
# Same evaluation on the oversampled training set.
oversampled_csv = './data/cs-train_oversampled.csv'
df_oversampled = pd.read_csv(oversampled_csv)
random_forest_cv(df_oversampled, n_iter=10, n_estimators=100)

Accuracy:  0.78727052 (+/-0.17876)
ROC AUC:  0.99710297 (+/-0.00591)


## Undersampled Data

In [11]:
# Same evaluation on the undersampled training set.
undersampled_csv = './data/cs-train_undersampled.csv'
df_undersampled = pd.read_csv(undersampled_csv)
random_forest_cv(df_undersampled, n_iter=10, n_estimators=100)

Accuracy:  0.63133705 (+/-0.12076)
ROC AUC:  0.66327441 (+/-0.19364)
