# Random Forest Classifier

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from pprint import pprint

In [2]:
from scipy.stats import sem

def mean_score(text, scores):
    """Return ``text`` followed by the mean of ``scores`` and its standard error.

    Args:
        text: Label placed in front of the numbers; converted with ``str``.
        scores: Sequence of numeric scores.

    Returns:
        A string of the form ``'<text> <mean> (+/-<sem>)'`` with the mean shown
        to 8 decimal places and the standard error (``scipy.stats.sem``) to 5.
    """
    # Format only the numeric template: if the label were part of the format
    # string, any '{' or '}' inside it would be parsed as a replacement field.
    stats = ' {0:.8f} (+/-{1:.5f})'.format(np.mean(scores), sem(scores))
    return str(text) + stats

In [3]:
def test_forest(forest, df, n_iter):
    """Fit ``forest`` ``n_iter`` times on a 75/25 split of ``df`` and print mean scores.

    The 'SeriousDlqin2yrs' column of ``df`` is used as the target and every
    other column as a feature. Features are standardized with a
    ``StandardScaler`` fitted on the training rows only.

    Args:
        forest: Classifier exposing ``fit``, ``score`` and ``predict_proba``.
            It is re-fitted in place on every repetition.
        df: DataFrame containing the features and the 'SeriousDlqin2yrs' column.
        n_iter: Number of fit/evaluate repetitions.

    Returns:
        None. The mean ROC AUC and mean accuracy (each with its standard
        error) are printed.
    """
    target = 'SeriousDlqin2yrs'

    # random_state is pinned to 1, so the split is the same on every
    # repetition; compute it (and the scaling) once instead of n_iter times.
    # NOTE(review): because the split never changes, the reported +/- spread
    # reflects only the classifier's own randomness, not split variability.
    train, test = train_test_split(df, train_size=0.75, random_state=1)
    y = train[target]
    y_test = test[target]

    scaler = StandardScaler()
    X = scaler.fit_transform(train.drop([target], axis=1))
    # Apply the training-set mean/std to the test rows. Re-fitting the scaler
    # on the test set would scale the two sets with different statistics.
    X_test = scaler.transform(test.drop([target], axis=1))

    auc_scores = []
    accs = []
    for _ in range(n_iter):
        forest = forest.fit(X, y)
        accs.append(forest.score(X_test, y_test))
        # Second predict_proba column; presumably the positive
        # (SeriousDlqin2yrs == 1) class -- confirm against forest.classes_.
        probas = forest.predict_proba(X_test)[:, 1]
        auc_scores.append(roc_auc_score(y_test, probas))

    print(mean_score('roc auc score: ', auc_scores))
    print(mean_score('accuracy: ', accs))

In [4]:
# Load the cleaned training set, then discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- confirm).
df = pd.read_csv('./data/cs-train_clean.csv')
df = df.drop(['Unnamed: 0'], axis=1)

# One forest instance, reused by the evaluation cells that follow.
forest = RandomForestClassifier(n_jobs=-1, n_estimators=100)

In [49]:
# Evaluate the forest on the cleaned data, averaging over 10 repetitions.
test_forest(forest, df, n_iter=10)

roc auc score:  0.83709276 (+/-0.00154)
accuracy:  0.93463729 (+/-0.00064)


Features sorted by importance:

In [None]:
# Column positions in df, in the hand-recorded importance order.
# NOTE(review): this order is hard-coded, not computed from the fitted forest
# in this cell -- re-derive it if the model or data change.
importance_order = [1, 4, 5, 2, 6, 7, 3, 9, 8, 10]
for position in importance_order:
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(df.columns[position])

RevolvingUtilizationOfUnsecuredLines
DebtRatio
MonthlyIncome
age
NumberOfOpenCreditLinesAndLoans
NumberOfTimes90DaysLate
NumberOfTime30-59DaysPastDueNotWorse
NumberOfTime60-89DaysPastDueNotWorse
NumberRealEstateLoansOrLines
NumberOfDependents


## Cropped Data

In [5]:
# Repeat the evaluation on the cropped variant of the training data.
df_cropped = pd.read_csv('./data/cs-train_cropped.csv')
test_forest(forest, df_cropped, n_iter=10)

roc auc score:  0.82345631 (+/-0.00373)
accuracy:  0.93084780 (+/-0.00112)


## Oversampled Data

In [None]:
# Repeat the evaluation on the oversampled variant of the training data.
# NOTE(review): test_forest splits train/test after this file is loaded; if
# the oversampling duplicated rows, copies may land on both sides of the
# split -- confirm how the file was produced.
df_oversampled = pd.read_csv('./data/cs-train_oversampled.csv')
test_forest(forest, df_oversampled, n_iter=10)

## Undersampled Data

In [None]:
# Repeat the evaluation on the undersampled variant of the training data.
df_undersampled = pd.read_csv('./data/cs-train_undersampled.csv')
test_forest(forest, df_undersampled, n_iter=10)