In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.feature_selection import SelectFromModel
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, auc, classification_report

In [None]:
data = pd.read_csv('./data/data_processed.csv')

data['target'] = data['readmitted'].isin(['<30']).astype(int)

data = data.drop(columns='readmitted')

In [6]:
X = data.drop(columns = 'target')
y = data['target']

X['strat_col'] = X['race'].astype(str) + "_" + data['target'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)
X_train = X_train.drop(columns = ['strat_col' , 'race'])
X_test = X_test.drop(columns = ['strat_col' , 'race'])

In [7]:
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score,roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier

We begin by seeing how random forest does with default parameters.

In [None]:
rf = RandomForestClassifier()

n_splits = 5
current_split = 0
metrics={n:[] for n in range(0,n_splits)}

kfold = KFold(n_splits, shuffle = True)

for s_train, s_test in kfold.split(X_train, y_train):

    train_features = X_train.iloc[s_train]
    test_features = X_train.iloc[s_test]

    train_target = y_train.iloc[s_train]
    test_target = y_train.iloc[s_test]

    rf.fit(train_features, train_target)
    metrics[current_split].append(accuracy_score(rf.predict(test_features), test_target))
    metrics[current_split].append(confusion_matrix(rf.predict(test_features),test_target))

    probs = rf.predict_proba(test_features)
    #y_pred = (probs[:,1]>=0.5)

    false_pos_rate, true_pos_rate, threshold = roc_curve(test_target, probs[:,1])

    metrics[current_split].append(auc(false_pos_rate, true_pos_rate))

    current_split += 1

for k in range(0,5):
    print('The accuracy of split ', k, ' is ', metrics[k][0],' the confusion matrix is ', metrics[k][1], ' and the auc is ', metrics[k][2])

The accuracy of split  0  is  0.8845411249839105  the confusion matrix is  [[13660  1706]
 [   88    84]]  and the auc is  0.7917213148728185
The accuracy of split  1  is  0.8862714809808844  the confusion matrix is  [[13679  1671]
 [   96    91]]  and the auc is  0.7963594414036188
The accuracy of split  2  is  0.8855634935959323  the confusion matrix is  [[13660  1688]
 [   90    99]]  and the auc is  0.7937245154397925
The accuracy of split  3  is  0.8859496685331789  the confusion matrix is  [[13684  1669]
 [  103    81]]  and the auc is  0.7891138650281321
The accuracy of split  4  is  0.8836326189096995  the confusion matrix is  [[13650  1733]
 [   75    79]]  and the auc is  0.7948615383378167


With default parameters, we see random forest has an auc score of roughly .79. We now see if we can improve this with different choices of parameter. We begin by exploring the effect on auc of increasing the number of estimators for random forest.

In [9]:
gsearch_n_estimators = GridSearchCV(RandomForestClassifier(), # first put the model object here
                          param_grid = {'n_estimators':[250, 500, 1000]}, # and n_estimators here
                          scoring = 'roc_auc', # put the score we are trying to optimize here 
                          cv = 5) # put the number of cv splits here

gsearch_n_estimators.fit(X_train,y_train)

In [10]:
print('Best AUC score: ', gsearch_n_estimators.best_score_)
print('Best parameters: ', gsearch_n_estimators.best_params_)

Best AUC score:  0.7987757998380152
Best parameters:  {'n_estimators': 1000}


In [11]:
gsearch_n_estimators.cv_results_

{'mean_fit_time': array([ 27.22330966,  55.18220601, 107.06148558]),
 'std_fit_time': array([0.82448514, 2.16333533, 2.00390769]),
 'mean_score_time': array([0.88545756, 1.73260474, 3.17166777]),
 'std_score_time': array([0.138949  , 0.29357878, 0.10416987]),
 'param_n_estimators': masked_array(data=[250, 500, 1000],
              mask=[False, False, False],
        fill_value=999999),
 'params': [{'n_estimators': 250},
  {'n_estimators': 500},
  {'n_estimators': 1000}],
 'split0_test_score': array([0.78930283, 0.79443066, 0.79619557]),
 'split1_test_score': array([0.79751904, 0.79773349, 0.80075314]),
 'split2_test_score': array([0.79414008, 0.79882909, 0.79852915]),
 'split3_test_score': array([0.79219   , 0.79551871, 0.79557596]),
 'split4_test_score': array([0.80056968, 0.80115539, 0.80282518]),
 'mean_test_score': array([0.79474433, 0.79753347, 0.7987758 ]),
 'std_test_score': array([0.00395306, 0.00238846, 0.00273001]),
 'rank_test_score': array([3, 2, 1], dtype=int32)}

Changing n_estimators doesn't seem to have much effect on auc. So moving forward, we just use n_estimators = 1000.

Nest we try to understand how changing max_depth effects auc.

In [12]:
gsearch_max_depth = GridSearchCV(RandomForestClassifier(n_estimators=1000), # first put the model object here
                          param_grid = {'max_depth':[5,7,10]}, # and n_estimators here
                          scoring = 'roc_auc', # put the score we are trying to optimize here 
                          cv = 5) # put the number of cv splits here

gsearch_max_depth.fit(X_train,y_train)

In [13]:
print('Best AUC score: ', gsearch_max_depth.best_score_)
print('Best parameters: ', gsearch_max_depth.best_params_)

Best AUC score:  0.795626470039051
Best parameters:  {'max_depth': 10}


In [14]:
gsearch_max_depth.cv_results_

{'mean_fit_time': array([27.8115871 , 36.62850637, 52.80447364]),
 'std_fit_time': array([1.04186103, 0.53188646, 2.02841384]),
 'mean_score_time': array([0.48786221, 0.65830765, 1.0436986 ]),
 'std_score_time': array([0.01070829, 0.02412128, 0.07442586]),
 'param_max_depth': masked_array(data=[5, 7, 10],
              mask=[False, False, False],
        fill_value=999999),
 'params': [{'max_depth': 5}, {'max_depth': 7}, {'max_depth': 10}],
 'split0_test_score': array([0.76837812, 0.78009072, 0.79197499]),
 'split1_test_score': array([0.76880832, 0.78214858, 0.79468365]),
 'split2_test_score': array([0.7729239 , 0.78467211, 0.79653365]),
 'split3_test_score': array([0.76743145, 0.78071107, 0.79253034]),
 'split4_test_score': array([0.78038588, 0.79116291, 0.80240972]),
 'mean_test_score': array([0.77158553, 0.78375708, 0.79562647]),
 'std_test_score': array([0.00478527, 0.0040247 , 0.00375882]),
 'rank_test_score': array([3, 2, 1], dtype=int32)}

In [15]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=10)
n_splits = 5
current_split = 0
metrics={n:[] for n in range(0,n_splits)}

kfold = KFold(n_splits, shuffle = True)

for s_train, s_test in kfold.split(X_train, y_train):

    train_features = X_train.iloc[s_train]
    test_features = X_train.iloc[s_test]

    train_target = y_train.iloc[s_train]
    test_target = y_train.iloc[s_test]

    rf.fit(train_features, train_target)
    metrics[current_split].append(accuracy_score(rf.predict(test_features), test_target))
    metrics[current_split].append(confusion_matrix(rf.predict(test_features),test_target))

    probs = rf.predict_proba(test_features)
    #y_pred = (probs[:,1]>=0.5)

    false_pos_rate, true_pos_rate, threshold = roc_curve(test_target, probs[:,1])

    metrics[current_split].append(auc(false_pos_rate, true_pos_rate))

    current_split += 1

for k in range(0,5):
    print('The accuracy of split ', k, ' is ', metrics[k][0],' the confusion matrix is ', metrics[k][1], ' and the auc is ', metrics[k][2])

The accuracy of split  0  is  0.8888531342515125  the confusion matrix is  [[13795  1713]
 [   14    16]]  and the auc is  0.7957566253071472
The accuracy of split  1  is  0.8875587307717062  the confusion matrix is  [[13769  1735]
 [   12    21]]  and the auc is  0.801770586719459
The accuracy of split  2  is  0.8851129561691446  the confusion matrix is  [[13732  1768]
 [   17    20]]  and the auc is  0.7962864250611352
The accuracy of split  3  is  0.8831820814829118  the confusion matrix is  [[13701  1792]
 [   23    21]]  and the auc is  0.7917847525313071
The accuracy of split  4  is  0.8834395314410761  the confusion matrix is  [[13703  1792]
 [   19    23]]  and the auc is  0.7949716989427607


In [16]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=15)
n_splits = 5
current_split = 0
metrics={n:[] for n in range(0,n_splits)}

kfold = KFold(n_splits, shuffle = True)

for s_train, s_test in kfold.split(X_train, y_train):

    train_features = X_train.iloc[s_train]
    test_features = X_train.iloc[s_test]

    train_target = y_train.iloc[s_train]
    test_target = y_train.iloc[s_test]

    rf.fit(train_features, train_target)
    metrics[current_split].append(accuracy_score(rf.predict(test_features), test_target))
    metrics[current_split].append(confusion_matrix(rf.predict(test_features),test_target))

    probs = rf.predict_proba(test_features)
    #y_pred = (probs[:,1]>=0.5)

    false_pos_rate, true_pos_rate, threshold = roc_curve(test_target, probs[:,1])

    metrics[current_split].append(auc(false_pos_rate, true_pos_rate))

    current_split += 1

for k in range(0,5):
    print('The accuracy of split ', k, ' is ', metrics[k][0],' the confusion matrix is ', metrics[k][1], ' and the auc is ', metrics[k][2])

The accuracy of split  0  is  0.880293474063586  the confusion matrix is  [[13637  1829]
 [   31    41]]  and the auc is  0.8018902029644166
The accuracy of split  1  is  0.8849198687005213  the confusion matrix is  [[13710  1756]
 [   32    39]]  and the auc is  0.804443121933896
The accuracy of split  2  is  0.8874300057926241  the confusion matrix is  [[13750  1713]
 [   36    38]]  and the auc is  0.8027997596946322
The accuracy of split  3  is  0.8857565810645556  the confusion matrix is  [[13733  1740]
 [   35    29]]  and the auc is  0.8048312272598425
The accuracy of split  4  is  0.8898757803951857  the confusion matrix is  [[13775  1665]
 [   46    51]]  and the auc is  0.8070595504391901
