In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

In [2]:
# load data from disk
X = np.load('../hispanic_colrect/X.npy')
Y1 = np.load('../hispanic_colrect/Y1.npy')
Y2 = np.load('../hispanic_colrect/Y2.npy')
Y5 = np.load('../hispanic_colrect/Y5.npy')

In [3]:
# shuffle
np.random.seed(97)
idx = np.random.permutation(len(X))
X = X[idx]
Y = Y5[idx]

# split into training and test sets
TEST_SET_SIZE = 6000
X_train, X_test = X[:-TEST_SET_SIZE], X[-TEST_SET_SIZE:]
Y_train, Y_test = Y[:-TEST_SET_SIZE].astype(int), Y[-TEST_SET_SIZE:].astype(int)

In [7]:
X[0]

array([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [38]:
len(np.where(Y==0)[0])/len(Y)

0.2372769049909159

# Feature Scaling
Fit scaler based on training data, then transform both the training and test data.

In [8]:
# feature scaling: scale features based on training data only
from sklearn.preprocessing import StandardScaler, MinMaxScaler
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)



# Results Function

In [147]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier):
    Y_pred_test = classifier.predict(std_scaler.transform(X_test))
    print("Test accuracy score: " + str(accuracy_score(Y_test.astype(int), Y_pred_test)))
    print("ROC: " + str(roc_auc_score(Y_test.astype(int), Y_pred_test)))
    matrix = confusion_matrix(Y_test.astype(int), Y_pred_test)
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

# Imports

In [122]:
from sklearn.model_selection import GridSearchCv

ImportError: cannot import name 'GridSearchCv'

## Decision Tree

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# min_sample_split: 300,400
# min_samples_leaf: 200
# max_depth: 130
# min_weight_fraction_leaf: .01
param_grid = [{'max_depth':[130,200], 'min_samples_leaf':[200,400]}]

from sklearn.cross_validation import ShuffleSplit
tree_clf_reg = DecisionTreeClassifier()
grid_search = GridSearchCV(tree_clf_reg, param_grid, cv=5, scoring="accuracy", verbose=3)
grid_search.fit(X_train, Y_train.astype(int))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] max_depth=130, min_samples_leaf=200 .............................
[CV]  max_depth=130, min_samples_leaf=200, score=0.8087766609463104, total=   1.3s
[CV] max_depth=130, min_samples_leaf=200 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]  max_depth=130, min_samples_leaf=200, score=0.8087532180948879, total=   1.2s
[CV] max_depth=130, min_samples_leaf=200 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.7s remaining:    0.0s


[CV]  max_depth=130, min_samples_leaf=200, score=0.8186833394630378, total=   1.3s
[CV] max_depth=130, min_samples_leaf=200 .............................
[CV]  max_depth=130, min_samples_leaf=200, score=0.8078950594581341, total=   1.4s
[CV] max_depth=130, min_samples_leaf=200 .............................
[CV]  max_depth=130, min_samples_leaf=200, score=0.8113045610593428, total=   1.3s
[CV] max_depth=130, min_samples_leaf=400 .............................
[CV]  max_depth=130, min_samples_leaf=400, score=0.8115959794067173, total=   1.3s
[CV] max_depth=130, min_samples_leaf=400 .............................
[CV]  max_depth=130, min_samples_leaf=400, score=0.8093661885497119, total=   1.3s
[CV] max_depth=130, min_samples_leaf=400 .............................
[CV]  max_depth=130, min_samples_leaf=400, score=0.8196640921907564, total=   1.3s
[CV] max_depth=130, min_samples_leaf=400 .............................
[CV]  max_depth=130, min_samples_leaf=400, score=0.8083854358219934, total= 

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   29.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [130, 200], 'min_samples_leaf': [200, 400]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3)

In [39]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(grid_search.best_params_))

0.811082505823 {'max_depth': 130, 'min_samples_leaf': 200}
0.811327694005 {'max_depth': 130, 'min_samples_leaf': 400}
0.811082505823 {'max_depth': 200, 'min_samples_leaf': 200}
0.811327694005 {'max_depth': 200, 'min_samples_leaf': 400}
Best: {'max_depth': 130, 'min_samples_leaf': 400}


In [40]:
results(grid_search)

0.808166666667
ROC: 0.699151567226
[[ 704  733]
 [ 418 4145]]
PPV: 0.849733497335
NPV: 0.627450980392
Sensitivity: 0.908393600701
Specificity: 0.489909533751


## K-Nearest Neighbors

In [86]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, Y_train.astype(int))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [87]:
results(knn_clf)

Test accuracy score: 0.7935
ROC: 0.675444953669
PPV: 0.838630806846
NPV: 0.590659340659
Sensitivity: 0.902038132807
Specificity: 0.44885177453
Confusion matrix:
[[ 645  792]
 [ 447 4116]]


## Logistic Regression

In [125]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = [{'C':[.01, .1, .5]}]
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, 
                              scoring='accuracy', verbose=5
                             )
lr_grid_search.fit(X_train, Y_train.astype(int))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.8184604069624908, total=   3.1s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s remaining:    0.0s


[CV] ................. C=0.01, score=0.8186833394630378, total=   2.8s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.0s remaining:    0.0s


[CV] ................. C=0.01, score=0.8238322912835602, total=   3.3s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    9.3s remaining:    0.0s


[CV] ................. C=0.01, score=0.8163540517347064, total=   3.4s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   12.8s remaining:    0.0s


[CV] ................. C=0.01, score=0.8120402157920549, total=   3.6s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8173571953910272, total=   5.2s
[CV] C=0.1 ...........................................................
[CV] ................... C=0.1, score=0.818560745372073, total=   5.1s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8228515385558416, total=   6.1s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8156184871889175, total=   5.9s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8117949975478176, total=   5.4s
[CV] C=0.5 ...........................................................
[CV] .................. C=0.5, score=0.8172346163275312, total=  11.2s
[CV] C=0.5 ...........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.01, 0.1, 0.5]}], pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring='accuracy', verbose=5)

In [126]:
cvres = lr_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(lr_grid_search.best_params_))

0.817874218463 {'C': 0.01}
0.81723672919 {'C': 0.1}
0.81708961628 {'C': 0.5}
Best: {'C': 0.01}


In [127]:
results(lr_grid_search)

Test accuracy score: 0.765833333333
ROC: 0.739731213715
PPV: 0.88987654321
NPV: 0.508205128205
Sensitivity: 0.78983125137
Specificity: 0.689631176061
Confusion matrix:
[[ 991  446]
 [ 959 3604]]


In [88]:
lr = LogisticRegression(C=.1)
lr.fit(X_train, Y_train.astype(int))
results(lr)

Test accuracy score: 0.7505
ROC: 0.740853520442
PPV: 0.896739130435
NPV: 0.48595505618
Sensitivity: 0.759368836292
Specificity: 0.722338204593
Confusion matrix:
[[1038  399]
 [1098 3465]]


## Bagging

In [84]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        max_samples=100, bootstrap=True, 
        verbose=3
)

bag_clf.fit(X_train, Y_train)

Building estimator 1 of 500 for this parallel run (total 500)...
Building estimator 2 of 500 for this parallel run (total 500)...
Building estimator 3 of 500 for this parallel run (total 500)...
Building estimator 4 of 500 for this parallel run (total 500)...
Building estimator 5 of 500 for this parallel run (total 500)...
Building estimator 6 of 500 for this parallel run (total 500)...
Building estimator 7 of 500 for this parallel run (total 500)...
Building estimator 8 of 500 for this parallel run (total 500)...
Building estimator 9 of 500 for this parallel run (total 500)...
Building estimator 10 of 500 for this parallel run (total 500)...
Building estimator 11 of 500 for this parallel run (total 500)...
Building estimator 12 of 500 for this parallel run (total 500)...
Building estimator 13 of 500 for this parallel run (total 500)...
Building estimator 14 of 500 for this parallel run (total 500)...
Building estimator 15 of 500 for this parallel run (total 500)...
Building estimator 

Building estimator 125 of 500 for this parallel run (total 500)...
Building estimator 126 of 500 for this parallel run (total 500)...
Building estimator 127 of 500 for this parallel run (total 500)...
Building estimator 128 of 500 for this parallel run (total 500)...
Building estimator 129 of 500 for this parallel run (total 500)...
Building estimator 130 of 500 for this parallel run (total 500)...
Building estimator 131 of 500 for this parallel run (total 500)...
Building estimator 132 of 500 for this parallel run (total 500)...
Building estimator 133 of 500 for this parallel run (total 500)...
Building estimator 134 of 500 for this parallel run (total 500)...
Building estimator 135 of 500 for this parallel run (total 500)...
Building estimator 136 of 500 for this parallel run (total 500)...
Building estimator 137 of 500 for this parallel run (total 500)...
Building estimator 138 of 500 for this parallel run (total 500)...
Building estimator 139 of 500 for this parallel run (total 500

Building estimator 248 of 500 for this parallel run (total 500)...
Building estimator 249 of 500 for this parallel run (total 500)...
Building estimator 250 of 500 for this parallel run (total 500)...
Building estimator 251 of 500 for this parallel run (total 500)...
Building estimator 252 of 500 for this parallel run (total 500)...
Building estimator 253 of 500 for this parallel run (total 500)...
Building estimator 254 of 500 for this parallel run (total 500)...
Building estimator 255 of 500 for this parallel run (total 500)...
Building estimator 256 of 500 for this parallel run (total 500)...
Building estimator 257 of 500 for this parallel run (total 500)...
Building estimator 258 of 500 for this parallel run (total 500)...
Building estimator 259 of 500 for this parallel run (total 500)...
Building estimator 260 of 500 for this parallel run (total 500)...
Building estimator 261 of 500 for this parallel run (total 500)...
Building estimator 262 of 500 for this parallel run (total 500

Building estimator 371 of 500 for this parallel run (total 500)...
Building estimator 372 of 500 for this parallel run (total 500)...
Building estimator 373 of 500 for this parallel run (total 500)...
Building estimator 374 of 500 for this parallel run (total 500)...
Building estimator 375 of 500 for this parallel run (total 500)...
Building estimator 376 of 500 for this parallel run (total 500)...
Building estimator 377 of 500 for this parallel run (total 500)...
Building estimator 378 of 500 for this parallel run (total 500)...
Building estimator 379 of 500 for this parallel run (total 500)...
Building estimator 380 of 500 for this parallel run (total 500)...
Building estimator 381 of 500 for this parallel run (total 500)...
Building estimator 382 of 500 for this parallel run (total 500)...
Building estimator 383 of 500 for this parallel run (total 500)...
Building estimator 384 of 500 for this parallel run (total 500)...
Building estimator 385 of 500 for this parallel run (total 500

Building estimator 494 of 500 for this parallel run (total 500)...
Building estimator 495 of 500 for this parallel run (total 500)...
Building estimator 496 of 500 for this parallel run (total 500)...
Building estimator 497 of 500 for this parallel run (total 500)...
Building estimator 498 of 500 for this parallel run (total 500)...
Building estimator 499 of 500 for this parallel run (total 500)...
Building estimator 500 of 500 for this parallel run (total 500)...


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.5min finished


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=False, bootstrap_features=False, max_features=1.0,
         max_samples=100, n_estimators=500, n_jobs=1, oob_score=False,
         random_state=None, verbose=3, warm_start=False)

In [None]:
# probably add cross-validation?

In [85]:
results(bag_clf)

Test accuracy score: 0.795666666667
ROC: 0.600829399769
PPV: 0.800143910775
NPV: 0.739229024943
Sensitivity: 0.97479728249
Specificity: 0.226861517049
Confusion matrix:
[[ 326 1111]
 [ 115 4448]]


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.7s finished


## Random Forest

In [141]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
#scoring=make_scorer(roc_auc_score)

param_grid = [{'max_depth':[10,12,15], 'max_leaf_nodes':[15,20,25]}]
rf_grid_search = GridSearchCV(RandomForestClassifier(), 
            param_grid, cv=3, scoring='accuracy', 
            verbose=5
)
rf_grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=10, max_leaf_nodes=15 .................................
[CV]  max_depth=10, max_leaf_nodes=15, score=0.8068549573403943, total=   0.7s
[CV] max_depth=10, max_leaf_nodes=15 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  max_depth=10, max_leaf_nodes=15, score=0.8085325487311512, total=   0.7s
[CV] max_depth=10, max_leaf_nodes=15 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.6s remaining:    0.0s


[CV]  max_depth=10, max_leaf_nodes=15, score=0.805870236869207, total=   0.7s
[CV] max_depth=10, max_leaf_nodes=20 .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.4s remaining:    0.0s


[CV]  max_depth=10, max_leaf_nodes=20, score=0.8107531626949103, total=   0.7s
[CV] max_depth=10, max_leaf_nodes=20 .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.2s remaining:    0.0s


[CV]  max_depth=10, max_leaf_nodes=20, score=0.8032364840014711, total=   0.8s
[CV] max_depth=10, max_leaf_nodes=20 .................................
[CV]  max_depth=10, max_leaf_nodes=20, score=0.8089598352214212, total=   0.7s
[CV] max_depth=10, max_leaf_nodes=25 .................................
[CV]  max_depth=10, max_leaf_nodes=25, score=0.8001618122977346, total=   0.8s
[CV] max_depth=10, max_leaf_nodes=25 .................................
[CV]  max_depth=10, max_leaf_nodes=25, score=0.8104450165502023, total=   0.9s
[CV] max_depth=10, max_leaf_nodes=25 .................................
[CV]  max_depth=10, max_leaf_nodes=25, score=0.8109460055907017, total=   1.0s
[CV] max_depth=12, max_leaf_nodes=15 .................................
[CV]  max_depth=12, max_leaf_nodes=15, score=0.8015592821418064, total=   0.7s
[CV] max_depth=12, max_leaf_nodes=15 .................................
[CV]  max_depth=12, max_leaf_nodes=15, score=0.8107392423685178, total=   0.6s
[CV] max_depth=12, ma

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   22.1s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [10, 12, 15], 'max_leaf_nodes': [15, 20, 25]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=5)

In [142]:
results(rf_grid_search)

Test accuracy score: 0.8035
ROC: 0.632438599116
PPV: 0.81426448737
NPV: 0.709415584416
Sensitivity: 0.96077142231
Specificity: 0.304105775922
Confusion matrix:
[[ 437 1000]
 [ 179 4384]]


In [138]:
cvres = rf_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(rf_grid_search.best_params_))

0.661380453308 {'max_depth': 10, 'max_leaf_nodes': 15}
0.660149778926 {'max_depth': 10, 'max_leaf_nodes': 20}
0.678057906728 {'max_depth': 10, 'max_leaf_nodes': 25}
0.650504871067 {'max_depth': 12, 'max_leaf_nodes': 15}
0.66044661105 {'max_depth': 12, 'max_leaf_nodes': 20}
0.669719028379 {'max_depth': 12, 'max_leaf_nodes': 25}
0.657017960063 {'max_depth': 15, 'max_leaf_nodes': 15}
0.664223137274 {'max_depth': 15, 'max_leaf_nodes': 20}
0.695917984857 {'max_depth': 15, 'max_leaf_nodes': 25}
Best: {'max_depth': 15, 'max_leaf_nodes': 25}


In [145]:
rf_clf = RandomForestClassifier(n_estimators=500, max_depth=12, max_leaf_nodes=15)
rf_clf.fit(X_train, Y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=15,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [144]:
results(rf_clf)

Test accuracy score: 0.801
ROC: 0.611248597117
PPV: 0.804446050967
NPV: 0.76017130621
Sensitivity: 0.975454744686
Specificity: 0.247042449548
Confusion matrix:
[[ 355 1082]
 [ 112 4451]]


In [146]:
results(rf_clf)

Test accuracy score: 0.802666666667
ROC: 0.622117540698
PPV: 0.80937557224
NPV: 0.734693877551
Sensitivity: 0.968660968661
Specificity: 0.275574112735
Confusion matrix:
[[ 396 1041]
 [ 143 4420]]
