In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

In [23]:
# load data from disk
X = np.load('../hispanic_colrect/X.npy')
#Y1 = np.load('../hispanic_colrect/Y1.npy')
Y2 = np.load('../hispanic_colrect/Y2.npy')
#Y5 = np.load('../hispanic_colrect/Y5.npy')

In [24]:
len(Y2)

37575

In [25]:
# shuffle
np.random.seed(97)
idx = np.random.permutation(len(X))
X = X[idx]
Y = Y2[idx]

# split into training and test sets
TEST_SET_SIZE = int(0.1*len(Y))
X_train, X_test = X[:-TEST_SET_SIZE], X[-TEST_SET_SIZE:]
Y_train, Y_test = Y[:-TEST_SET_SIZE].astype(int), Y[-TEST_SET_SIZE:].astype(int)

In [7]:
X[0]

array([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [26]:
len(np.where(Y==0)[0])/len(Y)

0.17996007984031936

# Feature Scaling
Fit scaler based on training data, then transform both the training and test data.

In [11]:
# feature scaling: scale features based on training data only
from sklearn.preprocessing import StandardScaler, MinMaxScaler
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)



# Results Function

In [27]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier):
    Y_pred_test = classifier.predict(X_test)
    print("Test accuracy score: " + str(accuracy_score(Y_test.astype(int), Y_pred_test)))
    print("ROC: " + str(roc_auc_score(Y_test.astype(int), Y_pred_test)))
    matrix = confusion_matrix(Y_test.astype(int), Y_pred_test)
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

# Imports

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer

## Decision Tree

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# min_sample_split: 300,400
# min_samples_leaf: 200
# max_depth: 130
# min_weight_fraction_leaf: .01
param_grid = [{'max_depth':[130,200], 'min_samples_leaf':[100,200,300]}]
tree_clf_reg = DecisionTreeClassifier()
dt_grid_search = GridSearchCV(tree_clf_reg, param_grid, cv=5, scoring="accuracy", verbose=3)
dt_grid_search.fit(X_train, Y_train.astype(int))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_depth=130, min_samples_leaf=100 .............................
[CV]  max_depth=130, min_samples_leaf=100, score=0.8049767099779358, total=   0.8s
[CV] max_depth=130, min_samples_leaf=100 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  max_depth=130, min_samples_leaf=100, score=0.8121858526419027, total=   0.8s
[CV] max_depth=130, min_samples_leaf=100 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s remaining:    0.0s


[CV]  max_depth=130, min_samples_leaf=100, score=0.8203996567365452, total=   0.8s
[CV] max_depth=130, min_samples_leaf=100 .............................
[CV]  max_depth=130, min_samples_leaf=100, score=0.80397204854726, total=   0.9s
[CV] max_depth=130, min_samples_leaf=100 .............................
[CV]  max_depth=130, min_samples_leaf=100, score=0.8027219225110348, total=   0.8s
[CV] max_depth=130, min_samples_leaf=200 .............................
[CV]  max_depth=130, min_samples_leaf=200, score=0.8079186075018386, total=   0.7s
[CV] max_depth=130, min_samples_leaf=200 .............................
[CV]  max_depth=130, min_samples_leaf=200, score=0.8089984062768174, total=   0.8s
[CV] max_depth=130, min_samples_leaf=200 .............................
[CV]  max_depth=130, min_samples_leaf=200, score=0.8194189040088268, total=   0.8s
[CV] max_depth=130, min_samples_leaf=200 .............................
[CV]  max_depth=130, min_samples_leaf=200, score=0.80728208900331, total=   0.

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   23.9s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [130, 200], 'min_samples_leaf': [100, 200, 300]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [30]:
cvres = dt_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(dt_grid_search.best_params_))

0.8088512933676597 {'max_depth': 130, 'min_samples_leaf': 100}
0.8107882800049038 {'max_depth': 130, 'min_samples_leaf': 200}
0.8123574843692534 {'max_depth': 130, 'min_samples_leaf': 300}
0.8088512933676597 {'max_depth': 200, 'min_samples_leaf': 100}
0.8107882800049038 {'max_depth': 200, 'min_samples_leaf': 200}
0.8123574843692534 {'max_depth': 200, 'min_samples_leaf': 300}
Best: {'max_depth': 130, 'min_samples_leaf': 300}


In [31]:
results(dt_grid_search)

Test accuracy score: 0.8046666666666666
ROC: 0.6832633550154026
PPV: 0.8410782538724603
NPV: 0.6287657920310982
Sensitivity: 0.9162831470523778
Specificity: 0.45024356297842727
G-Mean: 0.6423010110734287
Confusion matrix:
[[ 647  790]
 [ 382 4181]]


## K-Nearest Neighbors

In [86]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, Y_train.astype(int))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [87]:
results(knn_clf)

Test accuracy score: 0.7935
ROC: 0.675444953669
PPV: 0.838630806846
NPV: 0.590659340659
Sensitivity: 0.902038132807
Specificity: 0.44885177453
Confusion matrix:
[[ 645  792]
 [ 447 4116]]


## Logistic Regression

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = [{'C':[.01, .1, .5]}]
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, 
                              scoring='accuracy', verbose=5
                             )
lr_grid_search.fit(X_train, Y_train.astype(int))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.8594235033259423, total=   1.0s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ................. C=0.01, score=0.8539325842696629, total=   1.1s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV] ................. C=0.01, score=0.8542067129971906, total=   1.2s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.3s remaining:    0.0s


[CV] ................. C=0.01, score=0.8534673961259795, total=   1.0s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.7s remaining:    0.0s


[CV] ................. C=0.01, score=0.8488836315244714, total=   1.1s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8582409460458241, total=   1.4s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8527498521584861, total=   1.3s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8571639804820346, total=   1.2s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8561289368623392, total=   1.1s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.8518408990093154, total=   1.3s
[CV] C=0.5 ...........................................................
[CV] .................. C=0.5, score=0.8595713229859572, total=   1.5s
[CV] C=0.5 ...........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   24.6s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.01, 0.1, 0.5]}], pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring='accuracy', verbose=5)

In [34]:
cvres = lr_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(lr_grid_search.best_params_))

0.853983085931 {'C': 0.01}
0.855225028092 {'C': 0.1}
0.856319119995 {'C': 0.5}
Best: {'C': 0.5}


In [35]:
results(lr_grid_search)

Test accuracy score: 0.852541921746
ROC: 0.679298435359
PPV: 0.880036079375
NPV: 0.640371229698
Sensitivity: 0.94970798183
Specificity: 0.408888888889
G-Mean: 0.623157316782
Confusion matrix:
[[ 276  399]
 [ 155 2927]]


In [32]:
lr = LogisticRegression(C=.05)
lr.fit(X_train, Y_train.astype(int))
results(lr)

Test accuracy score: 0.853872770828
ROC: 0.681845122215
PPV: 0.880902255639
NPV: 0.645833333333
Sensitivity: 0.950356911097
Specificity: 0.413333333333
G-Mean: 0.626748905001
Confusion matrix:
[[ 279  396]
 [ 153 2929]]


## Bagging

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
        LogisticRegression(), n_estimators=40,
        max_samples=1000, bootstrap=True, 
        verbose=3
)

bag_clf.fit(X_train, Y_train)

Building estimator 1 of 40 for this parallel run (total 40)...
Building estimator 2 of 40 for this parallel run (total 40)...
Building estimator 3 of 40 for this parallel run (total 40)...
Building estimator 4 of 40 for this parallel run (total 40)...
Building estimator 5 of 40 for this parallel run (total 40)...
Building estimator 6 of 40 for this parallel run (total 40)...
Building estimator 7 of 40 for this parallel run (total 40)...
Building estimator 8 of 40 for this parallel run (total 40)...
Building estimator 9 of 40 for this parallel run (total 40)...
Building estimator 10 of 40 for this parallel run (total 40)...
Building estimator 11 of 40 for this parallel run (total 40)...
Building estimator 12 of 40 for this parallel run (total 40)...
Building estimator 13 of 40 for this parallel run (total 40)...
Building estimator 14 of 40 for this parallel run (total 40)...
Building estimator 15 of 40 for this parallel run (total 40)...
Building estimator 16 of 40 for this parallel run

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.2min finished


BaggingClassifier(base_estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1000, n_estimators=40, n_jobs=1, oob_score=False,
         random_state=None, verbose=3, warm_start=False)

In [None]:
# probably add cross-validation?

In [14]:
results(bag_clf)

  np.exp(prob, prob)


Test accuracy score: 0.787
ROC: 0.7131245833670757
PPV: 0.8636263006420191
NPV: 0.5536075522589345
Sensitivity: 0.8549200087661626
Specificity: 0.5713291579679889
G-Mean: 0.6988853473484455
Confusion matrix:
[[ 821  616]
 [ 662 3901]]


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


## Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
#scoring=make_scorer(roc_auc_score)

param_grid = [{'max_features':[50,100,150], 'n_estimators':[100,200,300], 'min_samples_leaf':[300,500,800]}]
rf_grid_search = GridSearchCV(RandomForestClassifier(), 
            param_grid, cv=3, scoring='accuracy', 
            verbose=5
)
rf_grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] max_features=50, min_samples_leaf=300, n_estimators=100 .........
[CV]  max_features=50, min_samples_leaf=300, n_estimators=100, score=0.7166605369621184, total=   6.8s
[CV] max_features=50, min_samples_leaf=300, n_estimators=100 .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s remaining:    0.0s


[CV]  max_features=50, min_samples_leaf=300, n_estimators=100, score=0.7193085693269584, total=   6.9s
[CV] max_features=50, min_samples_leaf=300, n_estimators=100 .........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.4s remaining:    0.0s


[CV]  max_features=50, min_samples_leaf=300, n_estimators=100, score=0.7207061419639573, total=   6.8s
[CV] max_features=50, min_samples_leaf=300, n_estimators=200 .........


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   21.6s remaining:    0.0s


[CV]  max_features=50, min_samples_leaf=300, n_estimators=200, score=0.7199705774181684, total=  13.5s
[CV] max_features=50, min_samples_leaf=300, n_estimators=200 .........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   35.8s remaining:    0.0s


[CV]  max_features=50, min_samples_leaf=300, n_estimators=200, score=0.7215152629643251, total=  14.2s
[CV] max_features=50, min_samples_leaf=300, n_estimators=200 .........
[CV]  max_features=50, min_samples_leaf=300, n_estimators=200, score=0.7223979404192717, total=  14.1s
[CV] max_features=50, min_samples_leaf=300, n_estimators=300 .........
[CV]  max_features=50, min_samples_leaf=300, n_estimators=300, score=0.7194556822361162, total=  20.9s
[CV] max_features=50, min_samples_leaf=300, n_estimators=300 .........
[CV]  max_features=50, min_samples_leaf=300, n_estimators=300, score=0.7194556822361162, total=  20.6s
[CV] max_features=50, min_samples_leaf=300, n_estimators=300 .........
[CV]  max_features=50, min_samples_leaf=300, n_estimators=300, score=0.720264803236484, total=  20.3s
[CV] max_features=50, min_samples_leaf=500, n_estimators=100 .........
[CV]  max_features=50, min_samples_leaf=500, n_estimators=100, score=0.7106289076866495, total=   6.0s
[CV] max_features=50, min_sa

KeyboardInterrupt: 

In [47]:
results(rf_grid_search)

Test accuracy score: 0.7083333333333334
ROC: 0.5468141193332928
PPV: 0.4283746556473829
NPV: 0.7468714448236633
Sensitivity: 0.18894289185905225
Specificity: 0.9046853468075333
G-Mean: 0.4134414899938381
Confusion matrix:
[[3939  415]
 [1335  311]]


In [48]:
cvres = rf_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(rf_grid_search.best_params_))

0.7240897388745863 {'max_features': 50}
0.7206080666911855 {'max_features': 100}
0.72031384087287 {'max_features': 150}
Best: {'max_features': 50}


In [48]:
rf_clf = RandomForestClassifier(n_estimators=25, max_features=20, min_samples_leaf=50, verbose=3)
rf_clf.fit(X_train, Y_train)


building tree 1 of 25
building tree 2 of 25


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


building tree 3 of 25
building tree 4 of 25
building tree 5 of 25
building tree 6 of 25
building tree 7 of 25
building tree 8 of 25
building tree 9 of 25
building tree 10 of 25
building tree 11 of 25
building tree 12 of 25
building tree 13 of 25
building tree 14 of 25
building tree 15 of 25
building tree 16 of 25
building tree 17 of 25
building tree 18 of 25
building tree 19 of 25
building tree 20 of 25
building tree 21 of 25
building tree 22 of 25
building tree 23 of 25
building tree 24 of 25
building tree 25 of 25


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    3.1s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=20, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=None, verbose=3,
            warm_start=False)

In [49]:
results(rf_clf)

Test accuracy score: 0.845887676338
ROC: 0.649788256784
PPV: 0.869282974329
NPV: 0.630434782609
Sensitivity: 0.955872809864
Specificity: 0.343703703704
G-Mean: 0.573181493961
Confusion matrix:
[[ 232  443]
 [ 136 2946]]


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
