In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Imports

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
#race = "hispanic"
#race = "white"
#race = "mixed"

In [None]:
#X = np.load('../Data/' + race + '/X.npy')
#Y2 = np.load('../Data/' + race + '/Y2.npy')

In [4]:
X_hispanic = np.load('../Data/hispanic/X.npy')
Y_hispanic = np.load('../Data/hispanic/Y2.npy')
X_white = np.load('../Data/white/X.npy')
Y_white = np.load('../Data/white/Y2.npy')
#X_mixed = np.load('../Data/mixed/X.npy')
#Y_mixed = np.load('../Data/mixed/Y2.npy')

In [5]:
X_hispanic.shape

(37575, 301)

In [6]:
X_hispanic[0,281:]

array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 93, 0, 4, 30], dtype=object)

In [7]:
size_hispanic=37575

In [105]:
def split_train_test(X, Y):
    # shuffle
    np.random.seed(97)
    idx = np.random.permutation(len(X))
    X = X[idx]
    Y = Y[idx]
    
    #X = X[:size_hispanic]
    #Y = Y[:size_hispanic]

    # split into training and test sets
    TEST_SET_SIZE = int(0.1*len(Y))
    X_train, X_test = X[:-TEST_SET_SIZE], X[-TEST_SET_SIZE:]
    Y_train, Y_test = Y[:-TEST_SET_SIZE].astype(int), Y[-TEST_SET_SIZE:].astype(int)
    return X_train, X_test, Y_train, Y_test

In [106]:
X_train_h, X_test_h, Y_train_h, Y_test_h = split_train_test(X_hispanic, Y_hispanic)
X_train_w, X_test_w, Y_train_w, Y_test_w = split_train_test(X_white, Y_white)
#X_train_m, X_test_m, Y_train_m, Y_test_m = split_train_test(X_mixed, Y_mixed)

# Feature Scaling
Fit scaler based on training data, then transform both the training and test data.

In [107]:
# feature scaling: scale features based on training data only
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def feature_scale(X_train, X_test):
    
    mm_scaler = MinMaxScaler(feature_range=(-1,1))
    X_train[:,-4:] = mm_scaler.fit_transform(X_train[:,-4:])
    X_test[:,-4:] = mm_scaler.transform(X_test[:,-4:])
    
    std_scaler = StandardScaler()
    X_train[:,:-4] = std_scaler.fit_transform(X_train[:,:-4])
    X_test[:,:-4] = std_scaler.transform(X_test[:,:-4])
    return X_train, X_test

X_train_h, X_test_h = feature_scale(X_train_h, X_test_h)
X_train_w, X_test_w = feature_scale(X_train_w, X_test_w)
#X_train_m, X_test_m = feature_scale(X_train_m, X_test_m)



# Results Function

In [99]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier, X_test, Y_test):
    Y_pred_test = classifier.predict(X_test)
    print("Test accuracy score: " + str(accuracy_score(Y_test.astype(int), Y_pred_test)))
    print("ROC: " + str(roc_auc_score(Y_test, classifier.predict_proba(X_test)[:,1])))
    #print("ROC: " + str(roc_auc_score(Y_test, Y_pred_test)))
    matrix = confusion_matrix(Y_test.astype(int), Y_pred_test)
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

## Decision Tree

In [None]:

# min_sample_split: 300,400
# min_samples_leaf: 200
# max_depth: 130
# min_weight_fraction_leaf: .01
param_grid = [{'max_depth':[40,50,60], 'min_samples_leaf':[250,260,270,280,290]}]
tree_clf_reg = DecisionTreeClassifier()
dt_grid_search = GridSearchCV(tree_clf_reg, param_grid, cv=3, scoring=make_scorer(roc_auc_score), verbose=3)
dt_grid_search.fit(X_train_h, Y_train_h.astype(int))

In [None]:
cvres = dt_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(dt_grid_search.best_params_))

In [None]:
results(dt_grid_search, X_test_h, Y_test_h)

In [None]:
tree_clf_h = DecisionTreeClassifier(max_depth=130, min_samples_leaf=200)
tree_clf_h.fit(X_train_h, Y_train_h)

tree_clf_w = DecisionTreeClassifier(max_depth=130, min_samples_leaf=200)
tree_clf_w.fit(X_train_w, Y_train_w)

tree_clf_m = DecisionTreeClassifier(max_depth=130, min_samples_leaf=200)
tree_clf_m.fit(X_train_m, Y_train_m)

In [None]:
results(tree_clf_h, X_test_h, Y_test_h)

In [None]:
results(tree_clf_w, X_test_w, Y_test_w)

In [None]:
results(tree_clf_m, X_test_m, Y_test_m)

In [None]:
d_h = dict(enumerate(tree_clf_h.feature_importances_))
sorted(d_h, key=d_h.get, reverse=True)

In [None]:
d_w = dict(enumerate(tree_clf_w.feature_importances_))
sorted(d_w, key=d_w.get, reverse=True)

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, Y_train.astype(int))

In [None]:
results(knn_clf)

## Logistic Regression

In [100]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = [{'C':[.01, .1, .75, 1, 1.5, 2]}]
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=3, 
                              scoring=make_scorer(roc_auc_score), verbose=5
                             )
lr_grid_search.fit(X_train_w, Y_train_w.astype(int))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.6992168367903329, total=   1.5s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


[CV] .................. C=0.01, score=0.700153633445304, total=   1.5s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s


[CV] ................... C=0.01, score=0.69466771042928, total=   1.5s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.0s remaining:    0.0s


[CV] .................. C=0.1, score=0.7052212354780437, total=   2.8s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.1s remaining:    0.0s


[CV] .................. C=0.1, score=0.6996581478694397, total=   2.7s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.6951793853867201, total=   2.8s
[CV] C=0.75 ..........................................................
[CV] ................. C=0.75, score=0.7059609083263989, total=   4.2s
[CV] C=0.75 ..........................................................
[CV] ................. C=0.75, score=0.7014997792313875, total=   4.3s
[CV] C=0.75 ..........................................................
[CV] ................. C=0.75, score=0.6966285864256561, total=   4.7s
[CV] C=1 .............................................................
[CV] .................... C=1, score=0.7055044957294112, total=   4.0s
[CV] C=1 .............................................................
[CV] .................... C=1, score=0.7020662997341228, total=   4.3s
[CV] C=1 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  1.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.01, 0.1, 0.75, 1, 1.5, 2]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=5)

In [101]:
cvres = lr_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(lr_grid_search.best_params_))

0.6980128258006144 {'C': 0.01}
0.7000197327031548 {'C': 0.1}
0.7013632313273677 {'C': 0.75}
0.7012477370479314 {'C': 1}
0.7014969935136673 {'C': 1.5}
0.7016491059322929 {'C': 2}
Best: {'C': 2}


In [None]:
results(lr_grid_search)

In [108]:
lr = LogisticRegression(C=1)
lr.fit(X_train_w, Y_train_w.astype(int))
results(lr, X_test=X_test_w, Y_test=Y_test_w)

Test accuracy score: 0.8540320824562677
ROC: 0.8707343684588752
PPV: 0.884913611462284
NPV: 0.6623953974895398
Sensitivity: 0.9420816509645581
Specificity: 0.481193009118541
G-Mean: 0.6732927331131673
Confusion matrix:
[[ 2533  2731]
 [ 1291 20999]]


In [109]:
sorted(enumerate(lr.coef_[0]), key=lambda x:x[1], reverse=True)

[(363, 0.610851386708917),
 (71, 0.27051883424535456),
 (340, 0.20457990381025412),
 (177, 0.1572662770423993),
 (331, 0.15068680679402444),
 (251, 0.15012767329282642),
 (164, 0.13658806171747137),
 (182, 0.11876830937377733),
 (247, 0.10605701627513948),
 (202, 0.09552288315006163),
 (339, 0.0877107547846889),
 (233, 0.08184656030738441),
 (243, 0.08107293476543993),
 (1, 0.07695169524822658),
 (173, 0.06689424550126884),
 (52, 0.06548688895429705),
 (156, 0.06396656708815265),
 (313, 0.06352297239145206),
 (345, 0.06304098486493086),
 (316, 0.059980020414922224),
 (286, 0.057546888039859266),
 (9, 0.05598542942558552),
 (359, 0.0552996606248446),
 (172, 0.05494482403524912),
 (287, 0.054361535983506945),
 (43, 0.053512307967868246),
 (89, 0.052793537855950805),
 (157, 0.05275843062555647),
 (48, 0.051528615319774915),
 (139, 0.04944210062039059),
 (246, 0.049360743347171644),
 (288, 0.04934570600135513),
 (299, 0.049032428859048655),
 (88, 0.048443431185606116),
 (321, 0.04777357126

## Random Forest

In [42]:

#scoring=make_scorer(roc_auc_score)

param_grid = [{'max_features':[175,200], 'n_estimators':[10,15,20], 'min_samples_leaf':[150,175,200]}]
rf_grid_search = GridSearchCV(RandomForestClassifier(), 
            param_grid, cv=3, scoring=make_scorer(roc_auc_score), 
            verbose=5
)
rf_grid_search.fit(X_train_h, Y_train_h)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] max_features=175, min_samples_leaf=150, n_estimators=10 .........
[CV]  max_features=175, min_samples_leaf=150, n_estimators=10, score=0.6793289012051349, total=   1.8s
[CV] max_features=175, min_samples_leaf=150, n_estimators=10 .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


[CV]  max_features=175, min_samples_leaf=150, n_estimators=10, score=0.692053071228758, total=   1.7s
[CV] max_features=175, min_samples_leaf=150, n_estimators=10 .........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.9s remaining:    0.0s


[CV]  max_features=175, min_samples_leaf=150, n_estimators=10, score=0.6642879000996424, total=   1.8s
[CV] max_features=175, min_samples_leaf=150, n_estimators=15 .........


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.8s remaining:    0.0s


[CV]  max_features=175, min_samples_leaf=150, n_estimators=15, score=0.6905024537115332, total=   2.5s
[CV] max_features=175, min_samples_leaf=150, n_estimators=15 .........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.5s remaining:    0.0s


[CV]  max_features=175, min_samples_leaf=150, n_estimators=15, score=0.6881822988987676, total=   2.5s
[CV] max_features=175, min_samples_leaf=150, n_estimators=15 .........
[CV]  max_features=175, min_samples_leaf=150, n_estimators=15, score=0.6683329203558037, total=   2.5s
[CV] max_features=175, min_samples_leaf=150, n_estimators=20 .........
[CV]  max_features=175, min_samples_leaf=150, n_estimators=20, score=0.6878157776711931, total=   3.2s
[CV] max_features=175, min_samples_leaf=150, n_estimators=20 .........
[CV]  max_features=175, min_samples_leaf=150, n_estimators=20, score=0.6901658161333959, total=   3.3s
[CV] max_features=175, min_samples_leaf=150, n_estimators=20 .........
[CV]  max_features=175, min_samples_leaf=150, n_estimators=20, score=0.670370427246983, total=   3.3s
[CV] max_features=175, min_samples_leaf=175, n_estimators=10 .........
[CV]  max_features=175, min_samples_leaf=175, n_estimators=10, score=0.6775437730152085, total=   1.7s
[CV] max_features=175, min_s

[CV]  max_features=200, min_samples_leaf=200, n_estimators=20, score=0.6929967920795372, total=   3.5s
[CV] max_features=200, min_samples_leaf=200, n_estimators=20 .........
[CV]  max_features=200, min_samples_leaf=200, n_estimators=20, score=0.6655921785841743, total=   3.7s


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  2.5min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_features': [175, 200], 'n_estimators': [10, 15, 20], 'min_samples_leaf': [150, 175, 200]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=5)

In [43]:
cvres = rf_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(rf_grid_search.best_params_))

0.6785570461047503 {'max_features': 175, 'min_samples_leaf': 150, 'n_estimators': 10}
0.6823396384891638 {'max_features': 175, 'min_samples_leaf': 150, 'n_estimators': 15}
0.6827843740873832 {'max_features': 175, 'min_samples_leaf': 150, 'n_estimators': 20}
0.677414981454461 {'max_features': 175, 'min_samples_leaf': 175, 'n_estimators': 10}
0.680654654405189 {'max_features': 175, 'min_samples_leaf': 175, 'n_estimators': 15}
0.6822053698825022 {'max_features': 175, 'min_samples_leaf': 175, 'n_estimators': 20}
0.6791560033538183 {'max_features': 175, 'min_samples_leaf': 200, 'n_estimators': 10}
0.6765716454808399 {'max_features': 175, 'min_samples_leaf': 200, 'n_estimators': 15}
0.6752814314443205 {'max_features': 175, 'min_samples_leaf': 200, 'n_estimators': 20}
0.6825958397716648 {'max_features': 200, 'min_samples_leaf': 150, 'n_estimators': 10}
0.6812836774932842 {'max_features': 200, 'min_samples_leaf': 150, 'n_estimators': 15}
0.6840706106467923 {'max_features': 200, 'min_samples_le

In [46]:
rf_clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=300, max_features=225)
rf_clf.fit(X_train_h, Y_train_h)
results(rf_clf, X_test_h, Y_test_h)

Test accuracy score: 0.8445568272557892
ROC: 0.8518840579710145
PPV: 0.8748499399759904
NPV: 0.6070588235294118
Sensitivity: 0.9458144062297209
Specificity: 0.38222222222222224
G-Mean: 0.6012580844852863
Confusion matrix:
[[ 258  417]
 [ 167 2915]]


In [47]:
sorted(enumerate(rf_clf.feature_importances_), key=lambda x:x[1], reverse=True)

[(187, 0.5697810360268841),
 (277, 0.12129826161643897),
 (297, 0.07440201343781659),
 (298, 0.0646508076851262),
 (233, 0.05641264141965241),
 (266, 0.04393150533278482),
 (300, 0.021550127872390086),
 (102, 0.010357623284759672),
 (276, 0.008565160114061339),
 (170, 0.007650877963974014),
 (211, 0.007505700745796593),
 (275, 0.002222382464448982),
 (267, 0.0021908561452082614),
 (41, 0.0009972427842070656),
 (19, 0.0009095025274065983),
 (101, 0.0007702506440262185),
 (1, 0.0007457322059716467),
 (218, 0.0006380097189094962),
 (248, 0.0006366913920819013),
 (104, 0.0005872028224967013),
 (144, 0.0005664631453068588),
 (299, 0.0004599625276868865),
 (7, 0.00036093351322826866),
 (133, 0.0003598373306445186),
 (292, 0.0002711136152987014),
 (213, 0.00026975087245193223),
 (99, 0.0002462104498946162),
 (137, 0.00019970244030970163),
 (251, 0.0001726802777839547),
 (203, 0.00015843505275603084),
 (4, 0.00013910323714702317),
 (278, 0.00013665999426966976),
 (169, 9.945334980593786e-05),


In [None]:
results(rf_grid_search, X_test_h, Y_test_h)

In [None]:
rf_clf = RandomForestClassifier(n_estimators=100, max_features=20, min_samples_leaf=50, verbose=3)
rf_clf.fit(X_train_m, Y_train_m)


In [None]:
results(rf_clf, X_test_m, Y_test_m)

## Kernel SVM

In [None]:
from sklearn.svm import SVC
svm_clf = SVC(kernel="rbf", gamma=5, C=0.1, verbose=3)
svm_clf.fit(X_train_h, Y_train_h)

In [None]:
results(svm_clf, X_test_h, Y_test_h)

In [None]:
mm_scaler = MinMaxScaler(feature_range=(-1,1)).fit(X_train_h)

## Linear SVM

In [61]:
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=.01)
svm_clf.fit(X_train_h, Y_train_h)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [62]:
results(svm_clf, X_test_h, Y_test_h)

Test accuracy score: 0.8514772424807027
ROC: 0.6763354723964717
PPV: 0.878978978978979
NPV: 0.6370023419203747
Sensitivity: 0.9497079818299805
Specificity: 0.40296296296296297
G-Mean: 0.6186252034210898
Confusion matrix:
[[ 272  403]
 [ 155 2927]]


In [54]:
sorted(enumerate(svm_clf.coef_[0]), key=lambda x:x[1], reverse=True)

[(50, 0.12191538013950175),
 (187, 0.0848940088933371),
 (299, 0.07784953698289407),
 (123, 0.051141067770208916),
 (269, 0.046320907747084535),
 (40, 0.04453483168525132),
 (35, 0.04364683474237397),
 (126, 0.04076491288364893),
 (266, 0.03556333886225712),
 (121, 0.030437792079564308),
 (91, 0.029007493232082925),
 (59, 0.026652221677057625),
 (133, 0.025239734400257566),
 (265, 0.024098400561465602),
 (108, 0.023748322233822162),
 (231, 0.023445893376109448),
 (211, 0.022503990963415137),
 (280, 0.021375114922663554),
 (295, 0.020928129674443033),
 (278, 0.02081058063150811),
 (168, 0.020174917357648432),
 (83, 0.020145322095191327),
 (9, 0.020023630005344604),
 (190, 0.01975800900099815),
 (100, 0.019735476855558633),
 (225, 0.019253653417910892),
 (55, 0.01845265069850628),
 (185, 0.018381599239679577),
 (114, 0.017952663807103895),
 (87, 0.01770286756497439),
 (116, 0.016202596484654955),
 (125, 0.015790515652467585),
 (88, 0.015729191868451304),
 (274, 0.015636927329712166),
 (8

# Ensemble

In [93]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
clf1 = RandomForestClassifier(n_estimators=20, min_samples_leaf=300, max_features=225)
clf2 = LogisticRegression(C=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[
    ('rf', clf1), ('lr', clf2), ('nb', clf3)
], voting='soft')
eclf.fit(X_train_h, Y_train_h)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=225, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=300, min_samples_split=2,
            min_wei...r='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('nb', GaussianNB(priors=None))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [94]:
results(eclf, X_test_h, Y_test_h)

Test accuracy score: 0.8261911099281342
ROC: 0.8564116614992668
PPV: 0.9305210918114144
NPV: 0.5117521367521367
Sensitivity: 0.8517196625567813
Specificity: 0.7096296296296296
G-Mean: 0.7774352118912816
Confusion matrix:
[[ 479  196]
 [ 457 2625]]


  if diff:
