In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Imports

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
#race = "hispanic"
#race = "white"
#race = "mixed"

In [None]:
#X = np.load('../Data/' + race + '/X.npy')
#Y2 = np.load('../Data/' + race + '/Y2.npy')

In [3]:
# Load the per-cohort feature matrices (X.npy) and label arrays (Y2.npy).
# The feature arrays are stored with dtype=object (see the X_hispanic[0,281:]
# output further down), and NumPy >= 1.16.3 refuses to load object arrays
# unless allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load .npy files
# that come from a trusted source.
X_hispanic = np.load('../Data/hispanic/X.npy', allow_pickle=True)
Y_hispanic = np.load('../Data/hispanic/Y2.npy', allow_pickle=True)
X_white = np.load('../Data/white/X.npy', allow_pickle=True)
Y_white = np.load('../Data/white/Y2.npy', allow_pickle=True)
#X_mixed = np.load('../Data/mixed/X.npy', allow_pickle=True)
#Y_mixed = np.load('../Data/mixed/Y2.npy', allow_pickle=True)

In [24]:
X_hispanic.shape

(37575, 301)

In [30]:
X_hispanic[0,281:]

array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 93, 0, 4, 30], dtype=object)

In [None]:
size_hispanic=37575

In [7]:
def split_train_test(X, Y, test_fraction=0.1, seed=97):
    """Shuffle X and Y together and hold out the tail of the shuffle as a test set.

    Parameters
    ----------
    X : array-like
        Samples, indexed by row along axis 0.
    Y : array-like
        Labels, one per row of X.
    test_fraction : float, default 0.1
        Fraction of rows held out; the test size is ``int(test_fraction * len(Y))``.
    seed : int, default 97
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
        The two label arrays are cast to int.

    Raises
    ------
    ValueError
        If X and Y do not have the same number of rows.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if len(X) != len(Y):
        raise ValueError(
            "X and Y must have the same number of rows, got %d and %d" % (len(X), len(Y))
        )

    # shuffle
    # The global NumPy RNG is seeded (rather than a private generator) on
    # purpose: later cells that do not pass their own seed may rely on it.
    np.random.seed(seed)
    idx = np.random.permutation(len(X))
    X = X[idx]
    Y = Y[idx]

    # split into training and test sets
    # Slice at an explicit boundary: with a negative-index split, a test size
    # of 0 turns X[:-0] into an empty training set and X[-0:] into the whole
    # data set.
    n_test = int(test_fraction * len(Y))
    n_train = len(Y) - n_test
    X_train, X_test = X[:n_train], X[n_train:]
    Y_train, Y_test = Y[:n_train].astype(int), Y[n_train:].astype(int)
    return X_train, X_test, Y_train, Y_test

In [8]:
X_train_h, X_test_h, Y_train_h, Y_test_h = split_train_test(X_hispanic, Y_hispanic)
X_train_w, X_test_w, Y_train_w, Y_test_w = split_train_test(X_white, Y_white)
#X_train_m, X_test_m, Y_train_m, Y_test_m = split_train_test(X_mixed, Y_mixed)

# Feature Scaling
Fit the scalers on the training data only, then apply the fitted transforms to both the training and test data, so that no test-set statistics leak into preprocessing.

In [9]:
# feature scaling: scale features based on training data only
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def feature_scale(X_train, X_test):
    """Scale features using statistics learned from the training set only.

    The last four columns are min-max scaled to [-1, 1]; every other column
    is standardized with ``StandardScaler``. Both scalers are fit on
    ``X_train`` and then applied to ``X_test``.

    Parameters
    ----------
    X_train, X_test : 2-D array-like with the same number of columns
        Must be convertible to float.

    Returns
    -------
    X_train, X_test : numpy.ndarray of float
        Scaled copies; the arrays passed in are left unmodified.
    """
    # Work on float copies: assigning into the arguments would silently
    # rewrite the caller's arrays, and object-dtype inputs would stay
    # object-dtype after scaling.
    X_train = np.array(X_train, dtype=float)
    X_test = np.array(X_test, dtype=float)

    mm_scaler = MinMaxScaler(feature_range=(-1, 1))
    X_train[:, -4:] = mm_scaler.fit_transform(X_train[:, -4:])
    X_test[:, -4:] = mm_scaler.transform(X_test[:, -4:])

    std_scaler = StandardScaler()
    X_train[:, :-4] = std_scaler.fit_transform(X_train[:, :-4])
    X_test[:, :-4] = std_scaler.transform(X_test[:, :-4])
    return X_train, X_test

X_train_h, X_test_h = feature_scale(X_train_h, X_test_h)
X_train_w, X_test_w = feature_scale(X_train_w, X_test_w)
#X_train_m, X_test_m = feature_scale(X_train_m, X_test_m)



# Results Function

In [10]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier, X_test, Y_test):
    """Print held-out evaluation metrics for a fitted binary classifier.

    Prints accuracy, ROC AUC, PPV, NPV, sensitivity, specificity, G-mean
    (square root of sensitivity * specificity) and the confusion matrix.

    Parameters
    ----------
    classifier : fitted estimator
        Must provide ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_test : array-like
        Test features passed to the classifier.
    Y_test : array-like
        True labels; cast to int before scoring.

    Returns
    -------
    None
    """
    def _ratio(numerator, denominator):
        # An empty denominator (e.g. no positive predictions) is reported as
        # NaN explicitly instead of via a NumPy divide warning.
        return numerator / denominator if denominator else float("nan")

    Y_true = np.asarray(Y_test).astype(int)
    Y_pred_test = classifier.predict(X_test)
    print("Test accuracy score: " + str(accuracy_score(Y_true, Y_pred_test)))

    # ROC AUC needs a continuous score. Not every estimator this function is
    # called with exposes predict_proba (the SVM models in this notebook do
    # not), so fall back to the signed decision-function margin.
    if hasattr(classifier, "predict_proba"):
        scores = classifier.predict_proba(X_test)[:, 1]
    else:
        scores = classifier.decision_function(X_test)
    print("ROC: " + str(roc_auc_score(Y_true, scores)))

    matrix = confusion_matrix(Y_true, Y_pred_test)
    tn, fp, fn, tp = matrix.ravel()
    ppv = _ratio(tp, tp + fp)
    npv = _ratio(tn, tn + fn)
    sensitivity = _ratio(tp, tp + fn)
    specificity = _ratio(tn, tn + fp)
    g_mean = np.sqrt(sensitivity * specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

## Decision Tree

In [None]:

# min_sample_split: 300,400
# min_samples_leaf: 200
# max_depth: 130
# min_weight_fraction_leaf: .01
param_grid = [{'max_depth':[40,50,60], 'min_samples_leaf':[250,260,270,280,290]}]
tree_clf_reg = DecisionTreeClassifier()
# Use the built-in 'roc_auc' scorer: it ranks samples by the model's continuous
# output. make_scorer(roc_auc_score) would pass hard class predictions instead,
# which is not the ROC AUC that results() reports.
dt_grid_search = GridSearchCV(tree_clf_reg, param_grid, cv=3, scoring='roc_auc', verbose=3)
dt_grid_search.fit(X_train_h, Y_train_h.astype(int))

In [None]:
# List the mean cross-validated score of every parameter combination tried by
# the decision-tree search, followed by the best combination.
cvres = dt_grid_search.cv_results_
scored_params = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scored_params:
    print(mean_score, params)
print("Best: {}".format(dt_grid_search.best_params_))

In [None]:
results(dt_grid_search, X_test_h, Y_test_h)

In [None]:
# One decision tree per cohort, all with the same hyper-parameters.
tree_clf_h = DecisionTreeClassifier(max_depth=130, min_samples_leaf=200)
tree_clf_h.fit(X_train_h, Y_train_h)

tree_clf_w = DecisionTreeClassifier(max_depth=130, min_samples_leaf=200)
tree_clf_w.fit(X_train_w, Y_train_w)

# The mixed cohort is not loaded, split or scaled in this notebook (those
# lines are commented out), so X_train_m / Y_train_m do not exist. Re-enable
# these two lines together with the mixed-cohort lines in the earlier cells.
#tree_clf_m = DecisionTreeClassifier(max_depth=130, min_samples_leaf=200)
#tree_clf_m.fit(X_train_m, Y_train_m)

In [None]:
results(tree_clf_h, X_test_h, Y_test_h)

In [None]:
results(tree_clf_w, X_test_w, Y_test_w)

In [None]:
# Disabled: the mixed-cohort split (X_test_m / Y_test_m) is commented out in
# the loading, splitting and scaling cells, so this call raises NameError.
#results(tree_clf_m, X_test_m, Y_test_m)

In [None]:
d_h = dict(enumerate(tree_clf_h.feature_importances_))
sorted(d_h, key=d_h.get, reverse=True)

In [None]:
d_w = dict(enumerate(tree_clf_w.feature_importances_))
sorted(d_w, key=d_w.get, reverse=True)

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fit on the Hispanic-cohort split, the cohort used by the other single-model
# experiments in this notebook; un-suffixed X_train / Y_train are never defined.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_h, Y_train_h.astype(int))

In [None]:
# results() requires the test features and labels; evaluate on the Hispanic-cohort test split.
results(knn_clf, X_test_h, Y_test_h)

## Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Search the regularization strength C with 5-fold cross-validation.
param_grid = [{'C':[.01, .1, .5]}]
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5,
                              scoring='accuracy', verbose=5
                             )
# Fit on the Hispanic-cohort split; un-suffixed X_train / Y_train are never
# defined in this notebook.
lr_grid_search.fit(X_train_h, Y_train_h.astype(int))

In [None]:
# List the mean cross-validated score of every C value tried by the
# logistic-regression search, followed by the best one.
cvres = lr_grid_search.cv_results_
scored_params = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scored_params:
    print(mean_score, params)
print("Best: {}".format(lr_grid_search.best_params_))

In [None]:
# results() requires the test features and labels; evaluate on the Hispanic-cohort test split.
results(lr_grid_search, X_test_h, Y_test_h)

In [None]:
# Logistic regression with a fixed C, fit and evaluated on the Hispanic-cohort
# split. Un-suffixed X_train / Y_train are never defined in this notebook, and
# results() requires the test features and labels.
lr = LogisticRegression(C=.05)
lr.fit(X_train_h, Y_train_h.astype(int))
results(lr, X_test_h, Y_test_h)

## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bag 40 logistic-regression models, each trained on a bootstrap sample of
# 1000 rows.
bag_clf = BaggingClassifier(
        LogisticRegression(), n_estimators=40,
        max_samples=1000, bootstrap=True,
        verbose=3
)

# Fit on the Hispanic-cohort split; un-suffixed X_train / Y_train are never
# defined in this notebook.
bag_clf.fit(X_train_h, Y_train_h)

In [None]:
# probably add cross-validation?

In [None]:
# results() requires the test features and labels; evaluate on the Hispanic-cohort test split.
results(bag_clf, X_test_h, Y_test_h)

## Random Forest

In [None]:

#scoring=make_scorer(roc_auc_score)

param_grid = [{'max_features':[50,300], 'n_estimators':[10,20,30], 'min_samples_leaf':[200,300,400]}]
# Use the built-in 'roc_auc' scorer: it ranks samples by predicted probability.
# make_scorer(roc_auc_score) would pass hard class predictions instead, which
# is not the ROC AUC that results() reports.
rf_grid_search = GridSearchCV(RandomForestClassifier(),
            param_grid, cv=3, scoring='roc_auc',
            verbose=5
)
rf_grid_search.fit(X_train_h, Y_train_h)

In [None]:
# List the mean cross-validated score of every parameter combination tried by
# the random-forest search, followed by the best combination.
cvres = rf_grid_search.cv_results_
scored_params = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scored_params:
    print(mean_score, params)
print("Best: {}".format(rf_grid_search.best_params_))

In [11]:
rf_clf = RandomForestClassifier(n_estimators=30, min_samples_leaf=200, max_features=300)
rf_clf.fit(X_train_h, Y_train_h)
results(rf_clf, X_test_h, Y_test_h)

Test accuracy score: 0.8461538461538461
ROC: 0.8553589540221598
PPV: 0.8798543689320388
NPV: 0.6052060737527115
Sensitivity: 0.9409474367293965
Specificity: 0.41333333333333333
G-Mean: 0.6236384693993927
Confusion matrix:
[[ 279  396]
 [ 182 2900]]


In [26]:
rf_clf.n_features_

301

In [15]:
sorted(enumerate(rf_clf.feature_importances_))

[(0, 0.00035226075684693497),
 (1, 0.0013237360211737826),
 (2, 0.0),
 (3, 2.1734188184355667e-05),
 (4, 0.00013755626287350042),
 (5, 0.0),
 (6, 1.145963028135228e-06),
 (7, 0.00048417719333443133),
 (8, 4.6080510812542675e-05),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0004664315766510421),
 (16, 0.0),
 (17, 0.0011837063811666233),
 (18, 7.994451474984132e-06),
 (19, 0.0013231199832952065),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0016219270457278221),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 1.2105732952010693e-05),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 1.5662288578260866e-05),
 (61, 0.0),
 (62, 0

In [12]:
sorted(enumerate(rf_clf.feature_importances_), key=lambda x:x[1], reverse=True)

[(187, 0.6116557791791272),
 (298, 0.09086089528838262),
 (297, 0.07815148210533035),
 (233, 0.04830339369311962),
 (266, 0.046155417243447606),
 (277, 0.04298960143297676),
 (300, 0.026576554594294907),
 (211, 0.014586639315821302),
 (276, 0.011701273055902916),
 (102, 0.00886224726065714),
 (248, 0.0021732439848461153),
 (144, 0.0018307139486371166),
 (101, 0.0016629596286865335),
 (41, 0.0016219270457278221),
 (1, 0.0013237360211737826),
 (19, 0.0013231199832952065),
 (17, 0.0011837063811666233),
 (299, 0.0009215151612075779),
 (292, 0.0006967917706607091),
 (104, 0.0006858255591990931),
 (169, 0.0006370893247016833),
 (133, 0.000492029090103901),
 (7, 0.00048417719333443133),
 (15, 0.0004664315766510421),
 (203, 0.00045980017571193747),
 (251, 0.0004301333933496313),
 (137, 0.00038278417811572144),
 (0, 0.00035226075684693497),
 (229, 0.00030051115659024245),
 (213, 0.0002899736915702199),
 (218, 0.00028124555087410074),
 (275, 0.0002663552766936968),
 (267, 0.000235259619930824),


In [None]:
results(rf_grid_search, X_test_h, Y_test_h)

In [None]:
# Disabled: the mixed cohort is not loaded, split or scaled in this notebook
# (those lines are commented out), so X_train_m / Y_train_m do not exist and
# this cell raises NameError. Re-enable together with the mixed-cohort lines
# in the earlier cells.
#rf_clf = RandomForestClassifier(n_estimators=100, max_features=20, min_samples_leaf=50, verbose=3)
#rf_clf.fit(X_train_m, Y_train_m)


In [None]:
# Disabled: the mixed-cohort split (X_test_m / Y_test_m) is commented out in
# the loading, splitting and scaling cells, so this call raises NameError.
#results(rf_clf, X_test_m, Y_test_m)

## Kernel SVM

In [None]:
from sklearn.svm import SVC
svm_clf = SVC(kernel="rbf", gamma=5, C=0.1, verbose=3)
svm_clf.fit(X_train_h, Y_train_h)

In [None]:
results(svm_clf, X_test_h, Y_test_h)

In [None]:
mm_scaler = MinMaxScaler(feature_range=(-1,1)).fit(X_train_h)

## Linear SVM

In [None]:
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=1e-5, loss="hinge")
mm_scaler = MinMaxScaler(feature_range=(-1,1)).fit(X_train_h)
svm_clf.fit(mm_scaler.transform(X_train_h), Y_train_h)

In [None]:
results(svm_clf, mm_scaler.transform(X_test_h), Y_test_h)