In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

In [37]:
# Select the group whose arrays are loaded from ../Data/<race>/ in the next cell.
# Uncomment exactly one of the assignments below.
#race = "hispanic"
race = "white"
#race = "mixed"

In [38]:
# Load the feature matrix (X) and the label array (Y2) stored for the selected group.
X = np.load('../Data/{}/X.npy'.format(race))
Y2 = np.load('../Data/{}/Y2.npy'.format(race))

In [5]:
# Row count used to truncate the other groups in the (currently commented-out)
# lines of the shuffle/split cell; presumably the size of the hispanic data set — confirm.
size_hispanic = 37575

In [6]:
# Show the dimensions of the loaded feature matrix.
X.shape

(313122, 374)

In [39]:
# Shuffle features and labels with one shared, seeded permutation so that
# every row of X stays aligned with its label.
np.random.seed(42)
idx = np.random.permutation(len(X))
X = X[idx]
Y = Y2[idx]

# Optional: truncate the other groups to the same number of rows as the
# hispanic data set.
#if race != 'hispanic':
#    X = X[:size_hispanic]
#    Y = Y[:size_hispanic]

# Hold out the last 10% of the shuffled rows as the test set.
TEST_SET_SIZE = int(0.1*len(Y))

# Split on an explicit index instead of a negative slice: if TEST_SET_SIZE is 0
# (fewer than 10 rows), X[:-0] is empty and X[-0:] is the whole array, which
# would silently swap the training and test sets.
split_at = len(Y) - TEST_SET_SIZE

X_train, X_test = X[:split_at], X[split_at:]
Y_train, Y_test = Y[:split_at].astype(int), Y[split_at:].astype(int)

In [5]:
# Re-check the dimensions of X.
X.shape

(37575, 301)

In [40]:
# Feature scaling: both scalers learn their statistics from the training rows
# only and are then applied unchanged to the test rows.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Last four columns -> standardised.
std_scaler = StandardScaler().fit(X_train[:,-4:])
X_train[:,-4:] = std_scaler.transform(X_train[:,-4:])
X_test[:,-4:] = std_scaler.transform(X_test[:,-4:])

# Every other column -> rescaled into the range [-1, 1].
mm_scaler = MinMaxScaler(feature_range=(-1,1)).fit(X_train[:,:-4])
X_train[:,:-4] = mm_scaler.transform(X_train[:,:-4])
X_test[:,:-4] = mm_scaler.transform(X_test[:,:-4])



In [41]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def results(classifier,threshold=0.5, X_test=X_test, Y_test=Y_test):
    """Print a binary-classification report for `classifier`.

    Parameters
    ----------
    classifier : object
        Anything with a ``predict(X)`` method whose output can be compared
        against ``threshold`` and passed to ``roc_auc_score`` as scores.
    threshold : float, default 0.5
        Scores greater than or equal to this value are labelled 1, others 0.
    X_test, Y_test : array-like
        Evaluation features and labels. The defaults are the objects bound to
        the global names ``X_test`` / ``Y_test`` at the moment this function
        is defined; rebinding those globals later does not change the defaults.

    Returns
    -------
    None
        Accuracy, ROC AUC, PPV, NPV, sensitivity, specificity, G-mean and the
        confusion matrix are printed.
    """
    Y_true = Y_test.astype(int)

    # Score the evaluation set once and reuse the result for both the
    # thresholded labels and the ROC AUC.
    Y_score = classifier.predict(X_test)
    Y_pred_test = (Y_score>=threshold).astype(int)

    print("Test accuracy score: " + str(accuracy_score(Y_true, Y_pred_test)))
    print("ROC: " + str(roc_auc_score(Y_test, Y_score)))

    # Pin the label order so the matrix is always 2x2 and the four-way unpack
    # below does not depend on which labels happen to occur.
    matrix = confusion_matrix(Y_true, Y_pred_test, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()

    # A ratio whose denominator is zero (e.g. nothing predicted positive) is
    # reported as NaN without a division warning.
    ppv = _safe_ratio(tp, tp+fp)
    npv = _safe_ratio(tn, tn+fn)
    sensitivity = _safe_ratio(tp, tp+fn)
    specificity = _safe_ratio(tn, tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)

    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))

In [33]:
# Peek at the last four columns of the first training row (the columns that
# the scaling cell passes through StandardScaler).
X_train[0,-4:]

array([0.65, 0.0, 0.0, 0.053535353535353526], dtype=object)

## Undersample

In [17]:
from imblearn.under_sampling import RandomUnderSampler
# Randomly under-sample the training split; the dict presumably gives the number
# of rows to keep per class label (6087 for class 0, 20000 for class 1) — confirm.
# NOTE(review): the counts are hard-coded; confirm they are valid for the
# currently selected `race`, whose data set may have different class sizes.
# NOTE(review): no random_state is passed, so the draw is presumably not
# reproducible between runs — confirm.
# NOTE(review): `ratio=` and `fit_sample` look like the older imbalanced-learn
# spelling (newer releases appear to use `sampling_strategy=` and
# `fit_resample`) — confirm against the installed version.
us = RandomUnderSampler(ratio={0:6087, 1:20000})
# NOTE(review): X_train_res / Y_train_res are not used by the training cell in
# this notebook, which fits on X_train / Y_train.
X_train_res, Y_train_res = us.fit_sample(X_train, Y_train)

In [46]:
from tensorflow import keras
# Take the layer and model classes from tf.keras, the same package that
# provides the LeakyReLU activation used below. A plain `from keras... import`
# resolves to the separately installed top-level `keras` package, not to the
# name bound on the line above, which would mix two Keras implementations in
# a single model.
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential

# Architecture hyper-parameters.
N_HIDDEN_LAYERS = 4
HIDDEN_UNITS = 400
DROPOUT_RATE = 0.1
LEAKY_RELU_ALPHA = 0.1

model = Sequential()
# An alternative with dropout applied directly to the inputs is left disabled:
#model.add(Dropout(0.1, input_shape=(X.shape[1],)))

# Identical hidden blocks: Dense with a LeakyReLU activation, then Dropout.
for layer_index in range(N_HIDDEN_LAYERS):
    # Only the first layer has to be told the width of the input.
    first_layer_kwargs = {'input_dim': X.shape[1]} if layer_index == 0 else {}
    model.add(Dense(HIDDEN_UNITS,
                    activation=keras.layers.LeakyReLU(alpha=LEAKY_RELU_ALPHA),
                    **first_layer_kwargs))
    model.add(Dropout(DROPOUT_RATE))

# Single sigmoid unit producing one score per row.
model.add(Dense(1, activation='sigmoid'))

In [47]:
# Configure the network for binary cross-entropy with the Adam optimizer,
# train for 5 epochs in batches of 50 rows, then evaluate on the test split.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, Y_train, batch_size=50, epochs=5)
scores = model.evaluate(X_test, Y_test)
# Report entry 1 of the evaluation results, as a percentage, next to its name.
print("\n%s: %.2f%%" % (model.metrics_names[1], 100*scores[1]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

acc: 85.31%


In [45]:
# Print the evaluation report for the network using the defaults of results():
# threshold 0.5 and the X_test / Y_test split.
results(model)

Test accuracy score: 0.8572621035058431
ROC: 0.8752464359525596
PPV: 0.8902845424250192
NPV: 0.6709257473481196
Sensitivity: 0.9385218213754898
Specificity: 0.5200897028592787
G-Mean: 0.6986526570522203
Confusion matrix:
[[ 2783  2568]
 [ 1365 20838]]


In [32]:
# Apply the scalers fitted on the training split to X_h, overwriting it in place.
# NOTE(review): X_h is not defined in any cell visible in this notebook —
# presumably it was loaded in a cell that has since been removed; confirm.
# NOTE(review): the transformed values are written back into X_h, so executing
# this cell a second time would scale the already-scaled data again.
X_h[:,-4:] = std_scaler.transform(X_h[:,-4:])
X_h[:,:-4] = mm_scaler.transform(X_h[:,:-4])




In [34]:
# Shuffle X_h and Y_h with one shared permutation, then print the report for
# the first TEST_SET_SIZE rows.
# NOTE(review): X_h and Y_h are not defined in any cell visible in this
# notebook — confirm where they come from.
# NOTE(review): the permutation draws from NumPy's global random state, so the
# selected rows depend on what has run since np.random.seed(42) — confirm
# whether a fixed seed is wanted here.
# NOTE(review): TEST_SET_SIZE was computed from the length of the training
# data set, not from X_h — confirm that this slice size is intended.
idx_h = np.random.permutation(len(X_h))
X_h = X_h[idx_h]
Y_h = Y_h[idx_h]
results(model, threshold=0.5, X_test=X_h[:TEST_SET_SIZE], Y_test=Y_h[:TEST_SET_SIZE])

Test accuracy score: 0.850412563215
ROC: 0.875111794603
PPV: 0.921802518224
NPV: 0.558863328823
Sensitivity: 0.895109395109
Specificity: 0.636363636364
G-Mean: 0.754728474099
Confusion matrix:
[[ 413  236]
 [ 326 2782]]
