In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

In [2]:
# uncomment the group you want to train on
race = "hispanic"
#race = "white"
#race = "mixed"

In [3]:
X = np.load('../Data/' + race + '/X.npy')
Y2 = np.load('../Data/' + race + '/Y2.npy')

In [4]:
# shuffle
np.random.seed(42)
idx = np.random.permutation(len(X))
X = X[idx]
Y = Y2[idx]

#if race != 'hispanic':
#   X = X[:size_hispanic]
#    Y = Y[:size_hispanic]

TEST_SET_SIZE = int(0.1*len(Y))

X_train, X_test = X[:-TEST_SET_SIZE], X[-TEST_SET_SIZE:]
Y_train, Y_test = Y[:-TEST_SET_SIZE].astype(int), Y[-TEST_SET_SIZE:].astype(int)

In [5]:
# Features Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
std_scaler = StandardScaler()
X_train[:,-4:] = std_scaler.fit_transform(X_train[:,-4:])
X_test[:,-4:] = std_scaler.transform(X_test[:,-4:])

mm_scaler = MinMaxScaler(feature_range=(-1,1))
X_train[:,:-4] = mm_scaler.fit_transform(X_train[:,:-4])
X_test[:,:-4] = mm_scaler.transform(X_test[:,:-4])



In [6]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier,X_test=X_test, Y_test=Y_test):
    Y_pred_test = classifier.predict(X_test)
    #Y_pred_test = (Y_pred_test>=threshold).astype(int)
    print("Test accuracy score: " + str(accuracy_score(Y_test.astype(int), Y_pred_test)))
    print("ROC: " + str(roc_auc_score(Y_test, classifier.predict_proba(X_test)[:,1])))
    matrix = confusion_matrix(Y_test.astype(int), Y_pred_test)
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    #print("PPV: " + str(ppv))
    #print("NPV: " + str(npv))
    #print("Sensitivity: " + str(sensitivity))
    #print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    #print("Confusion matrix:\n" + str(matrix))

## Autoencoder

In [12]:
def vae_loss(x, x_decoded_mean):
    xent_loss = objectives.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma), axis=-1)
    return xent_loss + kl_loss

In [17]:
from keras.layers import Input, Dense
from keras.models import Model

input_dim = X_train.shape[1]
encoding_dim = 150
hidden_dim1 = 250

input_vec = Input(shape=(input_dim,))
hidden1 = Dense(hidden_dim1, activation='relu')(input_vec)
encoded = Dense(encoding_dim, activation='relu')(hidden1)
hidden2 = Dense(hidden_dim1, activation='relu')(encoded)
decoded = Dense(input_dim, activation='tanh')(hidden2)

noise = np.random.normal(size=X.shape)

autoencoder = Model(input_vec, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

autoencoder.fit(X+0.25*noise,X,epochs=7,batch_size=250)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x18354dbbe0>

In [18]:
encoder = Model(input_vec, encoded)
X_train_enc = encoder.predict(X_train)
X_test_enc = encoder.predict(X_test)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

print('Logistic Regression')
lr = LogisticRegression(C=1)
lr.fit(X_train_enc, Y_train)
results(lr, X_test=X_test_enc, Y_test=Y_test)
print()
print('Random Forest')
rf_clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=150, random_state=42)
rf_clf.fit(X_train_enc, Y_train)
results(rf_clf, X_test=X_test_enc, Y_test=Y_test)
print()
print('AdaBoost')
ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, 
                           algorithm="SAMME.R", learning_rate=1)
ab_clf.fit(X_train_enc, Y_train)
results(ab_clf, X_test=X_test_enc, Y_test=Y_test)

Logistic Regression
Test accuracy score: 0.85520361991
ROC: 0.853397087864
G-Mean: 0.623002526093

Random Forest
Test accuracy score: 0.848815544317
ROC: 0.841626031644
G-Mean: 0.532232078953

AdaBoost
Test accuracy score: 0.852541921746
ROC: 0.842258725006
G-Mean: 0.636222574117


## PCA

In [None]:
from sklearn.decomposition import PCA


for components in [150,200,250,300]:
    pca = PCA(n_components=components)
    X_train_red = pca.fit_transform(X_train)
    X_test_red = pca.transform(X_test)
    '''
    print('Logistic Regression')
    lr = LogisticRegression(C=1)
    lr.fit(X_train_red, Y_train)
    results(lr, X_test=X_test_red, Y_test=Y_test)
    print()
    print('Random Forest')
    rf_clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=150, random_state=42)
    rf_clf.fit(X_train_red, Y_train)
    results(rf_clf, X_test=X_test_red, Y_test=Y_test)
    print()
    '''

    print('AdaBoost')
    ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, 
                               algorithm="SAMME.R", learning_rate=1)
    ab_clf.fit(X_train_red, Y_train)
    results(ab_clf, X_test=X_test_red, Y_test=Y_test)

In [None]:
pca = PCA(n_components=X.shape[1])
pca.fit(X)
variance_captured = []
for components in range(0,X.shape[1], 5):
    variance_captured.append([components,sum(pca.explained_variance_ratio_[:components])])
variance_captured_np = np.array(variance_captured)

In [None]:
plt.plot(variance_captured_np[:,0], variance_captured_np[:,1])
plt.xlabel('n_components')
plt.ylabel('total variance captured')
plt.show()