In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

In [21]:
# uncomment the group you want to train on
#race = "hispanic"
race = "white"
#race = "mixed"

In [22]:
X = np.load('../Data/' + race + '/X.npy')
Y2 = np.load('../Data/' + race + '/Y2.npy')

In [23]:
# shuffle
np.random.seed(42)
idx = np.random.permutation(len(X))
X = X[idx]
Y = Y2[idx]

#if race != 'hispanic':
#   X = X[:size_hispanic]
#    Y = Y[:size_hispanic]

TEST_SET_SIZE = int(0.1*len(Y))

X_train, X_test = X[:-TEST_SET_SIZE], X[-TEST_SET_SIZE:]
Y_train, Y_test = Y[:-TEST_SET_SIZE].astype(int), Y[-TEST_SET_SIZE:].astype(int)

In [24]:
# Features Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
std_scaler = StandardScaler()
X_train[:,-4:] = std_scaler.fit_transform(X_train[:,-4:])
X_test[:,-4:] = std_scaler.transform(X_test[:,-4:])

mm_scaler = MinMaxScaler(feature_range=(-1,1))
X_train[:,:-4] = mm_scaler.fit_transform(X_train[:,:-4])
X_test[:,:-4] = mm_scaler.transform(X_test[:,:-4])



In [25]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier,X_test=X_test, Y_test=Y_test):
    Y_pred_test = classifier.predict(X_test)
    #Y_pred_test = (Y_pred_test>=threshold).astype(int)
    print("Test accuracy score: " + str(accuracy_score(Y_test.astype(int), Y_pred_test)))
    print("ROC: " + str(roc_auc_score(Y_test, classifier.predict_proba(X_test)[:,1])))
    matrix = confusion_matrix(Y_test.astype(int), Y_pred_test)
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    #print("PPV: " + str(ppv))
    #print("NPV: " + str(npv))
    #print("Sensitivity: " + str(sensitivity))
    #print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    #print("Confusion matrix:\n" + str(matrix))
    
    return str(roc_auc_score(Y_test, classifier.predict_proba(X_test)[:,1]))[:6], str(sensitivity)[:6], str(specificity)[:6], str(g_mean)[:6]

## Autoencoder

In [None]:
from keras.layers import Input, Dense
from keras.models import Model

input_dim = X_train.shape[1]
encoding_dim = 150
hidden_dim1 = 250

input_vec = Input(shape=(input_dim,))
hidden1 = Dense(hidden_dim1, activation='relu')(input_vec)
encoded = Dense(encoding_dim, activation='relu')(hidden1)
hidden2 = Dense(hidden_dim1, activation='relu')(encoded)
decoded = Dense(input_dim, activation='tanh')(hidden2)

noise = np.random.normal(size=X.shape)

autoencoder = Model(input_vec, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

autoencoder.fit(X+0.25*noise,X,epochs=7,batch_size=250)

In [None]:
encoder = Model(input_vec, encoded)
X_train_enc = encoder.predict(X_train)
X_test_enc = encoder.predict(X_test)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

print('Logistic Regression')
lr = LogisticRegression(C=1)
lr.fit(X_train_enc, Y_train)
results(lr, X_test=X_test_enc, Y_test=Y_test)
print()
print('Random Forest')
rf_clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=150, random_state=42)
rf_clf.fit(X_train_enc, Y_train)
results(rf_clf, X_test=X_test_enc, Y_test=Y_test)
print()
print('AdaBoost')
ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, 
                           algorithm="SAMME.R", learning_rate=1)
ab_clf.fit(X_train_enc, Y_train)
results(ab_clf, X_test=X_test_enc, Y_test=Y_test)

Logistic Regression
Test accuracy score: 0.8488155443172745
ROC: 0.8395263533281903
G-Mean: 0.5871243659823799

Random Forest
Test accuracy score: 0.8458876763375033
ROC: 0.8370326972436112
G-Mean: 0.5337918571118088

AdaBoost


KeyboardInterrupt: 

In [26]:
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

input_dim = X_train.shape[1]
for encoding_dim in [50,100,150,200,250,300,350]:
    for hidden_dim in np.arange(encoding_dim, 400, 50):
        
        
        input_vec = Input(shape=(input_dim,))
        hidden1 = Dense(hidden_dim, activation='relu')(input_vec)
        encoded = Dense(encoding_dim, activation='relu')(hidden1)
        hidden2 = Dense(hidden_dim, activation='relu')(encoded)
        decoded = Dense(input_dim, activation='tanh')(hidden2)

        noise = np.random.normal(size=X.shape)

        autoencoder = Model(input_vec, decoded)
        autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
        autoencoder.fit(X,X,epochs=7,batch_size=250)
        
        encoder = Model(input_vec, encoded)
        X_train_enc = encoder.predict(X_train)
        X_test_enc = encoder.predict(X_test)
        
        lr = LogisticRegression(C=1)
        lr.fit(X_train_enc, Y_train)
        lr_o1, _, _, lr_o2 = results(lr, X_test=X_test_enc)
        
        rf_clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=150, random_state=42)
        rf_clf.fit(X_train_enc, Y_train)
        rf_o1, _, _, rf_o2 = results(rf_clf, X_test=X_test_enc)
        
        ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, 
                           algorithm="SAMME.R", learning_rate=1)
        ab_clf.fit(X_train_enc, Y_train)
        ab_o1, _, _, ab_o2 = results(ab_clf, X_test=X_test_enc)
        
        fd = open('autoencoder_white.csv','a')
        output_arr = [encoding_dim, hidden_dim, lr_o1, lr_o2, rf_o1, rf_o2, ab_o1, ab_o2]
        fd.write(','.join(map(str,output_arr))+'\n')
        fd.close()

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test accuracy score: 0.8248167235247151
ROC: 0.8133268822663354
G-Mean: 0.5273482904688744
Test accuracy score: 0.835087464614938
ROC: 0.8347305510838544
G-Mean: 0.6145307828611396
Test accuracy score: 0.830950134281774
ROC: 0.8293635291480972
G-Mean: 0.6022409727563083
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test accuracy score: 0.8383537780358569
ROC: 0.8281236405353085
G-Mean: 0.6209612721661195
Test accuracy score: 0.8414749219714016
ROC: 0.8381055523137774
G-Mean: 0.6304212690329266
Test accuracy score: 0.8393699644334761
ROC: 0.8334842824429041
G-Mean: 0.6320285286902932
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test accuracy score: 0.840495027945126
ROC: 0.8444509069584585
G-Mean: 0.6194150977989875
Test accuracy score: 0.8451404514770995
ROC: 0.8516462193918465
G-Mean: 0.6502796349199156
Test accuracy score: 0.8419830151702112
ROC: 0.847566427056208
G-

Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test accuracy score: 0.8390796254627277
ROC: 0.8465558785718363
G-Mean: 0.6103237108075641
Test accuracy score: 0.8355229730710605
ROC: 0.8433356561517658
G-Mean: 0.5585815440584676
Test accuracy score: 0.8371561297815199
ROC: 0.8455920061378227
G-Mean: 0.6126659203414255
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test accuracy score: 0.844704943020977
ROC: 0.8540076252110197
G-Mean: 0.6485360021632057
Test accuracy score: 0.8448864048776947
ROC: 0.8553021522839831
G-Mean: 0.6454817871137687
Test accuracy score: 0.8454307904478479
ROC: 0.8564068146006658
G-Mean: 0.6673876660658112
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test accuracy score: 0.843035493939174
ROC: 0.8448382874546602
G-Mean: 0.6366472503275741
Test accuracy score: 0.8403498584597517
ROC: 0.8496027670737655
G-Mean: 0.6143404884043285
Test accuracy score: 0.8418741380561806
ROC: 0.8463166233073051
G-Mean: 0.

KeyboardInterrupt: 

## PCA

In [None]:
from sklearn.decomposition import PCA


for components in [150,200,250,300]:
    pca = PCA(n_components=components)
    X_train_red = pca.fit_transform(X_train)
    X_test_red = pca.transform(X_test)
    '''
    print('Logistic Regression')
    lr = LogisticRegression(C=1)
    lr.fit(X_train_red, Y_train)
    results(lr, X_test=X_test_red, Y_test=Y_test)
    print()
    print('Random Forest')
    rf_clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=150, random_state=42)
    rf_clf.fit(X_train_red, Y_train)
    results(rf_clf, X_test=X_test_red, Y_test=Y_test)
    print()
    '''

    print('AdaBoost')
    ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, 
                               algorithm="SAMME.R", learning_rate=1)
    ab_clf.fit(X_train_red, Y_train)
    results(ab_clf, X_test=X_test_red, Y_test=Y_test)

In [None]:
pca = PCA(n_components=X.shape[1])
pca.fit(X)
variance_captured = []
for components in range(0,X.shape[1], 5):
    variance_captured.append([components,sum(pca.explained_variance_ratio_[:components])])
variance_captured_np = np.array(variance_captured)

In [None]:
plt.plot(variance_captured_np[:,0], variance_captured_np[:,1])
plt.xlabel('n_components')
plt.ylabel('total variance captured')
plt.show()