In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
# Cohort whose data is loaded and trained on.
# To switch cohorts, replace the value below with "white" or "mixed".
race = "hispanic"

In [3]:
# Read the feature matrix and the label array saved for the selected cohort.
X = np.load('../Data/{}/X.npy'.format(race))
Y2 = np.load('../Data/{}/Y2.npy'.format(race))

In [4]:
# Shuffle features and labels with one shared, seeded permutation.
# NOTE: X is rebound to its shuffled copy while Y is taken from the untouched
# Y2. Re-running this cell without re-running the load cell therefore permutes
# X a second time and misaligns it with Y -- reload the data first.
np.random.seed(42)
idx = np.random.permutation(len(X))
X = X[idx]
Y = Y2[idx]

# Hold out the last 10% of the shuffled rows as the test set.
TEST_SET_SIZE = int(0.1*len(Y))

# Split on an explicit index rather than on -TEST_SET_SIZE: when the size is 0
# (fewer than 10 rows) X[:-0] is empty and X[-0:] is the whole array, which
# would silently swap the roles of the train and test sets.
split_at = len(Y) - TEST_SET_SIZE

X_train, X_test = X[:split_at], X[split_at:]
# Labels are cast to int for the metric functions used later.
Y_train, Y_test = Y[:split_at].astype(int), Y[split_at:].astype(int)

In [5]:
# Features Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardise only the last four columns. The scaler is fitted on the training
# rows and then applied unchanged to the test rows.
# NOTE(review): the scaled values are written back in place, so X_train/X_test
# need a float dtype or the values are truncated on assignment -- confirm the
# dtype stored in X.npy.
std_scaler = StandardScaler().fit(X_train[:,-4:])
X_train[:,-4:] = std_scaler.transform(X_train[:,-4:])
X_test[:,-4:] = std_scaler.transform(X_test[:,-4:])

# Disabled alternative: min-max scale the remaining columns to [0, 1].
#mm_scaler = MinMaxScaler(feature_range=(0,1))
#X_train[:,:-4] = mm_scaler.fit_transform(X_train[:,:-4])
#X_test[:,:-4] = mm_scaler.transform(X_test[:,:-4])



In [7]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier, threshold=0.5, X_test=X_test, Y_test=Y_test):
    """Print binary-classification metrics for ``classifier`` on a test set.

    The classifier's scores are computed once, thresholded into 0/1 labels and
    compared with ``Y_test``. Accuracy, ROC AUC, PPV, NPV, sensitivity,
    specificity, G-mean and the confusion matrix are printed; nothing is
    returned.

    Parameters
    ----------
    classifier : object
        Anything with a ``predict(X)`` method whose output can be compared
        with ``threshold`` and passed to ``roc_auc_score`` as scores.
    threshold : float, default 0.5
        Scores greater than or equal to this value are labelled 1.
    X_test, Y_test : array-like
        Evaluation features and labels. The defaults are the module-level
        ``X_test`` / ``Y_test`` as they were when this function was defined;
        pass them explicitly if those arrays are rebuilt later.

    Notes
    -----
    The confusion matrix is unpacked as ``tn, fp, fn, tp``, so the labels must
    form a binary (2x2) problem. PPV or NPV is ``nan`` when the corresponding
    predicted class is empty.
    """
    # Run inference a single time and reuse the scores for both the
    # thresholded labels and the ROC AUC (the second predict() was redundant).
    Y_score = classifier.predict(X_test)
    Y_pred_test = (Y_score >= threshold).astype(int)
    Y_true = Y_test.astype(int)

    print("Test accuracy score: " + str(accuracy_score(Y_true, Y_pred_test)))
    # ROC AUC is computed on the raw scores, not on the thresholded labels.
    print("ROC: " + str(roc_auc_score(Y_test, Y_score)))

    matrix = confusion_matrix(Y_true, Y_pred_test)
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    # Geometric mean of sensitivity and specificity.
    g_mean = np.sqrt(sensitivity*specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

In [9]:
import pickle
# Load the code tables that the next cell iterates over, one entry per variable.
# NOTE(review): the file name is fixed and does not follow `race` -- confirm
# this is intended when training on another cohort.
# pickle.load can execute arbitrary code; only open files from a trusted source.
with open('hispanic_codes.pkl', 'rb') as codes_file:
    hispanic_codes = pickle.load(codes_file)

In [40]:
# Report how many codes each variable has and collect the counts in key order.
var_lengths = []
for key in hispanic_codes:
    n_codes = len(hispanic_codes[key])
    print(key, n_codes)
    var_lengths.append(n_codes)

(0, 'mar_stat', 0) 7
(1, 'sex', 7) 2
(2, 'primsite', 8) 13
(3, 'histo3v', 21) 78
(4, 'beho3v', 99) 2
(5, 'grade', 100) 5
(6, 'dx_conf', 105) 8
(7, 'csexten', 113) 57
(8, 'cslymphn', 170) 17
(9, 'csmetsdx', 187) 25
(10, 'cstseval', 212) 7
(11, 'csrgeval', 219) 7
(12, 'csmteval', 226) 7
(13, 'surgprif', 233) 33
(14, 'no_surg', 266) 8
(15, 'summ2k', 274) 5
(16, 'reg', 279) 18


In [46]:
# Build the input widths straight from hispanic_codes instead of editing the
# existing var_lengths list: the old extend() appended four more 1s on every
# re-run of this cell, so the list kept growing.
code_counts = [len(hispanic_codes[key]) for key in hispanic_codes]

# Variables with two codes or fewer take a width of 1; the others keep their
# code count. Four extra width-1 entries are added at the end.
# NOTE(review): presumably these are the four columns scaled as X[:, -4:] -- confirm.
var_lengths = [count if count > 2 else 1 for count in code_counts] + [1, 1, 1, 1]

[7, 1, 13, 78, 1, 5, 8, 57, 17, 25, 7, 7, 7, 33, 8, 5, 18, 1, 1, 1, 1]

In [36]:
from keras.models import Model
from keras.layers import Concatenate, Input, Dense, Activation, Reshape
from keras.layers.embeddings import Embedding

class NN_with_EntityEmbedding(Model):
    
    def __init__(self, X_train, Y_train, X_test, Y_test, var_lengths):
        super().__init__()
        self.var_lengths = var_lengths
        self.epochs=10
        self.__build_keras_model()
        self.fit(X_train, Y_train, X_test, Y_test)
    
    def __build_keras_model(self):
        
        input_model = []
        output_embeddings = []
        
        for length in var_lengths:
            input_vec = Input(shape=(length,))
            output_vec = Embedding