In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
# Cohort to analyse: one of "white", "hispanic" or "mixed".
race = "hispanic"

# Each cohort has its own 2-year CSV under ../Data/<race>/.
csv_paths = {
    "white": "../Data/white/white_2year.csv",
    "hispanic": "../Data/hispanic/hispanic_2year.csv",
    "mixed": "../Data/mixed/mixed_2year.csv",
}
if race not in csv_paths:
    # Fail here with a clear message instead of a NameError on `data` further down.
    raise ValueError("Unknown race %r; expected one of %s" % (race, sorted(csv_paths)))

data = pd.read_csv(csv_paths[race])
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values returns the same NumPy representation and exists in every pandas version.
data_matrix = data.values

In [3]:
# Load the pre-computed feature array stored alongside the selected cohort's CSV.
# NOTE(review): how X.npy was produced is not shown in this notebook; a later cell
# column-stacks its last four columns with features derived from `data`, so its
# rows must be in the same order as the CSV -- confirm.
X_old = np.load('../Data/' + race + '/X.npy')

In [4]:
# Y2 is the survival indicator. A row is labelled False only when BOTH hold:
#   * the value in the second-to-last column is below 24, and
#   * the value in the third-to-last column is 21040 or 21050
#     (SEER codes for colon and rectum cancer).
# Every other row is labelled True.
under_24 = data_matrix[:, -2].astype(int) < 24
colorectal_code = np.isin(data_matrix[:, -3], (21040, 21050))
Y2 = ~(under_24 & colorectal_code)

# Columns 1..22 are the raw features: the first 17 are treated as categorical,
# the trailing 5 as continuous. These slices are views into data_matrix.
X_raw = data_matrix[:, 1:23]
X_cat = X_raw[:, :-5]
X_cont = X_raw[:, -5:]
X_cat.shape

(37575, 17)

In [5]:
# Fill missing values in the three "eval" columns with 9 (= unknown).
# Columns 10, 11 and 12 of X_cat are cstseval, csrgeval and csmteval according
# to the code table printed further down in this notebook.
# pd.isnull is used instead of a per-element np.isnan so that non-float entries
# do not raise TypeError, and the three copy-pasted statements become one loop.
# X_cat is a view, so the assignment updates the underlying matrix in place,
# exactly as the original per-column assignment did.
UNKNOWN_EVAL_CODE = 9
for eval_col in (10, 11, 12):
    missing_mask = pd.isnull(X_cat[:, eval_col])
    X_cat[missing_mask, eval_col] = UNKNOWN_EVAL_CODE

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode every categorical column independently with a fresh LabelEncoder, then
# stack the encoded columns back into a matrix of the same shape as X_cat.
# The fitted encoders themselves are not kept.
all_features = [
    LabelEncoder().fit_transform(X_cat[:, col_idx])
    for col_idx in range(X_cat.shape[1])
]
X_cat_enc = np.column_stack(all_features)

In [7]:
# Continuous features: the last four columns of the pre-computed array X_old,
# cast to int. (X_cont, sliced from the CSV in an earlier cell, is not used here.)
X_con = X_old[:,-4:].astype(int)
# Final design matrix: encoded categorical columns first, then the four
# continuous columns. X_con is int and X_cat_enc holds LabelEncoder codes, so X
# is presumably an integer array -- float values written into it later would be
# truncated.
X = np.column_stack((X_cat_enc, X_con))

In [8]:
# Shuffle rows with a fixed seed, then hold out the last 10% as the test set.
#
# NOTE: this cell rebinds X to its shuffled version while Y is rebuilt from Y2.
# Re-running it without first re-running the cell that builds X would shuffle X
# a second time and misalign the rows of X and Y.
assert len(X) == len(Y2), "features and labels must have the same number of rows"

np.random.seed(42)
idx = np.random.permutation(len(X))
X = X[idx]
Y = Y2[idx]

TEST_SET_SIZE = int(0.1*len(Y))

# Split on an explicit positive index. With negative slicing, a TEST_SET_SIZE of 0
# (fewer than 10 rows) turns X[:-0] into an empty array and X[-0:] into the whole
# data set, i.e. the split is inverted.
split_at = len(Y) - TEST_SET_SIZE

X_train, X_test = X[:split_at], X[split_at:]
Y_train, Y_test = Y[:split_at].astype(int), Y[split_at:].astype(int)

In [None]:
# Features Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# X is built from LabelEncoder codes and an int-cast array, so X_train / X_test
# are integer arrays. Assigning standardized floats into an integer array casts
# them back to int and silently truncates them (e.g. 0.73 -> 0). Work on float
# copies instead; this also stops the assignment from writing through to X,
# of which X_train / X_test are slices.
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

# Standardize only the four continuous columns. The scaler is fitted on the
# training rows and merely applied to the test rows, so no test statistics leak.
std_scaler = StandardScaler()
X_train[:,-4:] = std_scaler.fit_transform(X_train[:,-4:])
X_test[:,-4:] = std_scaler.transform(X_test[:,-4:])

# Optional alternative (disabled): min-max scale the categorical codes.
#mm_scaler = MinMaxScaler(feature_range=(0,1))
#X_train[:,:-4] = mm_scaler.fit_transform(X_train[:,:-4])
#X_test[:,:-4] = mm_scaler.transform(X_test[:,:-4])

In [9]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier,threshold=0.5, X_test=X_test, Y_test=Y_test):
    """Print held-out classification metrics for a fitted binary classifier.

    Parameters
    ----------
    classifier : object
        Anything exposing ``predict(X)`` that returns one score per sample.
    threshold : float, default 0.5
        Scores greater than or equal to this value are labelled 1, others 0.
    X_test, Y_test :
        Evaluation inputs and 0/1 labels. The defaults are bound to the
        notebook-level ``X_test`` / ``Y_test`` objects that exist *when this
        cell is executed*; if those names are rebound afterwards, re-run this
        cell or pass the arrays explicitly.

    Returns
    -------
    None
        Accuracy, ROC AUC, PPV, NPV, sensitivity, specificity, G-mean and the
        confusion matrix are printed. A ratio whose denominator is zero is
        reported as ``nan`` (NumPy emits a RuntimeWarning).
    """
    Y_true = Y_test.astype(int)
    # Predict once and reuse the raw scores for both the thresholded metrics
    # and the ROC AUC (the original ran the model twice).
    Y_score = classifier.predict(X_test)
    Y_pred_test = (Y_score>=threshold).astype(int)
    print("Test accuracy score: " + str(accuracy_score(Y_true, Y_pred_test)))
    print("ROC: " + str(roc_auc_score(Y_test, Y_score)))
    # Pin the label order so the matrix is always 2x2, even when only one class
    # appears in both the labels and the predictions; otherwise the four-way
    # unpacking below fails.
    matrix = confusion_matrix(Y_true, Y_pred_test, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

In [11]:
import pickle

# Load the pickled code table used below to size the embedding layers.
# pickle.load can execute arbitrary code: only open pickle files you created
# yourself or otherwise trust.
# NOTE(review): the file name is fixed to the hispanic cohort regardless of the
# `race` selected at the top of the notebook -- confirm this is intended.
with open('hispanic_codes.pkl', 'rb') as codes_file:
    hispanic_codes = pickle.load(codes_file)

In [32]:
# Number of entries stored under each key of the code table, in the table's
# iteration order. Each key/count pair is printed as it is collected.
var_lengths = []
for key, codes in hispanic_codes.items():
    n_codes = len(codes)
    print(key, n_codes)
    var_lengths.append(n_codes)

(0, 'mar_stat', 0) 7
(1, 'sex', 7) 2
(2, 'primsite', 8) 13
(3, 'histo3v', 21) 78
(4, 'beho3v', 99) 2
(5, 'grade', 100) 5
(6, 'dx_conf', 105) 8
(7, 'csexten', 113) 57
(8, 'cslymphn', 170) 17
(9, 'csmetsdx', 187) 25
(10, 'cstseval', 212) 7
(11, 'csrgeval', 219) 7
(12, 'csmteval', 226) 7
(13, 'surgprif', 233) 33
(14, 'no_surg', 266) 8
(15, 'summ2k', 274) 5
(16, 'reg', 279) 18


In [76]:
from keras.models import Model
from keras.layers import Concatenate, Input, Dense, Activation, Reshape, Dropout
from keras.layers.embeddings import Embedding

class NN_with_EntityEmbedding():
    """Feed-forward binary classifier with one entity embedding per categorical feature.

    The model takes one scalar input per column of ``X``: first the categorical
    columns (one per entry of ``var_lengths``), each passed through its own
    ``Embedding`` layer, then ``n_continuous`` numeric columns passed through
    unchanged. All pieces are concatenated and fed to a 400-unit ReLU layer and
    a single sigmoid output.

    Constructing an instance builds the model AND immediately trains and
    evaluates it (see ``fit``).

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test :
        2-D feature arrays and 0/1 label arrays. Columns must be ordered as
        described above.
    var_lengths : sequence of int
        Number of distinct codes of each categorical column, in column order.
    n_continuous : int, default 4
        Number of trailing numeric columns in ``X``.
    epochs : int, default 5
        Number of training epochs.
    """

    def __init__(self, X_train, Y_train, X_test, Y_test, var_lengths,
                 n_continuous=4, epochs=5):
        self.var_lengths = list(var_lengths)
        self.n_continuous = n_continuous
        self.epochs = epochs
        self.__build_keras_model()
        self.fit(X_train, Y_train, X_test, Y_test)

    def split_features(self,X):
        """Return the columns of 2-D array ``X`` as a list of 1-D arrays (one per model input)."""
        return [X[:, idx] for idx in range(X.shape[1])]

    def __build_keras_model(self):
        """Assemble and compile the Keras model; the result is stored in ``self.model``."""
        input_model = []
        output_embeddings = []

        # Use the lengths handed to the constructor. The original iterated the
        # notebook-global `var_lengths`, silently ignoring the argument.
        for length in self.var_lengths:
            # Embedding width is (number of categories - 1), but never below 1:
            # a zero-width embedding is not a valid layer.
            embedding_dim = max(1, length - 1)
            input_vec = Input(shape=(1,))
            output_vec = Embedding(length, embedding_dim)(input_vec)
            # Embedding yields (batch, 1, embedding_dim); drop the length-1 axis.
            output_vec = Reshape(target_shape=(embedding_dim,))(output_vec)

            input_model.append(input_vec)
            output_embeddings.append(output_vec)

        # Continuous features bypass the embeddings and are concatenated as-is.
        for _ in range(self.n_continuous):
            input_vec = Input(shape=(1,))
            input_model.append(input_vec)
            output_embeddings.append(input_vec)

        output_model = Concatenate()(output_embeddings)
        output_model = Dense(400,kernel_initializer='glorot_normal')(output_model)
        output_model = Activation('relu')(output_model)
        output_model = Dense(1)(output_model)
        output_model = Activation('sigmoid')(output_model)

        self.model = Model(inputs=input_model, outputs=output_model)
        self.model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

    def fit(self, X_train, Y_train, X_test, Y_test):
        """Train for ``self.epochs`` epochs, then print test-set metrics.

        Raises
        ------
        ValueError
            If ``X_train`` or ``X_test`` does not have exactly
            ``len(var_lengths) + n_continuous`` columns.
        """
        expected_cols = len(self.var_lengths) + self.n_continuous
        for name, features in (("X_train", X_train), ("X_test", X_test)):
            if features.shape[1] != expected_cols:
                raise ValueError("%s has %d columns but the model expects %d"
                                 % (name, features.shape[1], expected_cols))

        self.model.fit(self.split_features(X_train), Y_train, epochs=self.epochs)
        scores = self.model.evaluate(self.split_features(X_test), Y_test)
        print("\n%s: %.2f%%" % (self.model.metrics_names[1], scores[1]*100))
        # `results` is the metrics helper defined earlier in this notebook.
        results(self.model, X_test=self.split_features(X_test), Y_test=Y_test)
        

In [79]:
model = NN_with_EntityEmbedding(X_train,Y_train, X_test, Y_test, var_lengths)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

acc: 85.25%
Test accuracy score: 0.852541921746074
ROC: 0.8659267453651815
PPV: 0.8722842043452731
NPV: 0.6609686609686609
Sensitivity: 0.9614886731391585
Specificity: 0.34782608695652173
G-Mean: 0.5782999592175427
Confusion matrix:
[[ 232  435]
 [ 119 2971]]
