In [1]:
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec

### Version 1: Fixed input embeddings from Word2Vec

In [2]:
control_data = pd.read_csv("./cleaned_data/control_w_time.csv", index_col = 0)
case_data = pd.read_csv("./cleaned_data/case_w_time.csv", index_col = 0)
all_events = pd.read_csv("./cleaned_data/events_id_w_time.csv", index_col = 0)
word2vec_model = Word2Vec.load("./word2vec_model/w2vmodel_wt")

In [3]:
#construct an input list of arrays with embeddings for each patient
control_temp = control_data.groupby("SUBJECT_ID").apply(lambda x: x.EVE_INDEX.values)
case_temp = case_data.groupby("SUBJECT_ID").apply(lambda x: x.EVE_INDEX.values)

In [4]:
control_patients = control_temp.index.values
case_patients = case_temp.index.values
all_patients = np.concatenate([control_patients,case_patients])

In [5]:
#construct labels
Y_control = np.zeros(len(control_patients))
Y_case = np.ones(len(case_patients))
Y = np.concatenate([Y_control,Y_case])

In [6]:
#find maximum number of events
c_max = control_data.groupby("SUBJECT_ID")["EVE_INDEX"].count().max()
ca_max = case_data.groupby("SUBJECT_ID")["EVE_INDEX"].count().max()
max_num_event_patient = np.max([c_max,ca_max])

In [7]:
#contruct training set of sequences with paddings, so all the sequence has the same length of max_length, 
#with 0s padded before in shorter sequences
from keras.preprocessing.sequence import pad_sequences

X_control = [np.array(events).astype("int") for events in control_temp]
X_case = [np.array(events).astype("int") for events in case_temp]
X_all = np.concatenate([X_control,X_case])

X = pad_sequences(X_all, maxlen=None)

Using TensorFlow backend.


In [8]:
print X[0].shape[0]
print X.shape
print Y.shape

495
(1953, 495)
(1953,)


In [17]:
#shuffle
np.random.seed(seed=6)
shuffled_index = np.random.permutation(len(all_patients))

#split train, dev, test set 7:1:2
train_index = shuffled_index[:int(len(all_patients)*0.7)]
dev_index =  shuffled_index[int(len(all_patients)*0.7):int(len(all_patients)*0.8)]
test_index =  shuffled_index[int(len(all_patients)*0.8):]

In [18]:
print shuffled_index.shape
print train_index.shape, dev_index.shape, test_index.shape

(1953,)
(1367,) (195,) (391,)


In [19]:
all_patients_shuffle = all_patients[shuffled_index]

Y_train = Y[train_index]
Y_dev = Y[dev_index]
Y_test = Y[test_index]

X_train = X[train_index]
X_dev = X[dev_index]
X_test = X[test_index]

In [20]:
#contruct embedding matrix dim of (number of different events, dim of embedding)
num_events = len(all_events)
dim_embedding = len(word2vec_model.wv["1"])
#adding the dummy row for padding at index "0"
embedding_matrix = np.zeros((num_events+1, dim_embedding))
for i in range(1, num_events+1):
    embedding_matrix[i] = word2vec_model.wv[str(i)]

In [21]:
print embedding_matrix.shape

(4909, 100)


In [32]:
#set up the model
from keras.layers import Conv1D, Dense, Input,GlobalMaxPooling1D, concatenate, Embedding,Masking
from keras.optimizers import SGD, Adam, rmsprop
from keras.models import Model

input_events = Input(shape=(max_num_event_patient,))

embedding = Embedding(num_events+1, dim_embedding, input_length=max_num_event_patient,trainable=False)(input_events)

x_2 = Conv1D(filters = 3, kernel_size = 2,padding = "valid", activation='relu')(embedding)
x_3 = Conv1D(filters = 3, kernel_size = 3,padding = "valid", activation='relu')(embedding)
x_4 = Conv1D(filters = 3, kernel_size = 4,padding = "valid", activation='relu')(embedding)
x_5 = Conv1D(filters = 3, kernel_size = 5,padding = "valid", activation='relu')(embedding)

pool_2 = GlobalMaxPooling1D()(x_2)
pool_3 = GlobalMaxPooling1D()(x_3)
pool_4 = GlobalMaxPooling1D()(x_4)
pool_5 = GlobalMaxPooling1D()(x_5)

patient_embed = concatenate([pool_2, pool_3,pool_4,pool_5])

dense1 = Dense(4, activation = "relu")(patient_embed)
output = Dense(1, activation = "sigmoid" )(dense1)

model = Model(inputs=input_events, outputs=output)
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [33]:
model.fit(X_train,Y_train, epochs=100,batch_size =64,shuffle=True,validation_data=(X_dev, Y_dev))

Train on 1367 samples, validate on 195 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100

KeyboardInterrupt: 

In [34]:
model.evaluate(X_test, Y_test)



[0.28035629748383445, 0.94629156010230175]