In [13]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, FastText
from mittens import GloVe

# import glove
# from glove import Corpus

import collections
import gc 

import keras
from keras import backend as K
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Flatten, Dense, Dropout, Input, concatenate, merge, Activation, Concatenate, LSTM, GRU
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Conv1D, BatchNormalization, GRU, Convolution1D, LSTM
from keras.layers import UpSampling1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D,MaxPool1D, merge

from keras.optimizers import Adam


from keras.utils import np_utils
from keras.backend.tensorflow_backend import set_session, clear_session, get_session
import tensorflow as tf

from keras.callbacks import EarlyStopping, ModelCheckpoint, History, ReduceLROnPlateau


from sklearn.utils import class_weight
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')


from keras.models import load_model
from keras.utils import CustomObjectScope
from keras.initializers import glorot_uniform

In [14]:
## Load Data ##
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so these
# files must come from a trusted source.

type_of_ner = "new"

# Per-split model inputs. The "_lstm" suffix suggests sequence features, but
# the pickled contents are not visible from this cell -- TODO confirm.
x_train_lstm = pd.read_pickle(f"data/{type_of_ner}_x_train.pkl")
x_dev_lstm = pd.read_pickle(f"data/{type_of_ner}_x_dev.pkl")
x_test_lstm = pd.read_pickle(f"data/{type_of_ner}_x_test.pkl")

# Per-split targets; later indexed by problem name (e.g. y_test[problem]).
y_train = pd.read_pickle(f"data/{type_of_ner}_y_train.pkl")
y_dev = pd.read_pickle(f"data/{type_of_ner}_y_dev.pkl")
y_test = pd.read_pickle(f"data/{type_of_ner}_y_test.pkl")


# Embedding dictionaries, one per embedding flavour; they are indexed by the
# ids loaded below when the evaluation cell builds its inputs.
ner_word2vec = pd.read_pickle(f"data/{type_of_ner}_ner_word2vec_limited_dict.pkl")
ner_fasttext = pd.read_pickle(f"data/{type_of_ner}_ner_fasttext_limited_dict.pkl")
ner_concat = pd.read_pickle(f"data/{type_of_ner}_ner_combined_limited_dict.pkl")

# Identifiers belonging to each split.
train_ids = pd.read_pickle(f"data/{type_of_ner}_train_ids.pkl")
dev_ids = pd.read_pickle(f"data/{type_of_ner}_dev_ids.pkl")
test_ids = pd.read_pickle(f"data/{type_of_ner}_test_ids.pkl")

In [15]:
def create_dataset(dict_of_ner):
    """Collapse each entry's embeddings into a single mean vector.

    Entries are visited in sorted order of ``dict_of_ner.items()`` and the
    embeddings of each entry are averaged element-wise (``axis=0``).

    Args:
        dict_of_ner: mapping whose values are iterables of embedding vectors.

    Returns:
        ``np.ndarray`` holding one averaged vector per entry, in sorted-key
        order.
    """
    averaged = [
        np.mean(list(embeddings), axis=0)
        for _, embeddings in sorted(dict_of_ner.items())
    ]
    return np.asarray(averaged)
    
def make_prediction_cnn(model, test_data, threshold=0.5):
    """Run ``model`` on ``test_data`` and binarise the predicted scores.

    Args:
        model: object exposing ``predict(test_data)``.
        test_data: whatever ``model.predict`` accepts; it is forwarded as-is.
        threshold: decision cut-off; a score ``>= threshold`` is labelled 1,
            anything below is labelled 0. Defaults to 0.5, the value that was
            previously hard-coded.

    Returns:
        Tuple ``(probs, y_pred)`` where ``probs`` is the raw output of
        ``model.predict`` and ``y_pred`` is a list of 0/1 ints, one per
        element of ``probs``.
    """
    probs = model.predict(test_data)
    # Each element is used in a boolean context, so every row of ``probs``
    # must hold exactly one score (scalar or length-1 array).
    y_pred = [1 if score >= threshold else 0 for score in probs]
    return probs, y_pred

def printComparison(modelName, resultsFile, model_output):
    """Print whether freshly computed metrics equal the ones stored on disk.

    The stored metrics are read from ``resultsFile`` with ``pd.read_pickle``
    and compared to ``model_output`` with exact ``==`` on the keys ``'auc'``,
    ``'auprc'``, ``'acc'`` and ``'F1'``. Three lines are printed: a separator,
    the per-metric comparison results, and the stored metrics themselves.

    Args:
        modelName: label shown in the comparison line.
        resultsFile: path of the pickled metrics to compare against.
        model_output: mapping with the four metric keys listed above.
    """
    stats = pd.read_pickle(resultsFile)
    print("--------------------------")
    summary = f"modelName: {modelName}, auc: {model_output['auc']==stats['auc']}, auprc: {model_output['auprc']==stats['auprc']}, acc: {model_output['acc']==stats['acc']} F1: {model_output['F1']==stats['F1']}"
    print(summary)
    print(stats)

def get_model_output(problem, probs, predictions):
    """Score test-set predictions for one target problem.

    The ground truth is taken from the module-level ``y_test`` as
    ``y_test[problem].values``.

    Args:
        problem: key selecting the target in ``y_test``.
        probs: predicted scores, used for the ranking metrics.
        predictions: hard labels, used for accuracy and F1.

    Returns:
        Dict with keys ``'auc'`` (``roc_auc_score``), ``'auprc'``
        (``average_precision_score``), ``'acc'`` (``accuracy_score``) and
        ``'F1'`` (``f1_score``).
    """
    ground_truth = y_test[problem].values
    return {
        'auc': roc_auc_score(ground_truth, probs),
        'auprc': average_precision_score(ground_truth, probs),
        'acc': accuracy_score(ground_truth, predictions),
        'F1': f1_score(ground_truth, predictions),
    }

    
def get_subvector_data(size, embed_name, data):
    """Force every entry of ``data`` to exactly ``size`` embedding vectors.

    Entries with more than ``size`` vectors keep only the first ``size``;
    shorter entries are padded at the end with zero vectors.

    Args:
        size: number of vectors each entry should end up with.
        embed_name: embedding flavour; ``"concat"`` pads with zero vectors of
            length 200, any other value with length 100.
        data: mapping from key to a sequence of embedding vectors.

    Returns:
        Dict with the same keys as ``data``; each value is the
        truncated/padded sequence converted with ``np.asarray``.
    """
    vector_size = 200 if embed_name == "concat" else 100

    x_data = {}
    for key, embeddings in data.items():
        missing = size - len(embeddings)
        kept = list(embeddings)[:size]
        if missing > 0:
            # Too few vectors: fill the tail with all-zero embeddings.
            kept.extend(np.zeros(vector_size) for _ in range(missing))
        x_data[key] = np.asarray(kept)

    return x_data

In [16]:
embedding_types = ['word2vec', 'fasttext', 'concat']

embedding_dict = [ner_word2vec, ner_fasttext, ner_concat]

target_problems = ['mort_hosp', 'mort_icu', 'los_3', 'los_7']

sequence_model = "GRU"
sequence_hidden_unit = 256
ner_representation_limit = 64

# For every embedding flavour and target problem: load the saved checkpoint,
# score it on the test split and compare the metrics with the stored results.
for embed_dict, embed_name in zip(embedding_dict, embedding_types):
    print("=============================")
    print ("Embedding: ", embed_name)

    # Restrict the embedding dictionary to the ids of each split.
    temp_train_ner = {k: embed_dict[k] for k in train_ids}
    temp_dev_ner = {k: embed_dict[k] for k in dev_ids}
    temp_test_ner = {k: embed_dict[k] for k in test_ids}

    # Pad/truncate every entry to ner_representation_limit vectors.
    x_train_dict = get_subvector_data(ner_representation_limit, embed_name, temp_train_ner)
    x_dev_dict = get_subvector_data(ner_representation_limit, embed_name, temp_dev_ner)
    x_test_dict = get_subvector_data(ner_representation_limit, embed_name, temp_test_ner)

    # Order entries by key so the stacked arrays have a deterministic row order.
    x_train_dict_sorted = collections.OrderedDict((k, x_train_dict[k]) for k in sorted(x_train_dict))
    x_dev_dict_sorted = collections.OrderedDict((k, x_dev_dict[k]) for k in sorted(x_dev_dict))
    x_test_dict_sorted = collections.OrderedDict((k, x_test_dict[k]) for k in sorted(x_test_dict))

    x_train_ner = np.array([*x_train_dict_sorted.values()])
    x_dev_ner = np.array([*x_dev_dict_sorted.values()])
    x_test_ner = np.array([*x_test_dict_sorted.values()])

    for each_problem in target_problems:
        print ("Problem type: ", each_problem)
        print ("__________________")
        best_model_name = f"{ner_representation_limit}-basiccnn1d-{embed_name}-{each_problem}-best_model.hdf5"
        model = load_model(best_model_name, custom_objects={'_initializer': glorot_uniform()})
        # NOTE(review): assumes x_test_lstm and y_test rows follow the same
        # sorted-id order as x_test_ner -- TODO confirm against the data prep.
        probs, predictions = make_prediction_cnn(model, [x_test_lstm, x_test_ner])
        model_output = get_model_output(each_problem, probs, predictions)
        # Example path: results/cnn/GRU-256-concat-los_7-10-new-cnn-.p
        results_file = f"results/cnn/{sequence_model}-{sequence_hidden_unit}-{embed_name}-{each_problem}-10-new-cnn-.p"
        printComparison(best_model_name, results_file, model_output)

Embedding:  word2vec
Problem type:  mort_hosp
__________________
--------------------------
modelName: 64-basiccnn1d-word2vec-mort_hosp-best_model.hdf5, auc: True, auprc: True, acc: True F1: True
{'auc': 0.8756575469204335, 'auprc': 0.5797578721037462, 'acc': 0.9153633854645814, 'F1': 0.4945054945054946}
Problem type:  mort_icu
__________________
--------------------------
modelName: 64-basiccnn1d-word2vec-mort_icu-best_model.hdf5, auc: True, auprc: True, acc: True F1: True
{'auc': 0.8827769714562167, 'auprc': 0.5091155811671492, 'acc': 0.9413523459061638, 'F1': 0.43207126948775054}
Problem type:  los_3
__________________
--------------------------
modelName: 64-basiccnn1d-word2vec-los_3-best_model.hdf5, auc: True, auprc: True, acc: True F1: True
{'auc': 0.702856581769842, 'auprc': 0.646645720301383, 'acc': 0.6646734130634775, 'F1': 0.5560292326431181}
Problem type:  los_7
__________________
--------------------------
modelName: 64-basiccnn1d-word2vec-los_7-best_model.hdf5, auc: True, 