In [7]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, FastText
from mittens import GloVe

# import glove
# from glove import Corpus

import collections
import gc 

import keras
from keras import backend as K
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Flatten, Dense, Dropout, Input, concatenate, merge, Activation, Concatenate, LSTM, GRU
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Conv1D, BatchNormalization, GRU, Convolution1D, LSTM
from keras.layers import UpSampling1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D,MaxPool1D, merge

from keras.optimizers import Adam


from keras.utils import np_utils
from keras.backend.tensorflow_backend import set_session, clear_session, get_session
import tensorflow as tf

from keras.callbacks import EarlyStopping, ModelCheckpoint, History, ReduceLROnPlateau


from sklearn.utils import class_weight
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')


from keras.models import load_model
from keras.utils import CustomObjectScope
from keras.initializers import glorot_uniform

In [8]:
## Load Data ##

type_of_ner = "new"


def _read_split(suffix):
    """Unpickle the file under data/ named <type_of_ner><suffix>."""
    return pd.read_pickle("data/"+type_of_ner+suffix)


# Model inputs stored under the *_x_* files, one object per split.
x_train_lstm = _read_split("_x_train.pkl")
x_dev_lstm = _read_split("_x_dev.pkl")
x_test_lstm = _read_split("_x_test.pkl")

# Targets for the same three splits.
y_train = _read_split("_y_train.pkl")
y_dev = _read_split("_y_dev.pkl")
y_test = _read_split("_y_test.pkl")

# One NER embedding lookup per embedding flavour (word2vec, fasttext, combined).
ner_word2vec = _read_split("_ner_word2vec_limited_dict.pkl")
ner_fasttext = _read_split("_ner_fasttext_limited_dict.pkl")
ner_concat = _read_split("_ner_combined_limited_dict.pkl")

# Identifier collections for each split.
train_ids = _read_split("_train_ids.pkl")
dev_ids = _read_split("_dev_ids.pkl")
test_ids = _read_split("_test_ids.pkl")

In [9]:
# Saved model file, pickled reference metrics and target problem for each
# GRU-256 / LSTM-256 run; one record per line, GRU entries first.
timeseries_model_results = [
    {"modelName": "GRU-256-los_3-best_model.hdf5", "results": "results/256-GRU-los_3-10-new.p", "problem": "los_3"},
    {"modelName": "GRU-256-los_7-best_model.hdf5", "results": "results/256-GRU-los_7-10-new.p", "problem": "los_7"},
    {"modelName": "GRU-256-mort_hosp-best_model.hdf5", "results": "results/256-GRU-mort_hosp-10-new.p", "problem": "mort_hosp"},
    {"modelName": "GRU-256-mort_icu-best_model.hdf5", "results": "results/256-GRU-mort_icu-10-new.p", "problem": "mort_icu"},
    {"modelName": "LSTM-256-los_3-best_model.hdf5", "results": "results/256-LSTM-los_3-10-new.p", "problem": "los_3"},
    {"modelName": "LSTM-256-los_7-best_model.hdf5", "results": "results/256-LSTM-los_7-10-new.p", "problem": "los_7"},
    {"modelName": "LSTM-256-mort_hosp-best_model.hdf5", "results": "results/256-LSTM-mort_hosp-10-new.p", "problem": "mort_hosp"},
    {"modelName": "LSTM-256-mort_icu-best_model.hdf5", "results": "results/256-LSTM-mort_icu-10-new.p", "problem": "mort_icu"},
]

In [10]:
def create_dataset(dict_of_ner):
    """Average the embeddings stored under each key into one vector per key.

    Entries are visited in ascending key order. The embeddings found under a
    key are averaged along axis 0, and the per-key averages are collected
    into one array, so row i corresponds to the i-th smallest key.
    """
    ordered_entries = sorted(dict_of_ner.items())
    averaged = [
        np.mean(list(embeddings), axis=0)
        for _, embeddings in ordered_entries
    ]
    return np.asarray(averaged)
    
def make_prediction_multi_avg(model, test_data):
    """Run `model.predict` on `test_data` and threshold the scores at 0.5.

    Returns a pair: the raw output of `model.predict`, and a list holding 1
    for every score that is >= 0.5 and 0 for every other score.
    """
    probs = model.predict(test_data)
    y_pred = [int(bool(score >= 0.5)) for score in probs]
    return probs, y_pred

def printComparison(modelName, resultsFile, model_output):
    """Print whether freshly computed metrics match the pickled ones.

    Loads the object stored at `resultsFile` with `pd.read_pickle`, then
    prints a separator line, one line with `modelName` followed by the
    result of an exact `==` comparison for each of the keys 'auc', 'auprc',
    'acc' and 'F1' between `model_output` and the loaded object, and finally
    the loaded object itself.

    The comparison is exact equality, so any numerical difference between
    the two values is reported as False.
    """
    stats = pd.read_pickle(resultsFile)
    print("--------------------------")
    print(f"modelName: {modelName}, auc: {model_output['auc']==stats['auc']}, auprc: {model_output['auprc']==stats['auprc']}, acc: {model_output['acc']==stats['acc']} F1: {model_output['F1']==stats['F1']}")
    print(stats)

def get_model_output(problem, probs, predictions):
    """Score predictions against the `problem` column of the global `y_test`.

    `probs` is passed to the ranking metrics (ROC AUC and average
    precision); `predictions` is passed to accuracy and F1.

    Returns a dict with the keys 'auc', 'auprc', 'acc' and 'F1'.
    """
    labels = y_test[problem].values
    return {
        'auc': roc_auc_score(labels, probs),
        'auprc': average_precision_score(labels, probs),
        'acc': accuracy_score(labels, predictions),
        'F1': f1_score(labels, predictions),
    }
    

In [11]:
# Re-evaluate every saved "avg-<embedding>-<problem>" model on the test split
# and compare the recomputed metrics with the pickled results of the original run.
embedding_types = ['word2vec', 'fasttext', 'concat']

# Same order as embedding_types; the two lists are zipped together below.
embedding_dict = [ner_word2vec, ner_fasttext, ner_concat]

target_problems = ['mort_hosp', 'mort_icu', 'los_3', 'los_7']


unit_sizes = [256]

layers = ["GRU"]
for each_layer in layers:
    for each_unit_size in unit_sizes:
        for embed_dict, embed_name in zip(embedding_dict, embedding_types):
            print("=============================")
            print ("Embedding: ", embed_name)

            # Restrict the embedding lookup to the ids of each split.
            temp_train_ner = {k: embed_dict[k] for k in train_ids}
            temp_dev_ner = {k: embed_dict[k] for k in dev_ids}
            temp_test_ner = {k: embed_dict[k] for k in test_ids}

            # One averaged embedding vector per id, rows in sorted-id order.
            x_train_ner = create_dataset(temp_train_ner)
            x_dev_ner = create_dataset(temp_dev_ner)
            x_test_ner = create_dataset(temp_test_ner)
            for each_problem in target_problems:
                # NOTE(review): the model file name does not include the layer
                # type or unit size, while the results file name below does.
                # With more than one entry in `layers`/`unit_sizes` the same
                # model would be compared against different result files.
                best_model_name = "avg-"+str(embed_name)+"-"+str(each_problem)+"-"+"best_model.hdf5"

                # Release the backend graph/session of the previously loaded
                # model before loading the next one; otherwise every one of
                # the loaded models stays resident and the graph keeps
                # growing. Clearing *before* the load (not after predict)
                # keeps the last `model` usable once the loop has finished.
                K.clear_session()
                model = load_model(best_model_name, custom_objects={'_initializer': glorot_uniform()})
                probs, predictions = make_prediction_multi_avg(model, [x_test_lstm, x_test_ner])
                model_output = get_model_output(each_problem, probs, predictions)
                #results/GRU-256-concat-los_3-1-new-avg-.p
                results_file = "results/" + each_layer + "-" + str(each_unit_size) + "-" + embed_name + "-" + each_problem +  "-1-new-avg-.p"
                printComparison(best_model_name, results_file, model_output)

Embedding:  word2vec
--------------------------
modelName: avg-word2vec-mort_hosp-best_model.hdf5, auc: True, auprc: True, acc: True F1: True
{'auc': 0.8753006872852235, 'auprc': 0.5798278009766124, 'acc': 0.9162833486660533, 'F1': 0.47246376811594204}
--------------------------
modelName: avg-word2vec-mort_icu-best_model.hdf5, auc: True, auprc: True, acc: True F1: True
{'auc': 0.8878037361299685, 'auprc': 0.5265239122471965, 'acc': 0.9429622815087396, 'F1': 0.456140350877193}
--------------------------
modelName: avg-word2vec-los_3-best_model.hdf5, auc: True, auprc: True, acc: True F1: True
{'auc': 0.7013270793679567, 'auprc': 0.6409345090590508, 'acc': 0.6688132474701012, 'F1': 0.54858934169279}
--------------------------
modelName: avg-word2vec-los_7-best_model.hdf5, auc: True, auprc: True, acc: True F1: True
{'auc': 0.7272184819076354, 'auprc': 0.2201873280088422, 'acc': 0.9176632934682613, 'F1': 0.01648351648351648}
Embedding:  fasttext
--------------------------
modelName: avg-fa