In [3]:
import pandas as pd
import numpy as np
from pickle import load
import pickle

### Training Data

In [4]:
train_data = open('train_data.pkl', 'rb')
train_dict = load(train_data)
train_data.close()

# print(train_dict.keys())

In [33]:
# ctr = 0
# for key in train_dict.keys():
#     for hadm in train_dict[key].keys():
#         ctr += 1
        
        
# print(ctr)

### Create list of diseases and the corresponding procedures

In [5]:
disease_list = list()
procedure_list = list()
for key in train_dict.keys():
    for hadm in train_dict[key].keys():
        disease_list.append(' '.join(train_dict[key][hadm][0]))
        procedure_list.append(' '.join(str(e) for e in train_dict[key][hadm][1]))
        

Add start and stop tokens

In [6]:
for i in range(len(disease_list)):
    disease_list[i] = '0 ' + disease_list[i] + ' 1'

In [7]:
for i in range(len(procedure_list)):
    procedure_list[i] = '0 ' + procedure_list[i] + ' 1'

In [8]:
# # trying to print procedure as a string of sequences
# for key in train_dict.keys():
#     for hadm in train_dict[key].keys():
# #         print(' '.join(str(i) for i in train_dict[key][hadm][1]))
#         print(' '.join(train_dict[key][hadm][0]))

test_disease = disease_list[10001]
test_proc = procedure_list[10001]

eval_diseases = disease_list[1000:2000]
eval_proc = procedure_list[1000:2000]

disease_list= disease_list[:1000]
procedure_list = procedure_list[:1000]

# print(disease_list[:100])

In [9]:
test_disease

'0 431 42732 2761 5990 3320 4019 1'

Prepare the padded sequences of disease strings

In [10]:
# prepare the tokeniser for diseases
from keras.preprocessing.text import Tokenizer

disease_tokeniser = Tokenizer()
disease_tokeniser.fit_on_texts(disease_list)




  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [122]:
# print(disease_tokeniser.word_index)

In [11]:
# prepare the tokeniser for procedures

proc_tokeniser = Tokenizer()
proc_tokeniser.fit_on_texts(procedure_list)

In [12]:
dis_len = len(disease_tokeniser.word_index) + 1
proc_len = len(proc_tokeniser.word_index) + 1
print(dis_len)
print(proc_len)

1581
506


Create the sequences on which we are supposed to train

In [13]:
from numpy import array
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

def create_sequences(disease_tokeniser, proc_tokeniser, max_length_diseases, max_length_procedures, diseases, procedures, dis_len, proc_len):
    X1, X2, y = list(), list(), list()
    for i in range(len(diseases)):
        dis_seq = disease_tokeniser.texts_to_sequences([diseases[i]])[0]
        dis_seq = pad_sequences([dis_seq], maxlen=max_length_diseases, padding='post')
        proc_seq = proc_tokeniser.texts_to_sequences([procedures[i]])[0]
        for j in range(len(proc_seq)):
            in_seq, out_seq = proc_seq[:j], proc_seq[j]
            
            in_seq = pad_sequences([in_seq], maxlen = max_length_procedures)
            
            out_seq = to_categorical([out_seq], num_classes=proc_len)[0]
            
            X1.append(dis_seq)
            X2.append(in_seq)
            y.append(out_seq)
            
    return array(X1), array(X2), array(y)

In [14]:
X1train, X2train, ytrain = create_sequences(disease_tokeniser, proc_tokeniser, 50, 50, disease_list, procedure_list, dis_len, proc_len)

In [15]:
train_X1 = X1train.reshape(X1train.shape[0], X1train.shape[2])
train_X2 = X2train.reshape(X2train.shape[0], X2train.shape[2])
# ytrain = ytrain.reshape((-1, 1))
print(ytrain.shape)

(6717, 506)


In [16]:
# for i in range(len(train_X2)):
#     for j in range(len(train_X2[i])):
#         if (j >= 168):
#             print(train_X2[i])
from keras.metrics import top_k_categorical_accuracy

top20 = lambda x, y: top_k_categorical_accuracy(x, y, k=10)

Precision metric

In [17]:
import keras.backend as K

def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

# Define Model

In [18]:
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras import metrics
from keras.metrics import top_k_categorical_accuracy

def top_acc(x, y):
    return top_k_categorical_accuracy(x, y, k=10)

def define_model(max_length, dis_len, proc_len):
    
    inputs1 = Input(shape=(50, ))
    ae1 = Embedding(dis_len, 256, mask_zero=True)(inputs1)
    ae2 = Dropout(0.5)(ae1)
    ae3 = LSTM(256)(ae1)
    
    inputs2 = Input(shape=(50,))
    be1 = Embedding(proc_len, 256, mask_zero=True)(inputs2)
    be2 = Dropout(0.5)(be1)
    be3 = LSTM(256)(be1)
    
    decoder1 = add([ae3, be3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    decoder3 = Dropout(0.5)(decoder2)
    print('decoder 3 shape = ' + str(decoder3.shape))
    outputs = Dense(proc_len, activation='softmax')(decoder2)
    print('outputs shape = ' + str(outputs.shape))
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[top_acc, precision])
    print(model.summary())
    return model

In [19]:
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

Training

In [21]:
model = define_model(50, dis_len, proc_len)

model.fit([train_X1, train_X2], ytrain, epochs=20, verbose=1, callbacks=[checkpoint], validation_split=0.1, shuffle=True)

decoder 3 shape = (?, 256)
outputs shape = (?, 506)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 50, 256)      404736      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 50, 256)      129536      input_4[0][0]                    
_________________________________________________________

KeyboardInterrupt: 

In [33]:
model.save('demo.h5')

In [21]:
from keras.models import load_model

model = load_model('demo.h5', custom_objects={'top_acc' : top_acc, 'precision': precision})

# Generate Output 

In [22]:
from numpy import argmax

def code_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_sequences(disease_tokenizer, proc_tokenizer, max_length_diseases, max_length_proc, diseases, dis_len, proc_len):
    in_seq = '0'
    dis_seq = disease_tokeniser.texts_to_sequences([diseases])[0]
    dis_seq = pad_sequences([dis_seq], maxlen=max_length_diseases, padding='post')
    for i in range(max_length_proc):
        sequence = proc_tokenizer.texts_to_sequences([in_seq])[0]
        
        # pad input 
        sequence = pad_sequences([sequence], maxlen=max_length_proc)
        
        yhat = model.predict([dis_seq, sequence], verbose=1)
        
        yhat = argmax(yhat)
        
        code = code_for_id(yhat, proc_tokenizer)
        
        if code is None:
            break
            
        in_seq += ' ' + code
        
        if code == '1':
            break
            
    return in_seq

In [23]:
# diseases = '1983 431 1623 486 4019 V1582 78321 2559'

code_seq = generate_sequences(disease_tokeniser, proc_tokeniser, 50, 50, test_disease, dis_len, proc_len)



In [24]:
print(code_seq)
print(test_proc)

0 9604 9604 1
0 4432 966 1


# Model Evaluation

In [25]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate_model(disease_tokeniser, proc_tokeniser, eval_diseases, eval_proc):
    actual, predicted = list(), list()
    
    for i in range(len(eval_diseases)):
        yhat = generate_sequences(disease_tokeniser, proc_tokeniser, 50, 50, eval_diseases[i], dis_len, proc_len)
        
        references = [d.split() for d in eval_proc[i]]
        
        actual.append(references)
        predicted.append(yhat.split())
        
        print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
        print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
        print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
        print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [26]:
evaluate_model(disease_tokeniser, proc_tokeniser, eval_diseases, eval_proc)

BLEU-1: 0.500000
BLEU-2: 0.707107
BLEU-3: 0.812252
BLEU-4: 0.840896
BLEU-1: 0.500000
BLEU-2: 0.707107
BLEU-3: 0.812252
BLEU-4: 0.840896


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.600000
BLEU-2: 0.774597
BLEU-3: 0.857917
BLEU-4: 0.880112
BLEU-1: 0.615385
BLEU-2: 0.784465
BLEU-3: 0.864458
BLEU-4: 0.885700
BLEU-1: 0.625000
BLEU-2: 0.790569
BLEU-3: 0.868488
BLEU-4: 0.889140
BLEU-1: 0.600000
BLEU-2: 0.774597
BLEU-3: 0.857917
BLEU-4: 0.880112
BLEU-1: 0.583333
BLEU-2: 0.763763
BLEU-3: 0.850697
BLEU-4: 0.873935
BLEU-1: 0.592593
BLEU-2: 0.769800
BLEU-3: 0.854726
BLEU-4: 0.877383
BLEU-1: 0.580645
BLEU-2: 0.762001
BLEU-3: 0.849519
BLEU-4: 0.872927
BLEU-1: 0.571429
BLEU-2: 0.755929
BLEU-3: 0.845451
BLEU-4: 0.869442
BLEU-1: 0.578947
BLEU-2: 0.760886
BLEU-3: 0.848773
BLEU-4: 0.872288
BLEU-1: 0.571429
BLEU-2: 0.755929
BLEU-3: 0.845451
BLEU-4: 0.869442
BLEU-1: 0.565217
BLEU-2: 0.751809
BLEU-3: 0.842684
BLEU-4: 0.867069
BLEU-1: 0.571429
BLEU-2: 0.755929
BLEU-3: 0.845451
BLEU-4: 0.869442
BLEU-1: 0.566038
BLEU-2: 0.752355
BLEU-3: 0.843051
BLEU-4: 0.867384
BLEU-1: 0.561404
BLEU-2: 0.749269
BLEU-3: 0.840974
BLEU-4: 0.865603
BLEU-1: 0.557377
BLEU-2: 0.746577
BLEU-3: 0.8391

BLEU-1: 0.560510
BLEU-2: 0.748672
BLEU-3: 0.840572
BLEU-4: 0.865258
BLEU-1: 0.562500
BLEU-2: 0.750000
BLEU-3: 0.841466
BLEU-4: 0.866025
BLEU-1: 0.564417
BLEU-2: 0.751277
BLEU-3: 0.842326
BLEU-4: 0.866762
BLEU-1: 0.562874
BLEU-2: 0.750249
BLEU-3: 0.841634
BLEU-4: 0.866169
BLEU-1: 0.561404
BLEU-2: 0.749269
BLEU-3: 0.840974
BLEU-4: 0.865603
BLEU-1: 0.560000
BLEU-2: 0.748331
BLEU-3: 0.840343
BLEU-4: 0.865062
BLEU-1: 0.558659
BLEU-2: 0.747435
BLEU-3: 0.839739
BLEU-4: 0.864543
BLEU-1: 0.557377
BLEU-2: 0.746577
BLEU-3: 0.839160
BLEU-4: 0.864047
BLEU-1: 0.556150
BLEU-2: 0.745754
BLEU-3: 0.838605
BLEU-4: 0.863571
BLEU-1: 0.557895
BLEU-2: 0.746924
BLEU-3: 0.839394
BLEU-4: 0.864247
BLEU-1: 0.559585
BLEU-2: 0.748054
BLEU-3: 0.840156
BLEU-4: 0.864901
BLEU-1: 0.558376
BLEU-2: 0.747245
BLEU-3: 0.839611
BLEU-4: 0.864434
BLEU-1: 0.560000
BLEU-2: 0.748331
BLEU-3: 0.840343
BLEU-4: 0.865062
BLEU-1: 0.558824
BLEU-2: 0.747545
BLEU-3: 0.839813
BLEU-4: 0.864607
BLEU-1: 0.560386
BLEU-2: 0.748590
BLEU-3: 0.8405

BLEU-1: 0.547855
BLEU-2: 0.740172
BLEU-3: 0.834833
BLEU-4: 0.860333
BLEU-1: 0.547231
BLEU-2: 0.739751
BLEU-3: 0.834548
BLEU-4: 0.860088
BLEU-1: 0.546624
BLEU-2: 0.739340
BLEU-3: 0.834270
BLEU-4: 0.859849
BLEU-1: 0.546032
BLEU-2: 0.738940
BLEU-3: 0.833999
BLEU-4: 0.859616
BLEU-1: 0.547170
BLEU-2: 0.739709
BLEU-3: 0.834520
BLEU-4: 0.860064
BLEU-1: 0.546584
BLEU-2: 0.739313
BLEU-3: 0.834252
BLEU-4: 0.859833
BLEU-1: 0.547692
BLEU-2: 0.740062
BLEU-3: 0.834759
BLEU-4: 0.860269
BLEU-1: 0.548780
BLEU-2: 0.740797
BLEU-3: 0.835256
BLEU-4: 0.860696
BLEU-1: 0.549849
BLEU-2: 0.741518
BLEU-3: 0.835744
BLEU-4: 0.861114
BLEU-1: 0.550898
BLEU-2: 0.742225
BLEU-3: 0.836222
BLEU-4: 0.861525
BLEU-1: 0.550296
BLEU-2: 0.741819
BLEU-3: 0.835947
BLEU-4: 0.861289
BLEU-1: 0.551320
BLEU-2: 0.742509
BLEU-3: 0.836414


KeyboardInterrupt: 