# Import Data
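- `pos_cls` is the medical data (positive class), read from column 4 of the ICD-10 CSV
- `neg_cls` is the non-medical data (negative class), split into sentence-like chunks from `big.txt`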

In [1]:
# Locations of the two raw corpora. DATA_DIR must keep its trailing slash
# because the file paths below are built by plain string concatenation.
DATA_DIR = './data/'
MEDICAL_CSV = ''.join([DATA_DIR, 'icd_10_2017.csv'])  # read into pos_cls
NON_MEDICAL = ''.join([DATA_DIR, 'big.txt'])          # read into neg_cls

In [2]:
import pandas as pd

In [3]:
# The CSV is read without a header row, so columns are addressed by position.
# Only columns 3 and 4 are parsed; column 4 supplies the positive-class texts.
df_icd = pd.read_csv(MEDICAL_CSV, usecols=[3, 4], header=None)
pos_cls = df_icd.loc[:, 4].tolist()
print("%d lines in pos_cls data." % (len(pos_cls),))

93830 lines in pos_cls data.


In [4]:
# Strip every line, join the whole file into one space-separated string, then
# cut it at each "." and strip the pieces again. Empty pieces are kept.
with open(NON_MEDICAL, encoding="utf-8") as file:
    neg_cls = list(map(str.strip, " ".join(map(str.strip, file)).split(".")))
print("%d lines in neg_cls data." % (len(neg_cls),))

58670 lines in neg_cls data.


# Preprocessing

Preprocessing code: <https://github.com/shams-sam/logic-lab/blob/master/TextPreprocessing/__preprocessing.py>

- Uses the standard `text_preprocessing` function from the file linked above.

In [5]:
from functools import partial
from preprocessing import text_preprocessing
# Bind the hyphen-handling option once so that `pre` only needs the text.
pre = partial(text_preprocessing, HYPHEN_HANDLE=2)

In [6]:
# Run every text of both classes through the preprocessing callable.
# NOTE: results are written back to the same names, so re-running this cell
# feeds already-preprocessed text through `pre` a second time.
pos_cls = list(map(pre, pos_cls))
neg_cls = list(map(pre, neg_cls))
print("%d lines in pos_cls data." % (len(pos_cls),))
print("%d lines in neg_cls data." % (len(neg_cls),))

93830 lines in pos_cls data.
58670 lines in neg_cls data.


# Data Generation

In [9]:
from keras.preprocessing.text import Tokenizer
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.utils import class_weight
from keras.utils import to_categorical

In [10]:
# Window length, in tokens, of each training example (2 = bigrams). It also
# fixes the amount of padding added to sequences and the model's input length.
NGRAM = 2

In [11]:
# Fit a single tokenizer on both classes together so that medical and
# non-medical texts share one word -> integer id vocabulary.
data_tokenizer = Tokenizer()
data_tokenizer.fit_on_texts(pos_cls + neg_cls)

In [12]:
# Reverse lookup: integer token id -> word (the inverse of word_index).
data_index = {token_id: word for word, token_id in data_tokenizer.word_index.items()}

In [13]:
# Encode each text of each class as a list of integer token ids.
pos_seq, neg_seq = (
    data_tokenizer.texts_to_sequences(texts) for texts in (pos_cls, neg_cls)
)

In [14]:
# Surround every sequence with NGRAM-1 zeros on each side so that the first
# and last real tokens also appear in a full-length window.
# NOTE: the sequences are overwritten in place, so re-running this cell pads
# them a second time.
padding = [0 for slot in range(NGRAM - 1)]
pos_seq = [[*padding, *seq, *padding] for seq in pos_seq]
neg_seq = [[*padding, *seq, *padding] for seq in neg_seq]

In [15]:
# Slide a window of NGRAM tokens over every padded sequence. Each window is one
# example; its label is the position of its class in the list below
# (0 = negative / non-medical, 1 = positive / medical).
X, y = [], []
for cls_val, sequences in enumerate([neg_seq, pos_seq]):
    for seq in sequences:
        windows = [seq[start:start + NGRAM] for start in range(len(seq) - NGRAM + 1)]
        X.extend(windows)
        y.extend([cls_val] * len(windows))

# Sanity checks: one label per window, and only the labels 0 and 1 occur.
assert len(X) == len(y)
num_pos_cls = y.count(1)
num_neg_cls = y.count(0)
assert num_pos_cls + num_neg_cls == len(y)

In [16]:
# Summary of the generated n-gram dataset, one line per count.
print("\n".join([
    "%d training data available." % len(X),
    "%d positive data available." % num_pos_cls,
    "%d negative data available." % num_neg_cls,
]))

2197293 training data available.
1027741 positive data available.
1169552 negative data available.


In [17]:
# Convert the Python lists to NumPy arrays for index-based splitting below.
X, y = map(np.array, (X, y))
print("shape X: %d rows, %d columns" % X.shape)  # X.shape is a (rows, cols) tuple
print("shape y: %d rows" % y.shape)              # y.shape is a 1-tuple

shape X: 2197293 rows, 2 columns
shape y: 2197293 rows


In [18]:
# 'balanced' class weights, one entry per label in the order of np.unique(y)
# (i.e. position k holds the weight of label k for the 0/1 labels used here).
# Arguments are passed by keyword: newer scikit-learn releases make them
# keyword-only, and the keyword form is also accepted by older releases.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y), y=y)
print('number of classes:', len(class_weights))

number of classes: 2


In [19]:
# One stratified 70/30 train/test split. The previous version requested three
# splits and materialised each of them, but only the last one survived the
# loop; a single split gives the same kind of partition without the wasted
# work. (The exact row assignment differs from the old third split.)
shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(shuffle_split.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
print("%d in train set." % len(y_train))
print("%d in test set." % len(y_test))

# Per-sample weights for the validation set. class_weights is ordered like
# np.unique(y), so the weight of label k sits at position k and is looked up
# directly by label. The old `class_weights[y_test[i]-1]` shifted the index by
# one, handing label 0 the last class's weight and label 1 the weight of
# class 0. y_test still holds integer labels here (one-hot encoding comes later).
weight_val = np.asarray(class_weights, dtype=float)[y_test]

1538105 in train set.
659188 in test set.


In [20]:
# One-hot encode the integer labels, one column per class.
# NOTE: the labels are overwritten in place, so this cell must run only once
# per split.
y_train, y_test = (
    to_categorical(labels, num_classes=len(class_weights))
    for labels in (y_train, y_test)
)

In [21]:
# Shapes of the final train/test arrays, one line each.
print("\n".join([
    "shape X_train: %d rows, %d columns " % X_train.shape,
    "shape y_train: %d rows, %d columns" % y_train.shape,
    "shape X_test: %d rows, %d columns " % X_test.shape,
    "shape y_test: %d rows, %d columns" % y_test.shape,
]))

shape X_train: 1538105 rows, 2 columns 
shape y_train: 1538105 rows, 2 columns
shape X_test: 659188 rows, 2 columns 
shape y_test: 659188 rows, 2 columns


# Word2Vec and Embedding Matrix

In [22]:
import gensim
import numpy as np

In [23]:
import os

# Path of the pre-trained binary word2vec file. The default is the original
# machine-specific absolute path; set the WORD2VEC_MODEL environment variable
# to point at the file on another machine without editing the notebook.
WORD2VEC_MODEL = os.environ.get(
    'WORD2VEC_MODEL',
    '/data/Discharge_Summary/Diagnosis_ICD/master/wikipedia-pubmed-and-PMC-w2v.bin')

# Number of columns of the embedding matrix and output size of the Embedding
# layer. Assumed to equal the vector size of the word2vec model — TODO confirm.
EMBEDDING_DIM = 200

In [24]:
# Load the pre-trained vectors (binary word2vec format) from WORD2VEC_MODEL.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True)


def embedding_index(word):
    """Return the pre-trained vector stored in `w2v_model` for `word`.

    Uses item access instead of `word_vec`, which gensim 4 deprecates; item
    access is available on KeyedVectors in old and new releases alike.
    Callers are expected to check `word in w2v_model` first; per the gensim
    docs a missing key raises KeyError.
    """
    return w2v_model[word]

In [25]:
# One row per tokenizer id, plus row 0, which word_index never assigns and
# which is the value used for padding.
nb_words = len(data_tokenizer.word_index) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in data_tokenizer.word_index.items():
    # Membership is tested on the KeyedVectors object itself: the `.vocab`
    # attribute was removed in gensim 4, while `in` works on old and new
    # releases. Words without a pre-trained vector keep their all-zero row.
    if word in w2v_model:
        embedding_matrix[i] = embedding_index(word)
# Counts rows whose entries sum to zero; this includes the padding row 0.
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 2724


# Model

In [26]:
# Model hyperparameters: layer widths, dropout rates and dense activation.
num_lstm, num_dense = 234, 142
rate_drop_lstm, rate_drop_dense = 0.21, 0.24
act = 'relu'

In [27]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
import datetime
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [28]:
# Frozen embedding lookup initialised from the word2vec-derived matrix.
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=NGRAM,
    trainable=False,
)
# The same rate is used for input dropout and recurrent dropout.
lstm_layer = LSTM(
    units=num_lstm,
    dropout=rate_drop_lstm,
    recurrent_dropout=rate_drop_lstm,
)

# Input: one n-gram of NGRAM integer token ids per example.
sequence_input = Input(shape=(NGRAM,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# LSTM encoder followed by batch normalisation and dropout.
x = lstm_layer(embedded_sequences)
x = BatchNormalization()(x)
x = Dropout(rate=rate_drop_dense)(x)

# Fully connected block with the same normalisation / dropout pattern.
x = Dense(units=num_dense, activation=act)(x)
x = BatchNormalization()(x)
x = Dropout(rate=rate_drop_dense)(x)

# Softmax over the classes (one output unit per entry of class_weights).
preds = Dense(units=len(class_weights), activation='softmax')(x)

In [29]:
# Assemble the graph into a trainable model and configure it for
# categorical cross-entropy with Adam, reporting accuracy.
model = Model(inputs=[sequence_input], outputs=preds)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2, 200)            6795000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 234)               407160    
_________________________________________________________________
batch_normalization_1 (Batch (None, 234)               936       
_________________________________________________________________
dropout_1 (Dropout)          (None, 234)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 142)               33370     
_________________________________________________________________
batch_normalization_2 (Batch (None, 142)               568       
__________

In [30]:
# Run identifier: creation timestamp followed by the main hyperparameters.
# It is used as the base name of the checkpoint file.
time = datetime.datetime.now().strftime('D%Y%m%d_T%H%M')
STAMP = ''.join([
    'lstm_model_',
    str(time),
    '_%d_%d_%.2f_%.2f' % (num_lstm, num_dense, rate_drop_lstm, rate_drop_dense),
])
print(STAMP)

lstm_model_D20171216_T1851_234_142_0.21_0.24


In [None]:
# Stop when val_loss has not improved for 3 epochs, and keep only the weights
# of the best epoch seen so far on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

try:
    hist = model.fit(
        [X_train], y_train,
        # weight_val supplies per-sample weights for the validation loss.
        validation_data=([X_test], y_test, weight_val),
        epochs=10, batch_size=2048, shuffle=True,
        # Keras documents class_weight as a dict {class index: weight};
        # class_weights is an array ordered by class index, so enumerate it.
        class_weight=dict(enumerate(class_weights)),
        callbacks=[early_stopping, model_checkpoint])
except KeyboardInterrupt:
    # Only a manual interrupt is treated as "stopped manually". Any other
    # exception now propagates instead of being hidden behind this message.
    # `hist` is not assigned when training is interrupted.
    print("Training Stopped Manually.")

Train on 1538105 samples, validate on 659188 samples
Epoch 1/10
Epoch 2/10
  75776/1538105 [>.............................] - ETA: 4:11 - loss: 0.1265 - acc: 0.9526

In [None]:
# Display the per-epoch metrics recorded by fit(). `hist` only exists when the
# training cell ran to completion (it is not assigned if fit() was interrupted).
hist.history

# Model Testing

In [None]:
def get_prediction_bigram(seq, verbose = False):
    """Predict the class of one encoded n-gram with the global `model`.

    seq     -- array of token ids passed to model.predict (the notebook calls
               it with a single row built via np.atleast_2d).
    verbose -- when true, also print the class indices of the first row sorted
               from most to least probable, followed by their scores.

    Returns the argmax over the flattened prediction array, which is the
    class index when exactly one row is predicted.
    """
    scores = model.predict([seq])
    if verbose:
        ranking = scores.argsort()[0][::-1]
        print(ranking)
        print(scores[0][ranking])
    return scores.argmax()

In [None]:
# Look up the integer id the tokenizer assigned to a sample word.
data_tokenizer.word_index['colonel']

In [None]:
# Classify one hand-built bigram of token ids (wrapped as a single-row batch).
get_prediction_bigram(np.atleast_2d([273, 1378]))