# Import Data
- `pos_cls` holds the medical text (ICD-10 descriptions plus UMLS terms).
- `neg_cls` holds the non-medical text (sentences split out of `big.txt`).

In [None]:
# Input files; every path is DATA_DIR followed by the file name.
DATA_DIR = './data/'
MEDICAL_CSV, NON_MEDICAL, UMLS_CSV = (
    DATA_DIR + filename
    for filename in ('icd_10_2017.csv', 'big.txt', 'umls.csv')
)

In [None]:
import pandas as pd

In [None]:
# Read the header-less ICD-10 CSV, keeping only columns 3 and 4;
# column 4 becomes the initial list of positive-class samples.
df_icd = pd.read_csv(
    MEDICAL_CSV,
    usecols=[3, 4],
    header=None,
)
pos_cls = df_icd.loc[:, 4].tolist()
print("%d lines in pos_cls data." % (len(pos_cls),))

In [None]:
# Read the UMLS term list as one term per line: the separator is a pattern
# that never occurs inside a line, so every line lands in column 0.
#
# The previous call passed both ``sep`` and ``delimiter``.  They are aliases;
# older pandas silently used ``delimiter`` and recent versions raise
# ValueError when both are supplied, so only that effective value is kept.
df_umls = pd.read_csv(
    UMLS_CSV,
    sep='\n\r',
    header=None,
    engine='python',        # multi-character separators need the python engine
    dtype=str,
    keep_default_na=False,  # keep terms such as "NA" as text instead of NaN
)
# Surrounding double quotes are removed by hand, as before.
umls = [term.strip('"') for term in df_umls[0].tolist()]

In [None]:
# Positive class = UMLS terms followed by the ICD-10 descriptions.
# Note: re-running this cell prepends the UMLS terms again.
pos_cls = [*umls, *pos_cls]

In [None]:
# Flatten the corpus into one space-joined string, then cut it at every "."
# to obtain the negative (non-medical) samples.
with open(NON_MEDICAL, encoding="utf-8") as corpus:
    neg_cls = [
        chunk.strip()
        for chunk in " ".join(map(str.strip, corpus)).split(".")
    ]
print("%d lines in neg_cls data." % (len(neg_cls),))

# Preprocessing

```https://github.com/shams-sam/logic-lab/blob/master/TextPreprocessing/__preprocessing.py```

- Preprocessing uses the standard `text_preprocessing` routine from the file linked above.

In [None]:
from functools import partial
from preprocessing import text_preprocessing
# Preprocessor applied to every sample below: text_preprocessing with the
# HYPHEN_HANDLE option pinned to 2.
# NOTE(review): the meaning of mode 2 is defined in the preprocessing module
# and is not visible here -- confirm before changing it.
pre = partial(text_preprocessing, HYPHEN_HANDLE = 2)

In [None]:
# Run the shared preprocessor over both classes; list sizes are unchanged.
pos_cls = list(map(pre, pos_cls))
neg_cls = list(map(pre, neg_cls))
print("%d lines in pos_cls data." % (len(pos_cls),))
print("%d lines in neg_cls data." % (len(neg_cls),))

# Data Generation

In [None]:
from keras.preprocessing.text import Tokenizer
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
from sklearn.utils import class_weight
from keras.utils import to_categorical

In [None]:
# Number of consecutive tokens per model input window (1 = unigram).
NGRAM = 1

In [None]:
# Fit one tokenizer on both classes so they share a single vocabulary.
data_tokenizer = Tokenizer()
data_tokenizer.fit_on_texts([*pos_cls, *neg_cls])

In [None]:
# Reverse lookup: token id -> word (inverse of data_tokenizer.word_index).
data_index = dict(
    zip(data_tokenizer.word_index.values(), data_tokenizer.word_index)
)

In [None]:
# Encode every sample of each class as a list of token ids.
pos_seq, neg_seq = (
    data_tokenizer.texts_to_sequences(texts)
    for texts in (pos_cls, neg_cls)
)

In [None]:
# Surround each sequence with NGRAM-1 zeros on both sides so that every
# token appears in NGRAM windows (no padding at all when NGRAM == 1).
padding = [0] * (NGRAM - 1)
pos_seq = [[*padding, *tokens, *padding] for tokens in pos_seq]
neg_seq = [[*padding, *tokens, *padding] for tokens in neg_seq]

In [None]:
# Slide a window of NGRAM tokens over every sequence.  Windows taken from
# neg_seq are labelled 0 and windows taken from pos_seq are labelled 1.
X, y = [], []
cls_val = 0
for sequences in (neg_seq, pos_seq):
    for tokens in sequences:
        # Sequences shorter than NGRAM contribute no window.
        n_windows = max(len(tokens) - NGRAM + 1, 0)
        X.extend(tokens[start:start + NGRAM] for start in range(n_windows))
        y.extend([cls_val] * n_windows)
    cls_val += 1
assert len(X) == len(y)
num_pos_cls = y.count(1)
num_neg_cls = y.count(0)
assert num_pos_cls + num_neg_cls == len(y)

In [None]:
# Summary of the generated samples, one line per figure.
print(
    "%d training data available." % len(X),
    "%d positive data available." % num_pos_cls,
    "%d negative data available." % num_neg_cls,
    sep="\n",
)

In [None]:
# Convert the Python lists to NumPy arrays so they can be index-sliced below.
X, y = np.array(X), np.array(y)
print("shape X: %d rows, %d columns" % tuple(X.shape))
print("shape y: %d rows" % tuple(y.shape))

In [None]:
# Balanced weight per class.  np.unique returns the labels sorted, so
# class_weights[k] is the weight of class k.
# The arguments are passed by keyword: current scikit-learn releases make
# ``classes`` and ``y`` keyword-only, and older releases accept the same names.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y), y=y)
print('number of classes:', len(class_weights))

In [None]:
shuffle_split = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
# All splits are generated, but each iteration overwrites the previous one,
# so the arrays left after the loop come from the final split.
for train_index, test_index in shuffle_split.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
print("%d in train set." % len(y_train))
print("%d in test set." % len(y_test))
# Per-sample weights for the validation set.  class_weights[k] belongs to
# class k, so the label itself is the index.  The previous ``label - 1``
# lookup gave class 0 the weight of class 1 and vice versa.
weight_val = np.asarray(class_weights, dtype=float)[y_test]

In [None]:
# One-hot encode the integer labels of both splits.
y_train, y_test = (
    to_categorical(labels, num_classes=len(class_weights))
    for labels in (y_train, y_test)
)

In [None]:
# Shapes of the final train/test arrays, one line each.
print(
    "shape X_train: %d rows, %d columns " % tuple(X_train.shape),
    "shape y_train: %d rows, %d columns" % tuple(y_train.shape),
    "shape X_test: %d rows, %d columns " % tuple(X_test.shape),
    "shape y_test: %d rows, %d columns" % tuple(y_test.shape),
    sep="\n",
)

# Word2Vec and Embedding Matrix

In [None]:
import gensim
import numpy as np

In [None]:
# Absolute, machine-specific path to the pre-trained word2vec binary.
WORD2VEC_MODEL = '/data/Discharge_Summary/Diagnosis_ICD/master/wikipedia-pubmed-and-PMC-w2v.bin'
# Width of each embedding row.
# NOTE(review): must equal the vector size of the model above -- confirm.
EMBEDDING_DIM = 200

In [None]:
# Load the pre-trained vectors (binary word2vec format) from WORD2VEC_MODEL.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True)


def embedding_index(word):
    """Return the pre-trained embedding vector for *word*.

    The vector is fetched by indexing the KeyedVectors object rather than by
    calling ``word_vec``: indexing is supported by both gensim 3.x and 4.x,
    whereas ``word_vec`` is deprecated in 4.x.

    Args:
        word: Token to look up.

    Returns:
        The vector stored for *word* in ``w2v_model``.

    Raises:
        KeyError: presumably, when *word* is not in the loaded vocabulary
            (gensim's documented behaviour) -- callers should check
            membership first.
    """
    return w2v_model[word]

In [None]:
# One row per token id; ids come from word_index, and row 0 is left as zeros.
nb_words = len(data_tokenizer.word_index) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in data_tokenizer.word_index.items():
    # Membership is tested on the KeyedVectors object itself: the ``.vocab``
    # attribute was removed in gensim 4, while ``in`` works in 3.x and 4.x.
    if word in w2v_model:
        embedding_matrix[i] = embedding_index(word)
# Count rows that are entirely zero.  Testing the sum of a row against 0
# would also count a real vector whose components happen to cancel out.
print('Null word embeddings: %d' % np.count_nonzero(~embedding_matrix.any(axis=1)))

# Model

In [None]:
# Hyper-parameters of the network built below.
num_lstm = 234  # units of the LSTM layer
num_dense = 142  # units of the hidden Dense layer
rate_drop_lstm = 0.21  # dropout and recurrent-dropout rate of the LSTM
rate_drop_dense = 0.24  # rate of both Dropout layers
act = 'relu'  # activation of the hidden Dense layer

In [None]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
import datetime
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Frozen embedding lookup initialised from the pre-computed matrix.
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=NGRAM,
    trainable=False,
)
lstm_layer = LSTM(
    units=num_lstm,
    dropout=rate_drop_lstm,
    recurrent_dropout=rate_drop_lstm,
)

# Token ids -> embedding -> LSTM -> BatchNorm -> Dropout.
sequence_input = Input(shape=(NGRAM,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = lstm_layer(embedded_sequences)
x = BatchNormalization()(x)
x = Dropout(rate=rate_drop_dense)(x)

# Hidden dense stage, followed by the same BatchNorm -> Dropout pair.
x = Dense(units=num_dense, activation=act)(x)
x = BatchNormalization()(x)
x = Dropout(rate=rate_drop_dense)(x)

# One softmax output per class.
preds = Dense(units=len(class_weights), activation='softmax')(x)

In [None]:
# Assemble the graph into a trainable model and show its layer summary.
model = Model(inputs=[sequence_input], outputs=preds)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

model.summary()

In [None]:
# Run identifier: fixed prefix, current timestamp, then the hyper-parameters.
time = datetime.datetime.now().strftime('D%Y%m%d_T%H%M')
STAMP = ''.join((
    'unigram_model_',
    str(time),
    '_%d_%d_%.2f_%.2f' % (num_lstm, num_dense, rate_drop_lstm, rate_drop_dense),
))
print(STAMP)

In [None]:
# Stop once val_loss has not improved for 3 epochs, and keep only the
# weights of the best epoch on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

try:
    hist = model.fit(
        [X_train], y_train,
        validation_data=([X_test], y_test, weight_val),
        epochs=10, batch_size=2048, shuffle=True,
        # Keras expects a {class_index: weight} mapping, not a bare array.
        class_weight=dict(enumerate(class_weights)),
        callbacks=[early_stopping, model_checkpoint])
except KeyboardInterrupt:
    # Only a manual interrupt is treated as "stopped manually"; any other
    # error now propagates instead of being reported with this message.
    print("\n\nTraining Stopped Manually.")
    # NOTE(review): Keras normally attaches the History callback to the model
    # as ``model.history`` when fit starts -- confirm for the installed
    # version.  This keeps ``hist`` defined after an interrupt.
    hist = getattr(model, 'history', None)

In [None]:
# Show the history recorded by model.fit (requires the training cell to have run).
hist.history

# Model Testing

In [None]:
# Id 0 is the padding id used when building the windows; give it a printable
# placeholder so reverse lookups of 0 do not fail.
data_index[0] = '***'

In [None]:
def get_prediction(sentence, verbose = False):
    """Print the predicted class of every known token in *sentence*.

    The sentence is preprocessed with ``pre`` -- the same preprocessor
    (``HYPHEN_HANDLE = 2``) that was applied to the training data -- so that
    tokens are normalised identically at training and prediction time.
    Words the tokenizer does not turn into an id are skipped.  All tokens are
    scored in a single ``model.predict`` call.

    Args:
        sentence: Raw text to classify token by token.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        None.  One line ``<word> <class index>`` is printed per token, where
        class 1 is the positive (medical) class and 0 the negative class.
    """
    sentence = pre(sentence)
    seq = data_tokenizer.texts_to_sequences([sentence])[0]
    if not seq:
        # Nothing recognisable to score; avoid predicting on an empty batch.
        return
    # One row per token: shape (len(seq), 1), matching the unigram input.
    probabilities = model.predict(np.asarray(seq).reshape(-1, 1))
    for token_id, cat in zip(seq, probabilities.argmax(axis=1)):
        print(data_index[token_id], cat)

In [None]:
# Smoke test on a sentence that mixes medical and everyday words.
get_prediction('patient has type 2 diabetes mellitus and is observed to display symptoms of AIDS.')

# Saving Model

In [None]:
import _pickle as pkl
# Persist the fitted tokenizer so the same vocabulary can be reused later.
# The context manager makes sure the file is flushed and closed; the
# previous inline ``open(...)`` left the handle open.
with open('unigram_data_tokenizer.pkl', 'wb') as tokenizer_file:
    pkl.dump(data_tokenizer, tokenizer_file)