# Keras LSTM Model

In [None]:
# Directory from which train.tsv, devel.tsv and test.tsv are loaded below.
DATA_PATH = 'CDR_Data/BC5CDR-disease/'

In [None]:
import os

import numpy as np
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to figures drawn from here on.
plt.style.use('ggplot')

In [None]:
def read_tsv(path):
    """Read a two-column TSV file into a list of tagged sentences.

    The file is expected to hold one ``token<TAB>tag`` pair per line, with
    sentences separated by blank lines.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A list of sentences, each a non-empty list of ``(token, tag)`` tuples.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    result = []
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(path, encoding='utf-8') as fp:
        for sent in fp.read().split('\n\n'):
            sent_toks = []
            for line in sent.strip().split('\n'):
                line = line.strip()
                if not line:
                    continue
                tok, tag = line.split('\t')
                sent_toks.append((tok, tag))
            # Leading, trailing or repeated blank lines produce empty chunks;
            # skip them instead of emitting zero-length sentences.
            if sent_toks:
                result.append(sent_toks)
    return result

In [None]:
# Load the training, validation and test splits, in that order, from
# DATA_PATH; each becomes a list of sentences.
train_sents, valid_sents, test_sents = (
    read_tsv(os.path.join(DATA_PATH, split_file))
    for split_file in ('train.tsv', 'devel.tsv', 'test.tsv')
)

In [None]:
# Display the first training sentence to eyeball the parsed (token, tag) format.
train_sents[0]

## Features

Next, define some features. In this example we use word identity, word suffix, and word shape; also, some information from nearby words is used.

This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it.

sklearn-crfsuite (and python-crfsuite) supports several feature formats; here we use feature dicts.

In [None]:
def get_dict_map(data, token_or_tag, start=0):
    """Build index lookup tables for the tokens or the tags of a corpus.

    Args:
        data: Iterable of sentences, each an iterable of ``(token, tag)`` pairs.
        token_or_tag: ``'token'`` to index the tokens; any other value indexes
            the tags.
        start: Index assigned to the first vocabulary entry.

    Returns:
        A ``(tok2idx, idx2tok)`` tuple of dicts mapping entry -> index and
        index -> entry respectively.
    """
    if token_or_tag == 'token':
        entries = {tok for sent in data for tok, _ in sent}
    else:
        entries = {tag for sent in data for _, tag in sent}
    # Sort so the index assignment is reproducible: the iteration order of a
    # set of strings changes between interpreter runs (hash randomization),
    # which would silently reshuffle every index from one run to the next.
    vocab = sorted(entries)
    tok2idx = {tok: idx for idx, tok in enumerate(vocab, start)}
    idx2tok = {idx: tok for tok, idx in tok2idx.items()}
    return tok2idx, idx2tok

# Index tokens and tags over all three splits so every token/tag occurring in
# the validation and test data has an entry in the lookup tables.
corpus = [*train_sents, *valid_sents, *test_sents]
# Token indices begin at 1, which keeps index 0 unused by any real token.
token2idx, idx2token = get_dict_map(corpus, token_or_tag='token', start=1)
tag2idx, idx2tag = get_dict_map(corpus, token_or_tag='tag')

This is what `token2idx` and `tag2idx` contain:

In [None]:
# Report the number of indexed tokens and show the tag -> index mapping.
print(f'Vocabulary Size: {len(idx2token)}')
print('Label Info: ', tag2idx)

In [None]:
def sent2features(s, key2idx):
    """Return the index of every token in sentence ``s``.

    Args:
        s: Sequence of ``(token, tag)`` pairs.
        key2idx: Mapping from token to integer index.

    Returns:
        A list with one index per token, in sentence order.

    Raises:
        KeyError: If a token is missing from ``key2idx``.
    """
    return [key2idx[token] for token, _ in s]

def sent2labels(s, key2idx):
    """Return the index of every tag in sentence ``s``.

    Args:
        s: Sequence of ``(token, tag)`` pairs.
        key2idx: Mapping from tag to integer index.

    Returns:
        A list with one index per tag, in sentence order.

    Raises:
        KeyError: If a tag is missing from ``key2idx``.
    """
    return [key2idx[label] for _, label in s]

Extract features from the data:

In [None]:
%%time
# Turn each split into two parallel lists with one inner list per sentence:
# X_* holds token indices and y_* holds tag indices.
X_train, y_train = (
    [sent2features(sent, token2idx) for sent in train_sents],
    [sent2labels(sent, tag2idx) for sent in train_sents],
)

X_valid, y_valid = (
    [sent2features(sent, token2idx) for sent in valid_sents],
    [sent2labels(sent, tag2idx) for sent in valid_sents],
)

X_test, y_test = (
    [sent2features(sent, token2idx) for sent in test_sents],
    [sent2labels(sent, tag2idx) for sent in test_sents],
)

In [None]:
def preprocess(tokens, tags, maxlen=None):
    """Pad index sequences to a common length and one-hot encode the tags.

    Relies on the module-level ``tag2idx`` mapping, which must contain an
    ``'O'`` entry.

    Args:
        tokens: List of sentences, each a list of integer token indices.
        tags: List of sentences, each a list of integer tag indices, parallel
            to ``tokens``.
        maxlen: Target sequence length. When ``None``, the length of the
            longest sentence in ``tokens`` is used.

    Returns:
        A ``(pad_tokens, pad_tags, maxlen)`` tuple: the padded token array,
        the padded tag array one-hot encoded over ``len(tag2idx)`` classes,
        and the sequence length that was applied.

    Raises:
        ValueError: If ``maxlen`` is ``None`` and ``tokens`` is empty.
    """
    if maxlen is None:
        maxlen = max(len(s) for s in tokens)
    # Tokens and tags must be padded on the SAME side. Padding tokens 'pre'
    # and tags 'post' shifts the two against each other, so the label at
    # position i would no longer belong to the token at position i.
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=0)
    # Pad the tags (y) with the index of 'O', then convert to one-hot vectors.
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx['O'])
    n_tags = len(tag2idx)
    pad_tags = np.array([to_categorical(row, num_classes=n_tags) for row in pad_tags])
    return pad_tokens, pad_tags, maxlen

# The training split determines the sequence length; the validation and test
# splits are then padded to that same length.
X_train_pad, y_train_pad, maxlen = preprocess(tokens=X_train, tags=y_train)
X_valid_pad, y_valid_pad, _ = preprocess(tokens=X_valid, tags=y_valid, maxlen=maxlen)
X_test_pad, y_test_pad, _ = preprocess(tokens=X_test, tags=y_test, maxlen=maxlen)

In [None]:
# Print the feature-array and label-array shapes of each split, one per line.
print(*(arr.shape for arr in (X_train_pad, y_train_pad)))
print(*(arr.shape for arr in (X_valid_pad, y_valid_pad)))
print(*(arr.shape for arr in (X_test_pad, y_test_pad)))

## Training

To see all possible CRF parameters, check its docstring.
Here we are using the L-BFGS training algorithm (the default) with Elastic Net (L1 + L2) regularization.

In [None]:
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [None]:
from numpy.random import seed
# Seed NumPy's and TensorFlow's global random generators before the model is
# built. Presumably intended to make runs repeatable; NOTE(review): this alone
# may not give bit-identical training results — confirm if that matters.
seed(1)
tensorflow.random.set_seed(2)

In [None]:
# Embedding vocabulary size. Token indices are assigned starting at 1, so one
# extra slot is added for index 0, which is the value used to pad token
# sequences.
input_dim = len(token2idx) + 1
# Used both as the embedding width and as the LSTM unit count in the model.
output_dim = 64
# Length of the second axis of the padded training token array.
input_length = X_train_pad.shape[1]
# Number of distinct tags, i.e. the size of the per-timestep output layer.
n_tags = len(tag2idx)

def get_bilstm_lstm_model():
    """Build and compile the Embedding -> BiLSTM -> LSTM -> Dense tagger.

    Reads the module-level ``input_dim``, ``output_dim``, ``input_length``
    and ``n_tags`` settings, and prints the model summary as a side effect.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    # Add bidirectional LSTM; forward and backward outputs are concatenated.
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))
    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    # Per-timestep classifier. Softmax (not ReLU) is needed so that every
    # timestep yields a probability distribution over the tags, which is what
    # the categorical cross-entropy loss below expects.
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
    # Compile model with the default Adam optimiser settings.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

# Echo the hyper-parameters the model is built from.
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

In [None]:
%%time
def train_model(X, y, model, epochs=25, batch_size=1000, validation_split=0.2):
    """Fit ``model`` on ``(X, y)`` and return the per-epoch training loss.

    Args:
        X: Training inputs, passed straight to ``model.fit``.
        y: Training targets, passed straight to ``model.fit``.
        model: Object exposing a Keras-style ``fit`` method whose return value
            has a ``history`` dict containing a ``'loss'`` list.
        epochs: Number of training epochs.
        batch_size: Number of samples per gradient update.
        validation_split: Fraction of the data held out for validation.

    Returns:
        A list with the training loss of each epoch, in order.
    """
    # A single fit() call covers all epochs, so the progress output and the
    # history are per-run rather than restarted for every epoch.
    hist = model.fit(
        X,
        y,
        batch_size=batch_size,
        verbose=1,
        epochs=epochs,
        validation_split=validation_split,
    )
    return list(hist.history['loss'])
# construct the model
model_bilstm_lstm = get_bilstm_lstm_model()
# plot_model(model_bilstm_lstm)
# Loss curves are collected per experiment name. `results` is not defined by
# any earlier cell, so it is created here to avoid a NameError.
results = {}
# y_train_pad is passed as-is; wrapping it in np.array() only made a copy.
results['with_add_lstm'] = train_model(X_train_pad, y_train_pad, model_bilstm_lstm)

## Evaluation

In [None]:
labels = list(crf.classes_)
labels.remove('O')
labels

In [None]:
y_pred = crf.predict(X_valid)
metrics.flat_f1_score(y_valid, y_pred, average='weighted', labels=labels)

In [None]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_valid, y_pred, labels=sorted_labels, digits=3
))