# About This Notebook

This is an implementation of BiLSTM-CRF [1] in Keras. This notebook also downloads a copy of the publicly available CoNLL2003 dataset to train, validate, and test the model.


Sam Rawal, 2018.

1: Lample, Guillaume, et al. "Neural architectures for named entity recognition." arXiv preprint arXiv:1603.01360 (2016).

# Set Up New Machine
## If new Google Colab instance, or installing on new machine.
**After the first run, set `setup_machine` to `False` to skip reinstalling the packages and re-downloading the dataset.**

In [20]:
import socket


def is_local_hostname(name, known_prefixes=("sam's air", "Sams-Air")):
  """Return True when `name` starts with one of the known local hostnames.

  The check is a prefix match so that suffixed hostnames such as
  "Sams-Air.local" are still recognised. The previous implementation compared
  only the first 8 characters of the hostname against the full entries, so the
  9-character entry "sam's air" could never match.

  Args:
    name: hostname string to test.
    known_prefixes: iterable of hostname prefixes that identify the local machine.

  Returns:
    bool: True if `name` begins with any of `known_prefixes`.
  """
  return name.startswith(tuple(known_prefixes))


hostname = socket.gethostname()
local_hostnames = {"sam's air", "Sams-Air"}
# True on a machine whose hostname starts with one of `local_hostnames`, False otherwise.
local_machine = is_local_hostname(hostname, local_hostnames)

In [22]:
# Run the one-time environment setup on any machine that was not recognised as the
# local machine; on the local machine the whole block below is skipped.
setup_machine = True if not local_machine else False

if setup_machine:
  # pip installs for packages not on Google Colab machines by default
  # NOTE(review): only keras is version-pinned; the other packages install whatever is current.
  !pip install -q keras==2.2.2
  !pip install git+https://www.github.com/keras-team/keras-contrib.git
  !pip install gensim
  !pip install pydot
  !pip install graphviz
  # apt (system-level) graphviz package, in addition to the pip packages above.
  # NOTE(review): no `-y` flag is passed — confirm this does not block on a confirmation prompt.
  !apt-get install graphviz
  
  # get CoNLL2003 dataset (Github source: https://goo.gl/SXq3Ct)
  # The three splits are written under datasets/conll2003/, relative to the current working directory.
  ! mkdir -p datasets/conll2003
  ! curl -o datasets/conll2003/train.txt https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/data/conll2003/en/train.txt
  ! curl -o datasets/conll2003/valid.txt https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/data/conll2003/en/valid.txt
  ! curl -o datasets/conll2003/test.txt https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/data/conll2003/en/test.txt

In [23]:
import os

# Root folder that holds the notebooks: read from the `colab` environment variable on
# the local machine (KeyError if it is unset), otherwise the Colab Notebooks folder on
# the mounted Google Drive. Always ends with exactly one '/'.
if local_machine:
  colab_path = os.environ['colab'].rstrip('/') + '/'
else:
  colab_path = '/gdrive/My Drive/Colab Notebooks/'

# colab_path already ends with '/', so no leading separator is added here (the
# previous version produced ".../Colab Notebooks//prescription_extraction/").
# The trailing '/' is kept because later cells build paths as project_path + 'saved_models/...'.
project_path = colab_path + "prescription_extraction/"

In [0]:
if not local_machine:
  import sys
  from google.colab import drive

  # mount Google Drive so that paths under /gdrive (including project_path) are reachable
  drive.mount('/gdrive')

  # Make modules under project_path importable (presumably where the `utils`
  # package imported below lives). Guarded so re-running this cell does not
  # append the same entry to sys.path again.
  if project_path not in sys.path:
    sys.path.append(project_path)
  

# Imports

In [0]:
import numpy as np
import sys
import keras
import os

from keras.models import Model, load_model
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Reshape, Bidirectional, concatenate, Flatten
from keras_contrib.layers import CRF
from keras.utils import plot_model

from utils.conll_utils import CoNLLParser
from utils import word_embedding_utils as we
from utils import preprocessing_utils as pprc


# BiLSTM-CRF Model

## Configure Model Parameters

In [0]:
# --- input sizes ---
# Length sentences are padded to; also the length of the word-input axis of the model.
max_sentence = 20
# Pad length used when splitting words into characters; also the last axis of the char input.
max_word = 25

# --- model / training ---
# Output dimension of the word embedding layer.
embedding_dim = 100
# Default number of epochs used by train_model when none is passed.
num_epochs = 10
# Name of the sub-folder under saved_models/ used for this model's files.
model_name = 'my_model'

## BiLSTM-CRF Architecture

In [0]:
def define_model(max_sentence=max_sentence, max_word=max_word, vocab_size=None, embedding_dim=embedding_dim,
                                            char_dim=None, ner_dim=None, embedding_weights=None):
  """Build and compile the BiLSTM-CRF tagging model.

  The model has two inputs: "word_input" with shape (max_sentence,) and
  "char_input" with shape (max_sentence, max_word). Word indices go through an
  embedding layer; character indices go through a per-word embedding and a
  bidirectional LSTM whose outputs are flattened per word. Both representations
  are concatenated and fed to a bidirectional LSTM, a per-timestep softmax Dense
  layer, and a CRF layer named "output".

  Args:
    max_sentence: number of word positions per sentence.
    max_word: number of character positions per word.
    vocab_size: size of the word vocabulary; the embedding uses vocab_size + 1 rows.
    embedding_dim: output dimension of the word embedding.
    char_dim: input_dim of the character embedding.
    ner_dim: number of units of the Dense and CRF layers.
    embedding_weights: optional pretrained word-embedding matrix. When given, it
      initialises the word embedding and the layer is frozen. When omitted, the
      embedding is trained with the rest of the model (previously it was frozen
      at its random initialisation, so it could never learn).

  Returns:
    The compiled keras Model (CRF loss, 'adam' optimizer, CRF accuracy metric).

  Raises:
    ValueError: if vocab_size, char_dim or ner_dim is not provided.
  """
  required = (("vocab_size", vocab_size), ("char_dim", char_dim), ("ner_dim", ner_dim))
  missing = [arg_name for arg_name, arg_value in required if arg_value is None]
  if missing:
    raise ValueError("define_model requires values for: " + ", ".join(missing))

  # distributional word representation: embeddings (TODO: GloVe/word2vec)
  word_input = Input(shape=(max_sentence,), name="word_input")
  if embedding_weights is None:
    # No pretrained vectors: leave the layer trainable so it can be learned.
    word_embedding = Embedding(input_dim=vocab_size+1, output_dim=embedding_dim,
                               input_length=max_sentence)
  else:
    # Pretrained vectors supplied: use them as-is and keep them fixed.
    word_embedding = Embedding(input_dim=vocab_size+1, output_dim=embedding_dim,
                               weights=[embedding_weights],
                               input_length=max_sentence,
                               trainable=False)
  word = word_embedding(word_input)

  # orthographic word representation: char embeddings -> BiLSTM
  char_input = Input(shape=(max_sentence, max_word), name="char_input")
  char = TimeDistributed(
      Embedding(input_dim=char_dim, output_dim=120,
                input_length=max_word)
  )(char_input)
  char = TimeDistributed(
      Bidirectional(
          LSTM(20, return_sequences=True),
          merge_mode='concat'
      )
  )(char)
  # collapse the per-character outputs of each word into a single vector per word
  char = TimeDistributed(Flatten())(char)

  # concatenate word + char representations
  inputs = concatenate([word, char])

  # main BiLSTM model
  hidden = Bidirectional(
      LSTM(20, return_sequences=True),
      merge_mode='concat'
  )(inputs)
  hidden = TimeDistributed(
      Dense(ner_dim, activation='softmax')
  )(hidden)
  crf = CRF(ner_dim, name="output")
  output = crf(hidden)

  m = Model(inputs=[word_input, char_input], outputs=output)
  m.compile(
      loss=crf.loss_function,
      optimizer='adam',
      metrics=[crf.accuracy]
  )

  return m

## Save Model Architecture Information

In [0]:
def model_info(model, model_name=model_name):
    """Print the model summary and save a diagram of its architecture.

    The diagram is written to
    <project_path>saved_models/<model_name>/<model_name>_architecture.png;
    the folder is created first if it does not exist yet.

    Args:
        model: model object providing `summary()`, also passed to `plot_model`.
        model_name: name of the sub-folder (and file prefix) under saved_models/.
    """
    model.summary()

    output_dir = project_path+'saved_models/{0}'.format(model_name)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    diagram_path = project_path+'saved_models/{0}/{0}_architecture.png'.format(model_name)
    plot_model(model, to_file=diagram_path, show_shapes=True)

## Train Model

In [0]:
def train_model(model, train_sents, train_chars, train_ner,
                valid_sents=None, valid_chars=None, valid_ner=None, epochs=num_epochs):
    """Fit `model` on the training data, optionally with validation data.

    Training inputs are passed by layer name ("word_input", "char_input") and
    the target by output name ("output").

    Args:
        model: model object providing `fit`.
        train_sents: training data for the "word_input" input.
        train_chars: training data for the "char_input" input.
        train_ner: training targets for the "output" output.
        valid_sents, valid_chars, valid_ner: optional validation counterparts of
            the three training arguments. Supply all three or none.
        epochs: number of epochs passed to `fit`.

    Returns:
        The same `model` object, after fitting.

    Raises:
        ValueError: if only some of the validation arguments are provided.
    """
    validation_parts = (valid_sents, valid_chars, valid_ner)
    provided = [part is not None for part in validation_parts]

    fit_kwargs = {"epochs": epochs}
    if all(provided):
        fit_kwargs["validation_data"] = ([valid_sents, valid_chars], [valid_ner])
    elif any(provided):
        raise ValueError("valid_sents, valid_chars and valid_ner must be given together")
    # When no validation data is given, validation_data is omitted entirely
    # rather than passing a structure filled with None values to fit().

    model.fit({"word_input": train_sents, "char_input": train_chars}, {"output": train_ner},
              **fit_kwargs)
    return model

## Evaluate Model

In [0]:
def eval_model(model, test_sents, test_chars, test_ner):
    """Evaluate `model` on the given data and return a one-line text summary.

    Args:
        model: model object providing `evaluate` and `metrics_names`.
        test_sents: word-level inputs.
        test_chars: character-level inputs.
        test_ner: expected targets.

    Returns:
        str: '<metrics_names>: <evaluation result>'.
    """
    scores = model.evaluate(x=[test_sents, test_chars], y=test_ner)
    summary_template = '{0}: {1}'
    return summary_template.format(model.metrics_names, scores)

## Save Model

In [0]:
# via https://github.com/keras-team/keras-contrib/issues/129#issuecomment-399125152
def create_custom_objects():
    """Build the `custom_objects` mapping passed to `load_model` for a saved CRF model.

    The returned dict maps both "ClassWrapper" and "CRF" to a CRF subclass that
    remembers the most recently constructed instance, and maps "loss" and
    "accuracy" to functions that forward to that instance's `loss_function` and
    `accuracy` respectively.

    Returns:
        dict with keys "ClassWrapper", "CRF", "loss" and "accuracy".
    """
    holder = {"instance": None}

    class ClassWrapper(CRF):
        # Record the instance being constructed so the wrappers below can reach it.
        def __init__(self, *args, **kwargs):
            holder["instance"] = self
            super(ClassWrapper, self).__init__(*args, **kwargs)

    def loss(*args):
        # Looked up at call time, i.e. on whichever instance was constructed last.
        return holder["instance"].loss_function(*args)

    def accuracy(*args):
        return holder["instance"].accuracy(*args)

    return {
        "ClassWrapper": ClassWrapper,
        "CRF": ClassWrapper,
        "loss": loss,
        "accuracy": accuracy,
    }
  
def load_keras_model(path):
    """Load a saved model from `path`, supplying the CRF custom objects.

    Args:
        path: location of the saved model file.

    Returns:
        The model returned by keras.models.load_model.
    """
    # Imported here under its own alias on purpose: a later cell assigns a boolean
    # to the notebook-global name `load_model`, so resolving that global at call
    # time would try to call a bool instead of the Keras loader.
    from keras.models import load_model as keras_load_model

    return keras_load_model(path, custom_objects=create_custom_objects())


# Load Data

In [0]:
def load_data(d, word_tokenizer=None, ner_tokenizer=None, char_tokenizer=None):
    """Parse a CoNLL file and return tokenized words, one-hot NER labels and tokenized chars.

    Args:
        d: passed to CoNLLParser (the callers in this notebook pass a file path).
        word_tokenizer: forwarded to pprc.tokenize as `t`.
        ner_tokenizer: forwarded to pprc.one_hot_encode as `t`.
        char_tokenizer: accepted but currently not used (see note below).

    Returns:
        Three pairs: (tokenized_words, word_tokenizer), (one_hot_ner, ner_tokenizer),
        (tokenized_chars, char_tokenizer), where the second item of each pair is
        whatever the corresponding pprc helper returned.
    """
    parser = CoNLLParser(d)
    parser.pad_data(max_sentence)
    sentences = parser.get_sentences()
    ner_tags = parser.get_ner()
    characters = pprc.split_words(sentences, padding=True, pad_len=max_word)

    word_result = pprc.tokenize(sentences, t=word_tokenizer)
    ner_result = pprc.one_hot_encode(ner_tags, t=ner_tokenizer)
    # NOTE(review): the char_tokenizer argument is not forwarded here, so every call
    # gets whatever tokenizer pprc.tokenize_chars produces on its own — confirm that
    # character indices stay consistent between the train/validation/test splits.
    char_result = pprc.tokenize_chars(characters)

    tokenized_words, word_tokenizer = word_result
    one_hot_ner, ner_tokenizer = ner_result
    tokenized_chars, char_tokenizer = char_result
    return (tokenized_words, word_tokenizer), (one_hot_ner, ner_tokenizer), (tokenized_chars, char_tokenizer)

# Run Model

## Preprocess Train/Validation/Test Data

In [0]:
# Dataset files, relative to the current working directory (the same relative paths
# the setup cell downloads to).
train_data = 'datasets/conll2003/train.txt'
valid_data = 'datasets/conll2003/valid.txt'
test_data = 'datasets/conll2003/test.txt'


# Each load_data call returns three (data, tokenizer) pairs: words, NER labels, characters.
# The training split is loaded first with no tokenizers passed in; the word and NER
# tokenizers it returns are then passed to the validation and test calls.
# NOTE(review): the character tokenizer from the training split is not passed to the
# validation/test calls (and load_data does not forward its char_tokenizer argument
# to pprc.tokenize_chars) — confirm character indices are consistent across splits.
(train_sents, word_tokenizer), (train_ner, ner_tokenizer), (train_chars, char_tokenizer) = load_data(train_data)
(valid_sents, _), (valid_ner, _) , (valid_chars, _) = load_data(valid_data, word_tokenizer, ner_tokenizer)
(test_sents, _), (test_ner, _), (test_chars, _) = load_data(test_data, word_tokenizer, ner_tokenizer)
# Number of entries in the word tokenizer's word_index; define_model adds 1 to this
# for the word embedding's input_dim.
vocab_size = len(word_tokenizer.word_index)

## Load Model From File

In [0]:
# Set to True to load a previously saved model from saved_models/<model_name>/model.h5
# instead of building and training a new one; the later cells branch on this flag.
# NOTE(review): this assignment rebinds the notebook-global name `load_model`, which
# the Imports cell bound to keras.models.load_model; from here on that name is a bool.
load_model = False
if load_model:
  model = load_keras_model(project_path+'saved_models/{0}/model.h5'.format(model_name))

## Get Model Architecture

In [0]:
# Build a fresh model unless one was loaded from file in the previous cell.
# NOTE(review): ner_dim is taken as len(ner_tokenizer.word_index) — confirm this
# matches the width of the one-hot NER arrays produced by pprc.one_hot_encode.
if not load_model:
  model = define_model(vocab_size=vocab_size, char_dim=len(char_tokenizer), ner_dim=len(ner_tokenizer.word_index))

## Print Model Info and Save Model Architecture

In [0]:
# Prints the model summary and writes the architecture diagram under
# saved_models/<model_name>/ (model_name defaults to the configured value).
model_info(model)

## Train Model on Train Data & Validation Data

In [0]:
# Training is skipped when a saved model was loaded from file.
if not load_model:
  # Use the epoch count from the "Configure Model Parameters" cell rather than a
  # hard-coded value here, so that the configured num_epochs actually controls the run.
  model = train_model(model, train_sents, train_chars, train_ner,
                      valid_sents, valid_chars, valid_ner, epochs=num_epochs)

## Evaluate Model on Test Data

In [0]:
# eval_model returns a one-line string of the form '<metrics_names>: <values>';
# it is kept in eval_results because the next cell appends it to the evaluation log.
eval_results = eval_model(model, test_sents, test_chars, test_ner)
print(eval_results)

In [0]:
from datetime import datetime

# Append one timestamped line with the evaluation summary to this model's
# evaluation.txt (opened in append mode, so earlier entries are kept).
evaluation_log_path = project_path+'saved_models/{0}/evaluation.txt'.format(model_name)
with open(evaluation_log_path, 'a') as log_file:
  log_line = '{0}:\t{1}\n'.format(str(datetime.now()), eval_results)
  log_file.write(log_line)

## Save Model

In [0]:
# Save only a newly trained model; a model that was loaded from file is not re-saved.
# The file goes to saved_models/<model_name>/model.h5, the same path the
# "Load Model From File" cell reads from.
if not load_model:
  model.save(project_path+'saved_models/{0}/model.h5'.format(model_name))