Here we train an NER model identical to the one described in our publication. This step assumes that you already have a trained tokeniser. If you don't want to use our trainable tokenisers, it is trivial to plug in another one, because API-wise a tokeniser is simply a function of the form `(Text) -> List[Interval[Text]]`.

In [None]:
from typing import Callable, Union, Iterable, Sequence
from functools import reduce
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # you might want to change this

import numpy as np
import pandas as pd
import joblib
from fn import F
from sklearn import metrics
from keras import layers, models
from keras import backend as K
from keras.models import Model

from scilk.corpora import corpus, chemdner
from scilk.util import preprocessing, intervals, binning
from scilk.collections import common
from scilk.collections.chemdner import loaders
from scilk.util.networks import blocks, wrappers, callbacks

#### 0. Before we start, let's load the prerequisites: a trained tokeniser and token encoders.

In [None]:
root = 'chemdner-collection'  # our recurring destination root

# artefacts needed to restore the trained tokeniser
tokeniser_data = dict(
    tokeniser_weights=f'{root}/tokeniser-weights.hdf5',
    charmap=f'{root}/charmap.joblib',
)
tokeniser = loaders.load_tokeniser(tokeniser_data)


def word_transform(word):
    """Replace purely numeric words with the shared '<NUM>' placeholder."""
    return '<NUM>' if word.isnumeric() else word


# word embeddings pretrained with GloVe
glove_embeddings = common.read_glove(f'{root}/vectors.txt.gz')
wordencoder = common.build_wordencoder(glove_embeddings, word_transform)

# the character encoder reuses the character set stored in tokeniser_data['charmap']
wordlen = 32  # the maximum number of characters to keep in a word
stored_charmap = joblib.load(tokeniser_data['charmap'])
maxchar, charmap, charencoder = common.build_charencoder(stored_charmap, wordlen)

So, here are the major steps:

1. Read the corpus (texts and annotations) and tokenise texts;
2. Encode tokens and annotations;
3. Separate datasets for training and validation;
4. Binpack and chunk the datasets;
5. Build and train the network, save the results.

#### 1. Read the corpus (texts and annotations) and tokenise texts

In [None]:
# parse the training and development parts of the CHEMDNER corpus
train_abstracts = chemdner.parse('data/chemdner_corpus/training.abstracts.txt',
                                 'data/chemdner_corpus/training.annotations.txt')
dev_abstracts = chemdner.parse('data/chemdner_corpus/development.abstracts.txt',
                               'data/chemdner_corpus/development.annotations.txt')
chemnder_train = corpus.flatten_abstracts(train_abstracts)
chemnder_dev = corpus.flatten_abstracts(dev_abstracts)

# pool both parts (we make our own train/validation split later); each flattened
# record is unpacked into three fields, of which only the first two are used here
texts, annotations, _ = zip(*(chemnder_train + chemnder_dev))
tokenised = tokeniser(list(texts))

#### 2. Encode tokens and annotations

In [None]:
# since our model has two separate output nodes with labels for the entity starts and parts, 
# we need to encode these separately


def encode_parts(timesteps: Sequence[intervals.Interval],
                 annotations: Sequence[intervals.Interval[str]]) -> np.ndarray:
    """
    Count, for every timestep (token), the characters covered by an annotation.

    !NOTE! the result is a per-token sum, not a binary flag: a token that is
    only partially covered yields a value smaller than its length. A character
    covered by several overlapping annotations is still counted once.

    :param timesteps: token intervals exposing integer ``start``/``stop``
        character offsets
    :param annotations: entity intervals exposing the same kind of offsets;
        parts reaching beyond the last token are ignored
    :return: an int32 array with one count per timestep (empty if there are
        no timesteps)
    """
    # default=0 keeps texts without any tokens from crashing max()
    text_length = max((step.stop for step in timesteps), default=0)
    encoded_points = np.zeros(text_length, dtype=np.int32)
    for iv in annotations:
        # slice assignment silently clips annotations to the tokenised span
        encoded_points[iv.start:iv.stop] = 1
    return np.array([encoded_points[step.start:step.stop].sum() for step in timesteps], dtype=np.int32)


def encode_starts(timesteps: Sequence[intervals.Interval],
                  annotations: Sequence[intervals.Interval[str]]) -> np.ndarray:
    """
    Count, for every timestep (token), the annotation starts falling inside it.

    !NOTE! the result is a per-token sum, not a binary flag: a token containing
    several distinct entity starts yields a value greater than one. Annotations
    sharing the same start offset are counted once.

    :param timesteps: token intervals exposing integer ``start``/``stop``
        character offsets
    :param annotations: entity intervals exposing the same kind of offsets;
        starts lying outside the tokenised span are ignored
    :return: an int32 array with one count per timestep (empty if there are
        no timesteps)
    """
    # default=0 keeps texts without any tokens from crashing max()
    text_length = max((step.stop for step in timesteps), default=0)
    encoded_points = np.zeros(text_length, dtype=np.int32)
    # Skip starts outside the tokenised span: indexing would otherwise raise
    # IndexError (or wrap around for negative offsets), whereas encode_parts
    # silently clips such annotations.
    start_offsets = np.array(
        [iv.start for iv in annotations if 0 <= iv.start < text_length],
        dtype=np.intp
    )
    encoded_points[start_offsets] = 1
    return np.array([encoded_points[step.start:step.stop].sum() for step in timesteps], dtype=np.int32)

In [None]:
def process_pair(tokens, annotations):
    """
    Encode one tokenised text together with its annotations.

    :param tokens: the token intervals of a single text
    :param annotations: the entity annotations of the same text
    :return: a tuple (character encoding, word encoding, parts labels,
        starts labels)
    """
    token_strings = list(intervals.unload(tokens))
    return (
        charencoder(token_strings),
        wordencoder(token_strings),
        encode_parts(tokens, annotations),
        encode_starts(tokens, annotations),
    )


# encode every (tokens, annotations) pair, then regroup the per-text 4-tuples
# into four parallel collections, each wrapped into a numpy array
encoded_pairs = map(process_pair, tokenised, annotations)
charembs, wordembs, parts, starts = map(np.array, zip(*encoded_pairs))

#### 3. Separate datasets for training and validation

In [None]:
trainsplit = 0.8  # fraction of the texts used for training

# shuffle text indices in place, then cut them into a training head
# and a validation tail
indices = np.arange(len(texts))
np.random.shuffle(indices)

split_point = int(len(indices) * trainsplit)
train_split, val_split = indices[:split_point], indices[split_point:]

#### 4. Binpack and chunk the datasets

In [None]:
chunksize = 128
batchsize = 32

# pick the training and validation texts out of every encoded collection
charembs_train, wordembs_train, parts_train, starts_train = (
    charembs[train_split], wordembs[train_split], parts[train_split], starts[train_split]
)
charembs_val, wordembs_val, parts_val, starts_val = (
    charembs[val_split], wordembs[val_split], parts[val_split], starts[val_split]
)

# bins are computed once per split (sized with `len` over the parts labels)
# and then reused for all four collections of that split
bins_train = binning.binpack(batchsize, len, parts_train)
bins_val = binning.binpack(batchsize, len, parts_val)

# merge each collection according to its bins, then cut it into chunks
x_char_train, x_word_train, y_parts_train, y_starts_train = [
    preprocessing.chunksteps(chunksize, binning.merge_bins(arr, bins=bins_train))
    for arr in (charembs_train, wordembs_train, parts_train, starts_train)
]
x_char_val, x_word_val, y_parts_val, y_starts_val = [
    preprocessing.chunksteps(chunksize, binning.merge_bins(arr, bins=bins_val))
    for arr in (charembs_val, wordembs_val, parts_val, starts_val)
]

In [None]:
# quick sanity check: display the shapes of the chunked character and word training inputs
x_char_train.shape, x_word_train.shape

#### 5. Build and train the network, save the results.

In [None]:
# Hyperparameters. chunksize and batchsize repeat the values used when the
# datasets were chunked in step 4, so the fixed input shapes below match the data.
chunksize = 128
batchsize = 32
# second axis of the GloVe table; presumably the embedding width -- confirm
# against common.read_glove
wordemb_dim = glove_embeddings.shape[1]
charemb_dim = 32
charemb_units = 16

# character block
# The full batch shape (including the batch size) is fixed here; the GRU layers
# built further down are created with stateful=True.
inputs_char = layers.Input(batch_shape=(batchsize, chunksize, wordlen))
# NOTE(review): the two positional 0.3 values are presumably dropout rates --
# confirm against blocks.charemb
char_embeddings = blocks.charemb(maxchar+1, chunksize, charemb_dim,
                                 charemb_units, 0.3, 0.3, mask=False,
                                 layer=layers.GRU)(inputs_char)
# NOTE(review): presumably two convolutions with 256 filters and kernel size 3 --
# confirm the argument order against blocks.cnn
char_conv = blocks.cnn([256, 256], 3, [0.3, None], 
                       name_template='narrowcharconv{}')(char_embeddings)

# word block
# same configuration as the character CNN, applied to the word-level input
inputs_word = layers.Input(batch_shape=(batchsize, chunksize, wordemb_dim))
word_conv = blocks.cnn([256, 256], 3, [0.3, None], 
                       name_template='narrowwordconv{}')(inputs_word)

# join CNN-extracted features and reshape
def reshape(shape, layer):
    """
    Reshape the output of `layer` to `shape` inside the Keras graph.

    :param shape: the target shape handed to the backend reshape
    :param layer: the tensor (layer output) to reshape
    :return: the output of a Lambda layer wrapping the backend reshape
    """
    reshaper = layers.Lambda(lambda tensor: K.reshape(tensor, shape=shape))
    return reshaper(layer)

# concatenate the character and word CNN features along the last axis and pin
# the batch and chunk dimensions; -1 leaves the feature dimension to be inferred
feat_shape = [batchsize, chunksize, -1]
feat_layers = [char_conv, word_conv]
features = reshape(feat_shape, layers.concatenate(feat_layers, axis=-1))

# RNN-blocks shared by two output nodes
# NOTE(review): HalfStatefulBidirectional is a project wrapper; its exact
# semantics are not visible here
rnn_shared1 = wrappers.HalfStatefulBidirectional(
    layers.GRU(32, stateful=True, dropout=0.3, recurrent_dropout=0.3,
               return_sequences=True))
rnn_shared2 = wrappers.HalfStatefulBidirectional(
    layers.GRU(32, stateful=True, dropout=0.3, recurrent_dropout=0.3,
               return_sequences=True))
shared_rnn_layers = [rnn_shared1, rnn_shared2]
# stack the shared layers: each layer is applied to the output of the previous
# one, starting from the joined features
rnn_shared = reduce(lambda graph, layer: layer(graph), shared_rnn_layers,
                    features)

# separate branch for parts-detection
rnn_parts = wrappers.HalfStatefulBidirectional(
    layers.GRU(32, stateful=True, dropout=0.3, recurrent_dropout=0.3,
               return_sequences=True))
labels_parts = layers.Dense(1, activation='sigmoid')(rnn_parts(rnn_shared))
# scale the shared features by the predicted parts output, so the starts branch
# receives features weighted by the parts prediction
attention = reshape(feat_shape, layers.multiply([labels_parts, rnn_shared]))
# separate branch for starts-detection
rnn_starts = wrappers.HalfStatefulBidirectional(
    layers.GRU(32, stateful=True, dropout=0.3, recurrent_dropout=0.3,
               return_sequences=True))
labels_starts = layers.Dense(1, activation='sigmoid')(rnn_starts(attention))

# compile the model
# two inputs (characters, words) and two sigmoid outputs (parts, starts);
# the same binary cross-entropy loss is applied to both outputs
model = models.Model([inputs_char, inputs_word], [labels_parts, labels_starts])
model.compile(optimizer='Adam', loss='binary_crossentropy')
# every stateful wrapper, collected so that their states can be reset together
stateful_layers = [*shared_rnn_layers, rnn_parts, rnn_starts]

In [None]:
# stack the chunked batches into contiguous arrays for training and validation
inputs = [np.vstack(x_char_train), np.vstack(x_word_train)]
inputs_val = [np.vstack(x_char_val), np.vstack(x_word_val)]

# the encoded labels are per-token sums, so they are clipped to {0, 1};
# training targets get an extra axis inserted at position 2
output_parts = np.expand_dims(np.vstack(y_parts_train).clip(0, 1), axis=2)
output_starts = np.expand_dims(np.vstack(y_starts_train).clip(0, 1), axis=2)

# validation targets are flattened, like the validator predictions they are scored against
output_parts_val = np.vstack(y_parts_val).clip(0, 1).flatten()
output_starts_val = np.vstack(y_starts_val).clip(0, 1).flatten()

# stateful layers carry their hidden state over between batches, so we reset
# them between epochs
def reset_stateful_layers(layers, _):
    """
    Reset the state of every layer in `layers`.

    :param layers: an iterable of objects exposing ``reset_states()``
    :param _: ignored; absorbs the extra argument supplied by the caller
    """
    for stateful_layer in layers:
        stateful_layer.reset_states()

# the same reset action is registered for both ends of every epoch
reset_all_states = F(reset_stateful_layers, stateful_layers)
resetter = callbacks.Caller({
    'on_epoch_begin': [reset_all_states],
    'on_epoch_end': [reset_all_states],
})

# prepare the validators; the validators will keep track of the F1 scores on the validation dataset
# and save model weights upon performance improvements
# (portable, pure-Python replacement for the `mkdir -p` shell escape)
os.makedirs("trainlogs", exist_ok=True)

# binary scores computed for the positive class only
scores = {"precision": F(metrics.precision_score, average="binary", labels=[1]),
          "recall": F(metrics.recall_score, average="binary", labels=[1]),
          "f1": F(metrics.f1_score, average="binary", labels=[1])}

# NOTE: the log file is shared by both validators and has to stay open while
# the model is being trained; close it (logfile.close()) once training is over
logfile = open("trainlogs/ner.log", "w")
# the model has two outputs: pred[0] holds the parts predictions and pred[1] the
# starts predictions; both are thresholded at 0.5 and flattened before scoring
validator_parts = callbacks.Validator(inputs_val, output_parts_val, batchsize, scores,
                                      lambda pred: (np.vstack(pred[0]) > 0.5).astype(int).flatten(),
                                      "f1", prefix="trainlogs/ner-parts", stream=logfile)
validator_starts = callbacks.Validator(inputs_val, output_starts_val, batchsize, scores,
                                       lambda pred: (np.vstack(pred[1]) > 0.5).astype(int).flatten(),
                                       "f1", prefix="trainlogs/ner-starts", stream=logfile)

In [None]:
# start training; the resetter is listed before each validator, so the layer
# states are reset ahead of both validation passes
training_callbacks = [resetter, validator_parts, resetter, validator_starts]
model.fit(inputs, [output_parts, output_starts],
          batch_size=batchsize, epochs=50, initial_epoch=0,
          verbose=1, callbacks=training_callbacks)