In [2]:
from itertools import starmap, chain
from collections import Counter
import operator as op

import numpy as np
from fn import F

from chempred import chemdner
from chempred import preprocessing as pp
from chempred import model

Using TensorFlow backend.


### Read and preprocess data

In [3]:
# Scalar settings for the preprocessing pipeline.
nonpos = 3
flanking = False
window = 5
maxlen = 500

# Class name -> integer id; a name's position in the tuple below is its id.
class_mapping = {
    name: idx
    for idx, name in enumerate((
        "OTHER",
        "ABBREVIATION",
        "FAMILY",
        "FORMULA",
        "IDENTIFIER",
        "MULTIPLE",
        "NO CLASS",
        "SYSTEMATIC",
        "TRIVIAL",
    ))
}
# Collapsed labelling: "OTHER" maps to 0, every other class maps to 1.
binary_class_mapping = {name: int(name != "OTHER") for name in class_mapping}
# Every class name except "OTHER".
positive_classes = set(class_mapping) - {"OTHER"}

# Read the training abstracts and their annotations, then pair them up.
abstracts = chemdner.read_abstracts("chemdner_corpus/training.abstracts.txt")
abstract_annotations = chemdner.read_annotations("chemdner_corpus/training.annotations.txt")
aligned = list(chemdner.align_abstracts_and_annotations(abstracts, abstract_annotations))

# Flatten each aligned pair into (id, src, text, annotations) records.
data = list(chain.from_iterable(map(chemdner.flatten_aligned_pair, aligned)))

# Keep only the records that carry at least one annotation.
nonempty = [
    (record_id, source, body, marks)
    for record_id, source, body, marks in data
    if marks
]
ids = [record[0] for record in nonempty]
texts = [record[2] for record in nonempty]
text_annotations = [
    chemdner.annotate_text(body, marks, source, True)
    for _, source, body, marks in nonempty
]

# Pick sampling targets for every annotated text.
targets = [
    pp.sample_targets(positive_classes, annotated, nonpos)
    for annotated in text_annotations
]
sampler = pp.make_sampler(maxlen=maxlen, width=window, flanking=flanking)

# Sample windows around each text's targets; each result is indexed below as
# [0] -> samples and [1] -> failures.
samples_and_failures = [
    pp.sample_windows(target, annotated, sampler=sampler)
    for target, annotated in zip(targets, text_annotations)
]
samples = [result[0] for result in samples_and_failures]
failures = [result[1] for result in samples_and_failures]

# Encode the text of every sample, grouped per source text.
encoded_texts = [
    [pp.encode_text(text, drawn) for drawn in drawn_samples]
    for text, drawn_samples in zip(texts, samples)
]
# Encode the class labels of every sample with the binary mapping.
# Multi-class alternative: pass `class_mapping` instead of `binary_class_mapping`.
encoded_classes = [
    [pp.encode_classes(binary_class_mapping, drawn) for drawn in drawn_samples]
    for _, drawn_samples in zip(texts, samples)
]

# Flatten the per-text groups and join them; `pp.join` returns the joined
# data together with a mask.
joined_texts, masks_text = pp.join(
    [encoded for group in encoded_texts for encoded in group]
)
joined_classes, masks_classes = pp.join(
    [encoded for group in encoded_classes for encoded in group]
)

# Texts and classes must have been masked identically.
assert (masks_text == masks_classes).all()

joined_classes_onehot = pp.maskfalse(
    pp.one_hot(joined_classes),
    masks_classes,
)

### Examine class imbalance

In [16]:
def _per_class_counts(labels, nclasses):
    """Count how often each class id in ``range(nclasses)`` occurs in ``labels``.

    Each count is stored at the index of the class id it belongs to, so a class
    that does not occur in ``labels`` gets an explicit zero. The previous code
    assumed that a single returned count always belonged to class 0, which
    mis-booked samples containing only class 1.

    Raises IndexError if ``labels`` holds an id outside ``range(nclasses)``.
    """
    values, counts = np.unique(labels, return_counts=True)
    totals = np.zeros(nclasses, dtype=counts.dtype)
    totals[values.astype(int)] = counts
    return totals


# Number of distinct class ids produced by the binary mapping.
_n_classes = len(set(binary_class_mapping.values()))
# One fixed-length count vector per sample.
class_counts = [_per_class_counts(cls, _n_classes)
                for cls in chain.from_iterable(encoded_classes)]
# One row per sample, one column per class id.
stacked_counts = np.vstack(class_counts)

In [18]:
# Column totals: overall count per class, summed over all rows.
np.sum(stacked_counts, axis=0)

array([4647000, 2084937])

In [5]:
from keras import backend as k
from keras import losses
from keras import models
from keras import layers
from keras import callbacks

# Model dimensions taken from the preprocessed data.
maxlen = joined_texts.shape[1]  # NOTE: rebinds the `maxlen` set in the config cell
nchar = pp.MAXCHAR + 1
# Multi-class alternative: ncls = len(class_mapping)
ncls = len({*binary_class_mapping.values()})
batchsize = 400

# Input -> embedding -> `model.build_rec` stack -> per-timestep softmax.
l_in = layers.Input(name="l_in", shape=(maxlen,))
l_emb = layers.Embedding(
    input_dim=nchar,
    output_dim=50,
    mask_zero=True,
    input_length=maxlen,
)(l_in)
l_rec = model.build_rec([200, 300], [0, 0.1], [0, 0.1])(l_emb)
l_out = layers.TimeDistributed(
    layers.Dense(ncls, activation="softmax"),
    name="l_out",
)(l_rec)

rnn = models.Model(l_in, l_out)
rnn.compile(
    loss="categorical_crossentropy",
    optimizer="Adam",
    metrics=["accuracy"],
)

# filepath = "models/emb-length-800/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
# checkpoint = callbacks.ModelCheckpoint(filepath, monitor="val_acc", verbose=1, 
#                                        save_best_only=True, mode="max")
# reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,min_lr=0.0001)

# callbacks_ = [checkpoint]



In [None]:
# rnn.fit(padded_samples_train, masked_cls_train, batch_size=batchsize, epochs=30, verbose=1,
#           validation_data=(padded_samples_test, masked_cls_test), callbacks=callbacks_)

# Train on the joined samples; `validation_split` sets the fraction of the data
# that Keras holds out for validation.
rnn.fit(
    x=joined_texts,
    y=joined_classes_onehot,
    batch_size=batchsize,
    epochs=30,
    verbose=1,
    validation_split=0.2,
)

Train on 169445 samples, validate on 18828 samples
Epoch 1/30
  3400/169445 [..............................] - ETA: 4791s - loss: 1.4584 - acc: 0.6577

In [43]:
# NOTE(review): `test2_x_encoded` and `net2` are not defined in any cell visible
# in this notebook (the model built above is named `rnn`), so this cell appears
# to depend on leftover kernel state - confirm before running on a fresh kernel.
# First line: argmin over axis 2 of the first test sample.
# Second line: argmax over axis 2 of the model's prediction for that sample.
# NOTE(review): the two lines use argmin vs. argmax - presumably deliberate for
# this input encoding; verify.
print(test2_x_encoded[:1].argmin(axis=2))
print(net2.predict(test2_x_encoded[:1]).argmax(axis=2))

[[0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
