In [1]:
import operator as op
import os
from collections import Counter
from itertools import starmap, chain

import numpy as np
from fn import F

from chempred import chemdner
from chempred import preprocessing as pp
# from chempred import model

### Read and preprocess data

In [9]:
# Sampling parameters: `nonpos` is handed to pp.sample_targets, the other three
# to pp.make_sampler further down in this cell.
nonpos, flanking = 3, False
window, maxlen = 5, 500

# Integer id per annotation class: "OTHER" is 0, the rest follow in listed order.
class_mapping = {
    name: index
    for index, name in enumerate((
        "OTHER",
        "ABBREVIATION",
        "FAMILY",
        "FORMULA",
        "IDENTIFIER",
        "MULTIPLE",
        "NO CLASS",
        "SYSTEMATIC",
        "TRIVIAL",
    ))
}
# Two-class collapse of the mapping above: "OTHER" -> 0, every other class -> 1.
binary_class_mapping = {name: int(name != "OTHER") for name in class_mapping}
# All class names except "OTHER".
positive_classes = set(class_mapping) - {"OTHER"}

# Read the training abstracts and their annotations, then pair them up.
abstracts = chemdner.read_abstracts("chemdner_corpus/training.abstracts.txt")
abstract_annotations = chemdner.read_annotations("chemdner_corpus/training.annotations.txt")
aligned = list(chemdner.align_abstracts_and_annotations(abstracts, abstract_annotations))

# Flatten every aligned pair and concatenate the results into one list of
# (id, source, text, annotations) records.
data = list(chain.from_iterable(map(chemdner.flatten_aligned_pair, aligned)))

# Keep only the records whose annotations are non-empty (truthy).
nonempty = [
    (id_, src, text, annotations)
    for (id_, src, text, annotations) in data
    if annotations
]
ids = [record[0] for record in nonempty]
texts = [record[2] for record in nonempty]
text_annotations = [
    chemdner.annotate_text(text, annotations, src, True)
    for (_, src, text, annotations) in nonempty
]

# Pick sampling targets for every annotated text.
targets = [
    pp.sample_targets(positive_classes, annotated, nonpos)
    for annotated in text_annotations
]
sampler = pp.make_sampler(maxlen=maxlen, width=window, flanking=flanking)

# pp.sample_windows is called once per (targets, annotated text) pair; element 0
# of each result goes to `samples`, element 1 to `failures`.
samples_and_failures = [
    pp.sample_windows(target, annotated, sampler=sampler)
    for target, annotated in zip(targets, text_annotations)
]
samples = [outcome[0] for outcome in samples_and_failures]
failures = [outcome[1] for outcome in samples_and_failures]

# Encode every sample of every text; both results are nested lists indexed as
# [text][sample].
encoded_texts = [[pp.encode_text(text, sample) for sample in samples_]
                 for text, samples_ in zip(texts, samples)]
# Class encoding does not need the raw text, so iterate over `samples` directly
# (`texts` and `samples` both have one entry per record in `nonempty`).
# NOTE(review): this uses the 9-class `class_mapping`; `binary_class_mapping` is
# defined above but unused here, while the model cell sets ncls = 2 — confirm
# which mapping is intended.
encoded_classes = [[pp.encode_classes(class_mapping, sample) for sample in samples_]
                   for samples_ in samples]

In [8]:
# Inspect the encoded class arrays of the first text (one array per sample).
encoded_classes[0]

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0], dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0], dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
# TODO rename generate_training_samples
# NOTE(review): `annotated_abstracts_train`, `annotated_abstracts_test`, `pos_cls`,
# `len_filter` and `max_sample_len` are not assigned anywhere in the visible part
# of this notebook (the only `max_sample_len` assignment is the commented-out line
# below), so this cell presumably depends on earlier kernel state — confirm
# before running on a fresh kernel.

# Training split: generate samples, join tokens, then apply the length filter.
joined_samples_train, joined_cls_train = pp.join_tokens_in_samples(
    *pp.generate_training_samples(annotated_abstracts_train, pos_cls, window, False)
)
samples_train = len_filter(joined_samples_train)
cls_train = len_filter(joined_cls_train)

# Test split: same steps.
joined_samples_test, joined_cls_test = pp.join_tokens_in_samples(
    *pp.generate_training_samples(annotated_abstracts_test, pos_cls, window, False)
)
samples_test = len_filter(joined_samples_test)
cls_test = len_filter(joined_cls_test)
# max_sample_len = max(map(len, samples_train+samples_test))

# Pad both splits to the same length; pp.pad also returns the masks.
padded_samples_train, padded_cls_train, masks_train = pp.pad(
    samples_train, cls_train, max_sample_len)
padded_samples_test, padded_cls_test, masks_test = pp.pad(
    samples_test, cls_test, max_sample_len)

# onehot_samples_train, onehot_cls_train = map(
#     pp.encode_one_hot, [padded_samples_train, padded_cls_train])
# masked_samples_train = pp.mask_array(onehot_samples_train, masks_train)
# masked_cls_train = pp.mask_array(onehot_cls_train, masks_train)

# onehot_samples_test, onehot_cls_test = map(
#     pp.encode_one_hot, [padded_samples_test, padded_cls_test])
# masked_samples_test = pp.mask_array(onehot_samples_test, masks_test)
# masked_cls_test = pp.mask_array(onehot_cls_test, masks_test)

# One-hot encode the padded class labels of both splits, then mask each result
# with the masks of its own split.
onehot_cls_train, onehot_cls_test = map(
    pp.encode_one_hot, (padded_cls_train, padded_cls_test))

masked_cls_train = pp.mask_array(onehot_cls_train, masks_train)
masked_cls_test = pp.mask_array(onehot_cls_test, masks_test)

In [20]:
# Shape of the padded training samples.
# NOTE(review): the output saved with this cell reports a width of 500, while the
# later cell that prints the same shape reports 800 — the saved outputs come from
# different runs; re-run the notebook top to bottom to confirm the actual value.
padded_samples_train.shape

(143616, 500)

In [8]:
# class_counts = np.vstack([np.unique(cls, return_counts=True)[1] for cls in 
#                           (padded_cls[mask] for padded_cls, mask in zip(padded_classes, masks))])
# class_counts[:10]

array([[34, 11],
       [36, 11],
       [33, 11],
       [29, 11],
       [29, 11],
       [26,  9],
       [24,  9],
       [24,  9],
       [18,  9],
       [22,  9]])

In [9]:
# Number of joined samples next to the padded array shape, for train and test.
(
    len(joined_samples_train), padded_samples_train.shape,
    len(joined_samples_test), padded_samples_test.shape,
)

(143616, (143616, 800), 123380, (123380, 800))

In [10]:
from keras import backend as k
from keras import losses
from keras import models
from keras import layers
from keras import callbacks

In [None]:
# Model hyper-parameters.
# NOTE: this rebinds the `maxlen` set in the preprocessing cell to the padded width.
maxlen = padded_samples_train.shape[1]
nchar = 256
ncls = 2
nrec = 2
batchsize = 200

# Integer inputs -> 50-dimensional embeddings; mask_zero=True makes Keras treat
# input value 0 as padding to be masked out.
l_in = layers.Input(shape=(maxlen,), name="l_in")
l_emb = layers.Embedding(nchar, 50, mask_zero=True, input_length=maxlen)(l_in)

# NOTE(review): `prototype` is not imported anywhere in the visible part of this
# notebook (the `chempred.model` import in the first cell is commented out) —
# confirm where `build_rec` lives before running on a fresh kernel.
l_rec = prototype.build_rec([200, 200], [0, 0.1], [0, 0.1])(l_emb)
# Per-timestep softmax over the `ncls` classes.
l_out = layers.TimeDistributed(
    layers.Dense(ncls, activation='softmax'), name="l_out")(l_rec)
model = models.Model(l_in, l_out)
# NOTE(review): a softmax output is paired with "binary_crossentropy" here;
# confirm this is intended rather than "categorical_crossentropy".
model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])

filepath = "models/emb-length-800/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
# Make sure the checkpoint directory exists before training starts, so the first
# checkpoint save cannot fail on a missing directory after a full (long) epoch.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
checkpoint = callbacks.ModelCheckpoint(filepath, monitor="val_acc", verbose=1,
                                       save_best_only=True, mode="max")
# reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,min_lr=0.0001)

callbacks_ = [checkpoint]

model.fit(padded_samples_train, masked_cls_train, batch_size=batchsize, epochs=30, verbose=1,
          validation_data=(padded_samples_test, masked_cls_test), callbacks=callbacks_)

Epoch 1/30
 13000/143616 [=>............................] - ETA: 7319s - loss: 0.5395 - acc: 0.7239

In [43]:
# Print the first encoded test sample next to the model's prediction for it.
# NOTE(review): `test2_x_encoded` and `net2` are not defined in the visible part
# of this notebook, and the first line reduces with argmin while the second uses
# argmax — confirm both the names and that asymmetry are intended.
first_test_sample = test2_x_encoded[:1]
print(first_test_sample.argmin(axis=2))
print(net2.predict(first_test_sample).argmax(axis=2))

[[0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
