In [1]:
from itertools import starmap, chain
from collections import Counter
import operator as op

import numpy as np
from keras import models
from keras import layers
from keras import callbacks
from fn import F

from chempred import chemdner
from chempred import preprocessing as pp
from chempred import model
from chempred import training

Using TensorFlow backend.


### Read and preprocess data

In [2]:
# String keys. ABSTRACTS and ANNOTATIONS index the `train_data` / `test_data`
# entries of the training config; MODELS, DETECTOR and TAGGER are not used by
# the cells shown in this notebook.
MODELS = "models"
ABSTRACTS, ANNOTATIONS = "abstracts", "annotations"
DETECTOR, TAGGER = "detector", "tagger"

# Embedding layer dimensions: NCHAR is passed as the vocabulary size
# (one more than pp.MAXCHAR), EMBED as the embedding vector length.
NCHAR = pp.MAXCHAR + 1
EMBED = 50

In [3]:
# Load the detector configuration and make sure it describes a binary task.
detector = "testdata/config-detector.json"
config = training.read_config(detector)
if set(config.mapping.values()) != {0, 1}:
    raise ValueError("The detector's mapping must be binary")
ncls = 2


def _process_split(abstracts, annotations):
    """Call `training.process_data` on one split with the settings in `config`.

    Returns whatever `training.process_data` returns; the cells below unpack
    it into six values (ids, samples, fail, x, y, mask).
    """
    return training.process_data(
        abstracts, annotations,
        config.window, config.maxlen, config.nonpositive,
        config.mapping, config.positive,
    )


# training split
train_abstracts = chemdner.read_abstracts(config.train_data[ABSTRACTS])
train_anno = chemdner.read_annotations(config.train_data[ANNOTATIONS])
(train_ids, train_samples, train_fail,
 train_x, train_y, train_mask) = _process_split(train_abstracts, train_anno)

# testing split
test_abstracts = chemdner.read_abstracts(config.test_data[ABSTRACTS])
test_anno = chemdner.read_annotations(config.test_data[ANNOTATIONS])
(test_ids, test_samples, test_fail,
 test_x, test_y, test_mask) = _process_split(test_abstracts, test_anno)

# Targets in the form handed to `fit` below.
train_y_onehot = pp.one_hot(train_y)
test_y_onehot = pp.one_hot(test_y)

In [6]:
# Sanity check: display the shapes of the train and test input arrays.
# The second axis is what feeds Input(shape=(config.maxlen,)) in the model cell.
train_x.shape, test_x.shape

((158, 500), (155, 500))

### Build a model

In [4]:
# Character-level sequence labeller:
# input -> embedding -> recurrent stack -> per-timestep softmax over `ncls`.
l_in = layers.Input(name="l_in", shape=(config.maxlen,))

# mask_zero=True makes index 0 act as padding for the downstream layers.
l_emb = layers.Embedding(
    NCHAR,
    EMBED,
    input_length=config.maxlen,
    mask_zero=True,
)(l_in)

# Recurrent block built by the project helper from the configured step
# count and dropout rates.
l_rec = model.build_rec(
    config.nsteps,
    config.in_drop,
    config.rec_drop,
)(l_emb)

# The same dense softmax classifier is applied at every timestep.
l_out = layers.TimeDistributed(
    layers.Dense(ncls, activation='softmax'),
    name="l_out",
)(l_rec)

detector_model = models.Model(l_in, l_out)
# NOTE(review): binary_crossentropy on a 2-unit softmax presumably relies on
# ncls == 2 -- revisit the loss if more classes are ever added.
detector_model.compile(
    loss="binary_crossentropy",
    optimizer="Adam",
    metrics=["accuracy"],
)

In [None]:
with training.training("testdata", "model") as (destination, weights):
    # Write the architecture as JSON to `destination` before training starts.
    detector_json = detector_model.to_json()
    with open(destination, "w") as json_file:
        json_file.write(detector_json)

    # Only overwrite `weights` when validation accuracy improves.
    checkpoint = callbacks.ModelCheckpoint(
        weights,
        monitor="val_acc",
        mode="max",
        save_best_only=True,
        verbose=1,
    )

    # The test split doubles as the validation set monitored by the checkpoint.
    detector_model.fit(
        train_x,
        train_y_onehot,
        validation_data=(test_x, test_y_onehot),
        epochs=config.epochs,
        batch_size=config.batchsize,
        callbacks=[checkpoint],
        verbose=1,
    )

Train on 158 samples, validate on 158 samples
Epoch 1/30


### Examine class imbalance

In [16]:
# NOTE: disabled draft -- `encoded_classes` is not defined in the cells shown
# above, so this snippet cannot run as-is.
# class_counts = [np.unique(cls, return_counts=True)[1]
#                 for cls in chain.from_iterable(encoded_classes)]
# stacked_counts = np.vstack([counts if len(counts) == 2 else np.array([counts[0], 0])
#                             for counts in class_counts])
# stacked_counts.sum(axis=0)

In [43]:
# NOTE(review): `test2_x_encoded` and `net2` are not defined in any cell shown
# above -- presumably leftovers from an earlier kernel session; this cell will
# fail on a fresh Restart & Run All unless they are defined elsewhere.
# NOTE(review): the first line takes argmin while the second takes argmax over
# axis 2 -- confirm that the asymmetry is intentional.
# First sample only: per-position argmin over axis 2 of the input array ...
print(test2_x_encoded[:1].argmin(axis=2))
# ... next to the per-position argmax over axis 2 of the model's prediction.
print(net2.predict(test2_x_encoded[:1]).argmax(axis=2))

[[0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
