In [1]:
%matplotlib inline

from itertools import starmap, chain
from collections import Counter

import numpy as np
import ggplot
from fn import F

from chempred.chemdner import read_abstracts, read_annotations, pair, annotate_abstract
from chempred import preprocessing as pp

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [2]:
# Load the training split of the corpus: abstract texts and their annotations.
ABSTRACTS_PATH = "chemdner_corpus/training.abstracts.txt"
ANNOTATIONS_PATH = "chemdner_corpus/training.annotations.txt"

abstracts = read_abstracts(ABSTRACTS_PATH)
annotations = read_annotations(ANNOTATIONS_PATH)
print(len(abstracts), len(annotations))

# Couple every abstract with the matching item produced by `pair`, so later
# cells can unpack each entry as a two-element tuple.
paired_annotations = pair(abstracts, annotations)
pairs = list(zip(abstracts, paired_annotations))
pairs[1]

3500 2916


((22080034,
  'Nanosilver effects on growth parameters in experimental aflatoxicosis in broiler chickens.',
  'Aflatoxicosis is a cause of economic losses in broiler production. In this study, the effect of one commercial nanocompound, Nanocid (Nano Nasb Pars Co., Iran) was evaluated in reduction of aflatoxin effects on the growth and performance indices in broiler chickens suffering from experimental aflatoxicosis. For this, a total of 300 one-day-old broiler chicks (Ross strain) were randomly divided into 4 groups with 3 replicates of 15 chicks in each separated pen during the 28-day experiment. Treatment groups including group A: chickens fed basal diet, group B: chickens fed 3 ppm productive aflatoxin in basal diet, group C: chickens fed basal diet plus 2500 ppm Nanocid, and group D: chickens fed 3 ppm productive aflatoxin and 2500 ppm Nanocid, in basal diet. Data on body weight, body weight gain (BWG), feed intake, and feed conversion ratio (FCR) were recorded at weekly intervals.

In [3]:
# Annotate every (abstract, annotation) pair in guided mode; each entry of
# `pairs` is a 2-tuple, so it is unpacked into the two positional arguments.
annotated_abstracts = [
    annotate_abstract(abstract, annotation, guided=True)
    for abstract, annotation in pairs
]
annotated_abstracts[1]

[('T', 0, 10, 'Nanosilver', 'OTHER'),
 ('T', 11, 18, 'effects', 'OTHER'),
 ('T', 19, 21, 'on', 'OTHER'),
 ('T', 22, 28, 'growth', 'OTHER'),
 ('T', 29, 39, 'parameters', 'OTHER'),
 ('T', 40, 42, 'in', 'OTHER'),
 ('T', 43, 55, 'experimental', 'OTHER'),
 ('T', 56, 69, 'aflatoxicosis', 'OTHER'),
 ('T', 70, 72, 'in', 'OTHER'),
 ('T', 73, 80, 'broiler', 'OTHER'),
 ('T', 81, 90, 'chickens.', 'OTHER'),
 ('A', 0, 13, 'Aflatoxicosis', 'OTHER'),
 ('A', 14, 16, 'is', 'OTHER'),
 ('A', 17, 18, 'a', 'OTHER'),
 ('A', 19, 24, 'cause', 'OTHER'),
 ('A', 25, 27, 'of', 'OTHER'),
 ('A', 28, 36, 'economic', 'OTHER'),
 ('A', 37, 43, 'losses', 'OTHER'),
 ('A', 44, 46, 'in', 'OTHER'),
 ('A', 47, 54, 'broiler', 'OTHER'),
 ('A', 55, 66, 'production.', 'OTHER'),
 ('A', 67, 69, 'In', 'OTHER'),
 ('A', 70, 74, 'this', 'OTHER'),
 ('A', 75, 81, 'study,', 'OTHER'),
 ('A', 82, 85, 'the', 'OTHER'),
 ('A', 86, 92, 'effect', 'OTHER'),
 ('A', 93, 95, 'of', 'OTHER'),
 ('A', 96, 99, 'one', 'OTHER'),
 ('A', 100, 110, 'commercia

In [4]:
# Tally the last field of every annotation record across all abstracts.
# Each element of `annotations` unpacks into (ignored first item, records).
positive_classes = Counter(
    record[-1]
    for _, records in annotations
    for record in records
)
positive_classes

Counter({'ABBREVIATION': 4538,
         'FAMILY': 4090,
         'FORMULA': 4448,
         'IDENTIFIER': 672,
         'MULTIPLE': 202,
         'NO CLASS': 40,
         'SYSTEMATIC': 6656,
         'TRIVIAL': 8832})

In [5]:
# Build raw training samples from the annotated abstracts.
# NOTE(review): the meaning of the positional arguments `5` and `False` is
# defined by pp.generate_training_samples and is not visible here — confirm.
raw_samples, raw_classes = pp.generate_training_samples(
    annotated_abstracts, positive_classes, 5, False
)

# Join tokens within each sample, then pad; `pad` also returns the masks.
joined = pp.join_tokens_in_samples(raw_samples, raw_classes)
padded_samples, padded_classes, masks = pp.pad(*joined)

# One-hot encode samples and classes, then apply the masks to both.
one_hot_samples = pp.encode_one_hot(padded_samples)
one_hot_classes = pp.encode_one_hot(padded_classes)
masked_samples = pp.mask_array(one_hot_samples, masks)
masked_classes = pp.mask_array(one_hot_classes, masks)

# Per-sample class frequencies, counted only over the positions selected by
# each sample's mask; one row per sample.
unmasked_classes = (
    cls_row[mask_row] for cls_row, mask_row in zip(padded_classes, masks)
)
class_counts = np.vstack(
    [np.unique(cls, return_counts=True)[1] for cls in unmasked_classes]
)

In [8]:
# Show the first ten rows of the per-sample class counts.
class_counts[:10]

array([[34, 11],
       [36, 11],
       [33, 11],
       [29, 11],
       [29, 11],
       [26,  9],
       [24,  9],
       [24,  9],
       [18,  9],
       [22,  9]])

In [9]:
# Dimensions of the masked one-hot sample array; axes 1 and 2 are used as the
# network's input shape when the model is built.
masked_samples.shape

(143616, 299, 240)

In [11]:
from keras import backend as k
from keras import losses
from keras import models
from keras import layers

Using TensorFlow backend.


In [None]:
# Sequence-labelling network: two stacked bidirectional LSTM layers followed by
# a per-timestep softmax over the classes.

# Passed to LSTM as its first argument, i.e. the number of units per
# direction (not a number of time steps, despite the name).
n_steps = 200
maxlen = masked_samples.shape[1]
nchar = masked_samples.shape[2]
# Derive the class count from the one-hot targets, the same way maxlen/nchar
# are derived from the inputs, instead of hard-coding it.
n_cls = masked_classes.shape[2]

l_in = layers.Input(shape=(maxlen, nchar), name="l_in")
# Timesteps whose feature vector is entirely 0 are skipped by downstream layers.
l_mask = layers.Masking(mask_value=0, name="l_mask")(l_in)
l_rec1 = layers.Bidirectional(layers.LSTM(n_steps, return_sequences=True, name="l_rec1"))(l_mask)
l_rec2 = layers.Bidirectional(layers.LSTM(n_steps, return_sequences=True, name="l_rec2"))(l_rec1)
l_out = layers.TimeDistributed(
    layers.Dense(n_cls, activation='softmax'), name="l_out")(l_rec2)
net2 = models.Model(l_in, l_out)

# A softmax output trained against one-hot targets calls for categorical
# cross-entropy. binary_crossentropy only coincides with it for exactly two
# classes and makes Keras resolve 'accuracy' to binary accuracy, which becomes
# misleading as soon as n_cls grows.
net2.compile(optimizer='Adam', loss="categorical_crossentropy", metrics=['accuracy'])

# Keep the History object so the loss/accuracy curves can be inspected later.
history = net2.fit(masked_samples, masked_classes, batch_size=200, epochs=50,
                   verbose=1, validation_split=0.1)

Train on 129254 samples, validate on 14362 samples
Epoch 1/50
  2400/129254 [..............................] - ETA: 2769s - loss: 0.6356 - acc: 0.6464

In [43]:
# Compare the first encoded test sample against the network's prediction.
# NOTE(review): `test2_x_encoded` is not defined in any cell visible here, so
# this cell depends on kernel state — confirm where it comes from.
# NOTE(review): the first line prints argmin over the last axis while the
# prediction uses argmax — verify that this asymmetry is intended.
first_sample = test2_x_encoded[:1]
print(first_sample.argmin(axis=2))
predicted = net2.predict(first_sample)
print(predicted.argmax(axis=2))

[[0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
