https://gist.github.com/dirko/1d596ca757a541da96ac3caa6f291229

http://dirko.github.io/Bidirectional-LSTMs-with-Keras/

In [59]:
# author: Keras==1.0.6
# mine: Keras==1.2.1
import numpy as np
import pandas as pd
import re

from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import TimeDistributedDense, Activation
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.layers import Merge
from keras.backend import tf
from keras.metrics import fmeasure # removed in Kereas 2.0 

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.cross_validation import train_test_split

from xml.etree import ElementTree
from xml.dom import minidom

from lambdawithmask import Lambda as MaskLambda

In [2]:
def encode(x, n):
    result = np.zeros(n)
    result[x] = 1
    return result

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
        
def reverse_func(x, mask=None):
    return tf.reverse(x, [False, True, False])

def score1(yh, pr):
    coords = [np.where(yhh > 0)[0][0] for yhh in yh]
    yh = [yhh[co:] for yhh, co in zip(yh, coords)]
    ypr = [prr[co:] for prr, co in zip(pr, coords)]
    fyh = [c for row in yh for c in row]
    fpr = [c for row in ypr for c in row]
    return fyh, fpr

In [60]:
def prettify(elem):
    """Return a pretty-printed XML string for the Element.
    """
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent="  ")

In [3]:
raw = open('train.csv', 'r').readlines()
all_x = []
point = []
for line in raw:
    stripped_line = line.strip().split(',')
    point.append(stripped_line)
    if line == '""\r\n':
#         print "newline"
        all_x.append(point[:-1])
        point = []
all_x = all_x[:-1]
lengths = [len(x) for x in all_x]
# short_x = [x for x in all_x if len(x) < 64]

# split long sections into chucks (a mimic of sentences)
short_x = []
for l in all_x:
    short_x.extend(chunks(l, 64))

In [4]:
len(short_x)

3428

In [5]:
X = [[c[0] for c in x] for x in short_x]
y = [[c[1] for c in y] for y in short_x]

In [6]:
all_text = [c for x in X for c in x]
words = list(set(all_text))
word2ind = {word: index for index, word in enumerate(words)}
ind2word = {index: word for index, word in enumerate(words)}
labels = list(set([c for x in y for c in x]))
# label2ind = {label: (index + 1) for index, label in enumerate(labels)}
# ind2label = {(index + 1): label for index, label in enumerate(labels)}
label2ind = {label: (index) for index, label in enumerate(labels)}
ind2label = {(index): label for index, label in enumerate(labels)}
print 'Input sequence length range: ', max(lengths), min(lengths)

Input sequence length range:  4741 58


In [7]:
ind2label

{0: 'Severity',
 1: 'Negation',
 2: 'O',
 3: 'DrugClass',
 4: 'Animal',
 5: 'Factor',
 6: 'AdverseReaction'}

In [8]:
maxlen = max([len(x) for x in X])
print 'Maximum sequence length:', maxlen

Maximum sequence length: 64


In [9]:
X_enc = [[word2ind[c] for c in x] for x in X]
X_enc_reverse = [[c for c in reversed(x)] for x in X_enc]
max_label = max(label2ind.values()) + 1
# max_label = max(label2ind.values())
y_enc = [[0] * (maxlen - len(ey)) + [label2ind[c] for c in ey] for ey in y]
y_enc = [[encode(c, max_label) for c in ey] for ey in y_enc]

In [10]:
max_label

7

In [11]:
y_enc[0][0]

array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.])

In [12]:
X_enc_f = pad_sequences(X_enc, maxlen=maxlen)
X_enc_b = pad_sequences(X_enc_reverse, maxlen=maxlen)
y_enc = pad_sequences(y_enc, maxlen=maxlen)

In [13]:
(X_train_f, X_test_f, X_train_b,
 X_test_b, y_train, y_test) = train_test_split(X_enc_f, X_enc_b, y_enc,
                                               test_size=11*32, train_size=45*32, random_state=42)
print 'Training and testing tensor shapes:'
print X_train_f.shape, X_test_f.shape, X_train_b.shape, X_test_b.shape, y_train.shape, y_test.shape

Training and testing tensor shapes:
(1440, 64) (352, 64) (1440, 64) (352, 64) (1440, 64, 7) (352, 64, 7)


In [14]:
max_features = len(word2ind)
embedding_size = 128
hidden_size = 32
out_size = len(label2ind) #+ 1

In [15]:
model_forward = Sequential()
model_forward.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
model_forward.add(LSTM(hidden_size, return_sequences=True))  

In [16]:
model_backward = Sequential()
model_backward.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
model_backward.add(LSTM(hidden_size, return_sequences=True))
model_backward.add(MaskLambda(function=reverse_func, mask_function=reverse_func))

In [40]:
model = Sequential()

model.add(Merge([model_forward, model_backward], mode='concat'))
model.add(TimeDistributedDense(out_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[fmeasure])

In [41]:
batch_size = 64 #32
model.fit([X_train_f, X_train_b], y_train, batch_size=batch_size, nb_epoch=15, # started from 40
          validation_data=([X_test_f, X_test_b], y_test))
score = model.evaluate([X_test_f, X_test_b], y_test, batch_size=batch_size)
print('Raw test score:', score)

Train on 1440 samples, validate on 352 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
('Raw test score:', [0.17060929672284561, 0.94905072992498229])


In [44]:
pr = model.predict_classes([X_train_f, X_train_b])
yh = y_train.argmax(2)
fyh, fpr = score1(yh, pr)
print 'Training Micro F1:', f1_score(fyh, fpr, average='micro')
print 'Training confusion matrix:'
print confusion_matrix(fyh, fpr)
print

# calculate F1 without class 'O'
NO_inds = [i for i,v in enumerate(fpr) if v != 2]
fyh = [fyh[i] for i in NO_inds]
fpr = [fpr[i] for i in NO_inds]
print 'Training Micro F1 (without Class O):', f1_score(fyh, fpr, average='micro')
print 'Training confusion matrix:'
print confusion_matrix(fyh, fpr)

Training Micro F1: 0.997880615523
Training confusion matrix:
[[  339     0    26     0     0     0     2]
 [    0     0    33     0     0     7     2]
 [    7     0 82132     0     0     3     9]
 [    6     0    39    45     0     3     6]
 [    0     0     7     0     4     2     1]
 [    1     0    24     0     0   239     2]
 [    1     0     7     0     0     0  5758]]

Training Micro F1 (without Class O): 0.991921702657
Training confusion matrix:
[[ 339    0    0    0    0    0    2]
 [   0    0    0    0    0    7    2]
 [   7    0    0    0    0    3    9]
 [   6    0    0   45    0    3    6]
 [   0    0    0    0    4    2    1]
 [   1    0    0    0    0  239    2]
 [   1    0    0    0    0    0 5758]]


In [45]:
pr = model.predict_classes([X_test_f, X_test_b])
yh = y_test.argmax(2)
fyh, fpr = score1(yh, pr)
print 'Training Micro F1:', f1_score(fyh, fpr, average='micro')
print 'Training confusion matrix:'
print confusion_matrix(fyh, fpr)
print

# calculate F1 without class 'O'
NO_inds = [i for i,v in enumerate(fpr) if v != 2]
fyh = [fyh[i] for i in NO_inds]
fpr = [fpr[i] for i in NO_inds]
print 'Training Micro F1 (without Class O):', f1_score(fyh, fpr, average='micro')
print 'Training confusion matrix:'
print confusion_matrix(fyh, fpr)

Training Micro F1: 0.972244140172
Training confusion matrix:
[[   40     0    37     0     0     0     5]
 [    0     0    12     0     0     1     0]
 [   34     0 19763     0     0    12   131]
 [    2     0    21     1     0     1    10]
 [    0     0     1     0     0     1     1]
 [    0     0    27     0     0    11     0]
 [    2     0   300     0     0     0  1132]]

Training Micro F1 (without Class O): 0.85549132948
Training confusion matrix:
[[  40    0    0    0    0    0    5]
 [   0    0    0    0    0    1    0]
 [  34    0    0    0    0   12  131]
 [   2    0    0    1    0    1   10]
 [   0    0    0    0    0    1    1]
 [   0    0    0    0    0   11    0]
 [   2    0    0    0    0    0 1132]]


In [21]:
ind2label

{0: 'Severity',
 1: 'Negation',
 2: 'O',
 3: 'DrugClass',
 4: 'Animal',
 5: 'Factor',
 6: 'AdverseReaction'}

### Conclusion: 
nb_epoch=15 is the best so far

# Label the predictions

In [46]:
doc = "Classical HL post-auto-HSCT consolidation: neutropenia, peripheral sensory neuropathy, thrombocytopenia, anemia, upper respiratory tract infection, fatigue, peripheral motor neuropathy, nausea, cough, and diarrhea."
x_new = [[m.group(0) for m in re.finditer(r'\w+', doc)]] # tokenize words only

X_new = [[word2ind[c] for c in x] for x in x_new]
X_new_reverse = [[c for c in reversed(x)] for x in X_new]
X_new_f = pad_sequences(X_new, maxlen=maxlen)
X_new_b = pad_sequences(X_new_reverse, maxlen=maxlen)
print X_new_f.shape, X_new_b.shape

(1, 64) (1, 64)


In [47]:
pr = model.predict_classes([X_new_f, X_new_b])
pr



array([[2, 2, 2, 2, 0, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 6, 6, 6, 0, 6, 6, 0,
        2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2,
        2, 2, 6, 6, 6, 6, 6, 6, 2, 6, 2, 6, 6, 6, 2, 6, 6, 6, 2, 6]])

In [48]:
for w, p in zip(x_new[0], pr.tolist()[0][-len(X_new[0]):]):
    print w, ind2label[p]

Classical O
HL O
post O
auto O
HSCT O
consolidation O
neutropenia AdverseReaction
peripheral AdverseReaction
sensory AdverseReaction
neuropathy AdverseReaction
thrombocytopenia AdverseReaction
anemia AdverseReaction
upper O
respiratory AdverseReaction
tract O
infection AdverseReaction
fatigue AdverseReaction
peripheral AdverseReaction
motor O
neuropathy AdverseReaction
nausea AdverseReaction
cough AdverseReaction
and O
diarrhea AdverseReaction


### Issues:
* Cannot identify names with more than one word (how?)
* Two classes are missing in training data (why?)
* How to calculate and improve F1 score? Overfitting? How to tune parameters?
* Imbalanced dataset?

# Write submission file

https://bionlp.nlm.nih.gov/tac2017adversereactions/

In [49]:
testpath = '/Users/jzhu/git/nlp_adversedrug/data/unannotated_xml'
outpath = '/Users/jzhu/git/nlp_adversedrug/data/labeled_unannotated_xml'

In [66]:
test = pd.read_csv('test.csv', header=None)
test.columns = ['file', 'section', 'word', 'start', 'end']
test.shape

(3594380, 5)

In [67]:
test.head()

Unnamed: 0,file,section,word,start,end
0,8MOP,S2,BOXED,6,11
1,8MOP,S2,WARNING,12,19
2,8MOP,S2,Methoxsalen,25,36
3,8MOP,S2,with,37,41
4,8MOP,S2,UV,42,44


In [72]:
all_sec = test[['file', 'section']].drop_duplicates().reset_index(drop=True)
all_sec.shape

(4311, 2)

In [92]:
for index, row in all_sec.iterrows():
    temp = test[(test['file'] == row['file']) & (test['section'] == row['section'])]
    x_new = temp['word']
    
    X_new = [[word2ind[c] if c in word2ind else 0 for c in x] for x in x_new]
    X_new_reverse = [[c for c in reversed(x)] for x in X_new]
    X_new_f = pad_sequences(X_new, maxlen=maxlen)
    X_new_b = pad_sequences(X_new_reverse, maxlen=maxlen)
    
    pr = model.predict_classes([X_new_f, X_new_azb]).tolist()
#     print temp['word']
    for w, p in zip(x_new, pr[-len(X_new):]):
        print w, ind2label[p]
    break

TypeError: argument to reversed() must be a sequence

In [90]:
len(X_new_f[0])

64