In [151]:
#!/usr/bin/env python3

import sys
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

def instances(fi):
    xseq = []
    yseq = []

    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line means the end of a sentence.
            # Return accumulated sequences, and reinitialize.
            yield xseq, yseq
            xseq = []
            yseq = []
            continue

        # Split the line with TAB characters.
        fields = line.split('\t')

        # Append the item features to the item sequence.
        # fields are:  0=sid, 1=form, 2=span_start, 3=span_end, 4=tag, 5...N = features
        item = fields[5:]
        xseq.append(item)

        # Append the label to the label sequence.
        yseq.append(fields[4])
    #return xseq, yseq

def load_data(fi):
    xtrain = []
    ytrain = []

    for line in fi:
        line = line.strip('\n')
        if not line:
            continue
        # Split the line with TAB characters.
        fields = line.split('\t')

        # Append the item features to the item sequence.
        # fields are:  0=sid, 1=form, 2=span_start, 3=span_end, 4=tag, 5...N = features
        item = fields[5:]
        xtrain.append(item)

        # Append the label to the label sequence.
        ytrain.append(fields[4])
    return xtrain, ytrain



def instances_pred(fi):
    xseq = []
    toks = []
    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line means the end of a sentence.
            # Return accumulated sequences, and reinitialize.
            yield xseq, toks
            xseq = []
            toks = []
            continue

        # Split the line with TAB characters.
        fields = line.split('\t')

        # Append the item features to the item sequence.
        # fields are:  0=sid, 1=form, 2=span_start, 3=span_end, 4=tag, 5...N = features
        item = fields[5:]
        xseq.append(item)

        # Append the label to the label sequence.
        toks.append([fields[0],fields[1],fields[2],fields[3]])
    #return xseq, toks

def load_glove():
    # load the whole embedding into memory
    embeddings_index = dict()
    f = open('../../../data/glove.6B/glove.6B.100d.txt')
    for line in f:
    	values = line.split()
    	word = values[0]
    	coefs = np.asarray(values[1:], dtype='float32')
    	embeddings_index[word] = coefs
    f.close()
    return embeddings_index


In [135]:
sentences = []
tags = []

fi = open('results/1/train.cod')

# Read sentences from STDIN, and append them to the trainer.
for xseq, yseq in instances(fi):
    sentences.append(xseq)
    tags.append(yseq)
fi.close()
fi = open('results/1/train.cod')
(xtrain, ytrain) = load_data(fi)
x = pd.DataFrame(xtrain)
y = pd.DataFrame(ytrain)[0].values

max_len = 75
words = list(set(x[0].values))
words.append("ENDPAD")
n_words = len(words)
utags = list(set(y))
n_tags = len(utags)

word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(utags)}

docs = [[w[0] for w in s] for s in sentences]

In [141]:
    t = Tokenizer(oov_token='<unw>')
    t.fit_on_texts(docs)
    vocab_size = len(t.word_index) + 1
    encoded_docs = t.texts_to_sequences(docs)
    X = pad_sequences(maxlen=max_len, sequences=encoded_docs, padding="post", value=vocab_size-1)

    embeddings_index = load_glove()
    embedding_matrix = np.zeros((vocab_size, 100))
    #print(t.word_index.items())
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        #else:
        #    embedding_matrix[i] = embeddings_index.get("unknown")
    #Y = [[t for t in tag] for tag in tags]

    input = Input(shape=(max_len,))
    model = Embedding(input_dim=vocab_size, output_dim=100,
                      input_length=max_len, mask_zero=True, weights=[embedding_matrix],
                       trainable=False)(input)  # 20-dim embedding
    #model = Embedding(input_dim=n_words + 1, output_dim=450,
    #                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
    model = Bidirectional(LSTM(units=32, return_sequences=True,
                               recurrent_dropout=0.5))(model)  # variational biLSTM
    model = TimeDistributed(Dense(20, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output

    model = Model(input, out)

In [142]:
#Y = [[tag2idx[tag] for tag in y]]
Y = [[tag2idx[t] for t in tag ]for tag in tags]
Y = pad_sequences(maxlen=max_len, sequences=Y, padding="post", value=tag2idx["O"])
Y = [to_categorical(i, num_classes=n_tags) for i in Y]
print(np.array(Y).shape)

(198, 75, 8)


In [143]:
print(X.shape)
print(np.array(tagged[0]).shape)

(198, 75)
(4197,)


In [144]:
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

history = model.fit(X, np.array(Y), batch_size=32, epochs=5,
                    validation_split=0.1, verbose=1)
hist = pd.DataFrame(history.history)



Train on 178 samples, validate on 20 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [145]:
hist

Unnamed: 0,val_loss,val_crf_viterbi_accuracy,loss,crf_viterbi_accuracy
0,0.258582,0.972,0.792232,0.897828
1,0.156056,0.972,0.239573,0.964419
2,0.133171,0.972,0.173716,0.964494
3,0.122984,0.972,0.156142,0.964494
4,0.11472,0.972,0.145783,0.964494


In [223]:
for xseq,toks in instances_pred(open('results/2/test.cod')):
    #x = pd.DataFrame(xseq)
    #x_test_sent = [[word2idx.get(t[0],0) for t in xseq]]
    docs = [t[0] for t in xseq]
    encoded_docs = t.texts_to_sequences(docs)
    x_test_sent = pad_sequences(maxlen=max_len, sequences=encoded_docs, padding="post", value=vocab_size-1)

    prediction = model.predict(np.array([x_test_sent[0]]))[0]
    prediction = np.argmax(prediction, axis=-1)
    inside = False;
    for k in range(0,len(toks)) :
        y = utags[prediction[k]]
        (sid, form, offS, offE) = toks[k]
        
        if (y[0]=="B") :
            entity_form = form
            entity_start = offS
            entity_end = offE
            entity_type = y[2:]
            inside = True
        elif (y[0]=="I" and inside) :
            entity_form += " "+form
            entity_end = offE
        elif (y[0]=="O" and inside) :
            #print(sid, entity_start+"-"+entity_end, entity_form, entity_type, sep="|")
            print(sid + "|" + entity_start+"-"+entity_end + "|" +entity_form + "|" +entity_type + "\n")
            inside = False

    if inside : print(sid + "|" + entity_start+"-"+entity_end + "|" + entity_form + "|" + entity_type + "\n")

hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola


hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola
hola


In [168]:
pred_sentences = []
tokens = []
for xseq,toks in instances_pred(open('results/2/test.cod')):
    pred_sentences.append(xseq)
    tokens.append(toks)
#(xseq, toks) = instances_pred(open('results/2/test.cod'))
#x = pd.DataFrame(xseq)
#x_test_sent = [[word2idx.get(t[0],0) for t in xseq]]
docs = [[w[0] for w in s] for s in pred_sentences]
encoded_docs = t.texts_to_sequences(docs)
x_test_sent = pad_sequences(maxlen=max_len, sequences=encoded_docs, padding="post", value=vocab_size-1)

prediction = model.predict(x_test_sent)
prediction_max = np.argmax(prediction, axis=-1)
inside = False;
for k in range(len(tokens)) :
    y = utags[prediction[k]]
    (sid, form, offS, offE) = toks[k]

    if (y[0]=="B") :
        entity_form = form
        entity_start = offS
        entity_end = offE
        entity_type = y[2:]
        inside = True
    elif (y[0]=="I" and inside) :
        entity_form += " "+form
        entity_end = offE
    elif (y[0]=="O" and inside) :
        #print(sid, entity_start+"-"+entity_end, entity_form, entity_type, sep="|")
        print(sid + "|" + entity_start+"-"+entity_end + "|" +entity_form + "|" +entity_type + "\n")
        inside = False

    if inside : print(sid + "|" + entity_start+"-"+entity_end + "|" + entity_form + "|" + entity_type + "\n")

TypeError: only integer scalar arrays can be converted to a scalar index

In [217]:
docs

[['No',
  'formal',
  'drug-drug',
  'interaction',
  'studies',
  'were',
  'conducted',
  '.'],
 ['In',
  'Trial',
  '1',
  ',',
  'the',
  'proportions',
  'of',
  'patients',
  'in',
  'the',
  'placebo',
  'and',
  'Synagis',
  'groups',
  'who',
  'received',
  'routine',
  'childhood',
  'vaccines',
  ',',
  'influenza',
  'vaccine',
  ',',
  'bronchodilators',
  'or',
  'corticosteroids',
  'were',
  'similar',
  'and',
  'no',
  'incremental',
  'increase',
  'in',
  'adverse',
  'reactions',
  'was',
  'observed',
  'among',
  'patients',
  'receiving',
  'these',
  'agents',
  '.'],
 ['There',
  'are',
  'so',
  'far',
  'no',
  'reported',
  'adverse',
  'interactions',
  'with',
  'SAMe',
  'and',
  'other',
  'drugs',
  ',',
  'dietary',
  'supplements',
  'or',
  'foods',
  '.'],
 ['-', 'Increased', 'action', 'of', 'oral', 'anticoagulants', '.'],
 ['-',
  'Decreased',
  'absorption',
  'of',
  'colestipol',
  ',',
  'cholestyramine',
  ',',
  'mineral',
  'oil',
  ',',
 

In [215]:
len(prediction[0])

75

In [221]:
prediction[0][0]

array([1., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)