In [77]:
import sys
from sklearn.model_selection import KFold

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda, Layer
from keras.layers.merge import add
from keras_contrib.layers import CRF
from keras import backend as K

import tensorflow as tf
import tensorflow_hub as hub

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

def instances(fi):
    xseq = []
    yseq = []

    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line means the end of a sentence.
            # Return accumulated sequences, and reinitialize.
            yield xseq, yseq
            xseq = []
            yseq = []
            continue

        # Split the line with TAB characters.
        fields = line.split('\t')

        # Append the item features to the item sequence.
        # fields are:  0=sid, 1=form, 2=span_start, 3=span_end, 4=tag, 5...N = features
        item = fields[5:]
        xseq.append(item)

        # Append the label to the label sequence.
        yseq.append(fields[4])
    #return xseq, yseq

def load_data(fi):
    xtrain = []
    ytrain = []

    for line in fi:
        line = line.strip('\n')
        if not line:
            continue
        # Split the line with TAB characters.
        fields = line.split('\t')

        # Append the item features to the item sequence.
        # fields are:  0=sid, 1=form, 2=span_start, 3=span_end, 4=tag, 5...N = features
        item = fields[5:]
        xtrain.append(item)

        # Append the label to the label sequence.
        ytrain.append(fields[4])
    return xtrain, ytrain



def instances_pred(fi):
    xseq = []
    toks = []

    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line means the end of a sentence.
            # Return accumulated sequences, and reinitialize.
            yield xseq, toks
            xseq = []
            toks = []
            continue

        # Split the line with TAB characters.
        fields = line.split('\t')

        # Append the item features to the item sequence.
        # fields are:  0=sid, 1=form, 2=span_start, 3=span_end, 4=tag, 5...N = features
        item = fields[5:]
        xseq.append(item)

        # Append the label to the label sequence.
        toks.append([fields[0],fields[1],fields[2],fields[3]])


In [78]:

# Create a custom layer that allows us to update weights (lambda layers do not have trainable parameters!)

class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.dimensions)

In [2]:
sentences = []
tags = []

fi = open('results/1/train.cod')

# Read sentences from STDIN, and append them to the trainer.
for xseq, yseq in instances(fi):
    sentences.append(xseq)
    tags.append(yseq)
fi.close()
fi = open('results/1/train.cod')
(xtrain, ytrain) = load_data(fi)
x = pd.DataFrame(xtrain)
y = pd.DataFrame(ytrain)[0].values

max_len = 75
words = list(set(x[0].values))
words.append("ENDPAD")
n_words = len(words)
utags = list(set(y))
n_tags = len(utags)

word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(utags)}

docs = [[w[0] for w in s] for s in sentences]

In [15]:
    X = []
    for seq in docs:
        new_seq = []
        for i in range(max_len):
            try:
                new_seq.append(seq[i])
            except:
                new_seq.append("PADword")
        X.append(new_seq)

    X = np.array(X)
    Y = [[tag2idx[t] for t in tag ]for tag in tags]
    Y = pad_sequences(maxlen=max_len, sequences=Y, padding="post", value=tag2idx["O"])
    Y = np.array(Y)
    Y = Y.reshape(Y.shape[0], Y.shape[1], 1)

In [4]:
print(X.shape)
print(Y.shape)

(5675, 75)
(5675, 75, 1)


In [5]:
sess = tf.Session()
K.set_session(sess)
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

Instructions for updating:
Colocations handled automatically by placer.


W0514 11:19:10.190338 139685490308928 deprecation.py:323] From /home/santiago/.local/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [11]:
batch_size = 25
def ElmoEmbedding(x):
    return elmo_model(inputs={"tokens": tf.squeeze(tf.cast(x,    tf.string)),
                    "sequence_len": tf.constant(batch_size*[max_len])},
                      signature="tokens",
                      as_dict=True)["elmo"]

In [20]:
input_text = Input(shape=(max_len,), dtype=tf.string)
embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(input_text)
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

model = Model(input_text, out)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["categorical_accuracy"])

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0514 11:36:47.485433 139685490308928 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [79]:
input_text = Input(shape=(max_len,), dtype=tf.string)
embedding = ElmoEmbeddingLayer()(input_text)
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

model2 = Model(input_text, out)
model2.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

ValueError: Can not squeeze dim[1], expected a dimension of 1, got 75 for 'elmo_embedding_layer_1/Squeeze' (op: 'Squeeze') with input shapes: [?,75].

In [9]:
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 75)           0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 75, 1024)     0           input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 75, 1024)     6295552     lambda_1[0][0]                   
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 75, 1024)     6295552     bidirectional_1[0][0]            
__________________________________________________________________________________________________
add_1 (Add

In [16]:
X_tr, X_val = X[:200*batch_size], X[-27*batch_size:]
y_tr, y_val = Y[:200*batch_size], Y[-27*batch_size:]
y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
y_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)

In [21]:
print(np.array(X_tr).shape)
print(np.array(y_tr).shape)
print(np.array(X_val).shape)
print(np.array(y_val).shape)

(5000, 75)
(5000, 75, 1)
(675, 75)
(675, 75, 1)


In [75]:
X.shape

(5675, 75)

In [22]:
model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_val), y_val),
                    batch_size=batch_size, epochs=3, verbose=1)

Train on 5000 samples, validate on 675 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f09b3500390>

In [34]:
for xseq,toks in instances_pred(open('results/2/test.cod')):
    #x = pd.DataFrame(xseq)
    #x_test_sent = [[word2idx.get(t[0],0) for t in xseq]]
    docs = [[t[0] for t in xseq]]
    #encoded_docs = t.texts_to_sequences(docs)
    #x_test_sent = pad_sequences(maxlen=max_len, sequences=encoded_docs, padding="post", value=vocab_size-1)
    test = []
    for seq in docs:
        new_seq = []
        for i in range(max_len):
            try:
                new_seq.append(seq[i])
            except:
                new_seq.append("PADword")
        test.append(new_seq)

    prediction = model.predict(np.array(test[0]))
    prediction_max = np.argmax(prediction[k], axis=-1)
    inside = False;
    for k in range(0,len(toks)) :
        y = utags[prediction_max[k]]
        (sid, form, offS, offE) = toks[k]
        
        if (y[0]=="B") :
            entity_form = form
            entity_start = offS
            entity_end = offE
            entity_type = y[2:]
            inside = True
        elif (y[0]=="I" and inside) :
            entity_form += " "+form
            entity_end = offE
        elif (y[0]=="O" and inside) :
            #print(sid, entity_start+"-"+entity_end, entity_form, entity_type, sep="|")
            print(sid + "|" + entity_start+"-"+entity_end + "|" +entity_form + "|" +entity_type + "\n")
            inside = False

    if inside : print(sid + "|" + entity_start+"-"+entity_end + "|" + entity_form + "|" + entity_type + "\n")

ValueError: Error when checking input: expected input_3 to have shape (75,) but got array with shape (1,)

In [73]:
model.predict(np.array(test))

InvalidArgumentError: slice index 1 of dimension 0 out of bounds.
	 [[{{node lambda_3/module_apply_tokens/strided_slice_1}}]]