In [2]:
import re
import os
import numpy as np
import keras
from bs4 import BeautifulSoup
import wget

Using TensorFlow backend.


Open up the files and define the labels

In [2]:
with open('SemEval2010_task8_training/TRAIN_FILE.TXT','r') as f:
    s = f.read()
    sentences = s.splitlines()[0::4]
    x_train =[sentence.split('\t')[1].strip('"') for sentence in sentences]
    y_train = s.splitlines()[1::4]

with open('SemEval2010_task8_testing_keys/TEST_FILE_CLEAN.TXT','r') as f:
    s = f.read()
    x_test = []
    for line in s.splitlines():
        d = line.split('\t')[1].strip('"')
        x_test.append(d)

with open('../SemEval2010_task8_testing_keys/TEST_FILE_KEY.TXT','r') as f:
    s = f.read()
    y_test = [k.split('\t')[1] for k in s.splitlines()]

label_index = {'Cause-Effect': 0,
 'Component-Whole': 1,
 'Content-Container': 2,
 'Entity-Destination': 3,
 'Entity-Origin': 4,
 'Instrument-Agency': 5,
 'Member-Collection': 6,
 'Message-Topic': 7,
 'Other': 8,
 'Product-Producer': 9}

We need to preprocess the labels as well as the data

In [8]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
tokenizer = Tokenizer()
def clean_text(sentence_list): 
    soups = [BeautifulSoup(sen,"html5lib") for sen in sentence_list]
    clean_text = [soup.text for soup in soups]
    e1_words = [soup.find('e1').text.lower() for soup in soups]
    e2_words = [soup.find('e2').text.lower() for soup in soups]
    clean_text = [' '.join(text_to_word_sequence(clean)) for clean in clean_text]
    return [clean_text,e1_words,e2_words]

In [9]:
MAX_LEN = 85
def pad(sequence):
    return keras.preprocessing.sequence.pad_sequences(sequence,maxlen=MAX_LEN)

def preprocess(params, tokenizer = tokenizer):  
    [clean_train, e1, e2] = params
    token_sentences = tokenizer.texts_to_sequences(clean_train)
    token_e1 = tokenizer.texts_to_sequences(e1)
    token_e2 = tokenizer.texts_to_sequences(e2)
    e1_one_hots = []
    for sentence,e1 in zip(token_sentences,token_e1):
        s1 = []
        for word in sentence:
            if word in e1:
                s1.append(1)
            else:
                s1.append(0)
        e1_one_hots.append(s1)

    e2_one_hots = []
    for sentence,e2 in zip(token_sentences,token_e2):
        s1 = []
        for word in sentence:
            if word in e2:
                s1.append(1)
            else:
                s1.append(0)
        e2_one_hots.append(s1)
    padded_sentences = pad(token_sentences)
    padded_e1 = pad(token_e1)
    padded_e2 = pad(token_e2)
    return [padded_sentences, padded_e1, padded_e2]

def preprocess_labels(target, label_index=label_index):
    reduced_labels = [t.replace('(e2,e1)','') for t in target]
    reduced_labels = [t.replace('(e1,e2)','') for t in reduced_labels]
    labels = [label_index[i] for i in reduced_labels]
    labels = keras.utils.to_categorical(labels)
    return labels

In [11]:
x_train = clean_text(x_train)
tokenizer.fit_on_texts(x_train[0])

x_train = preprocess(x_train)
x_test = preprocess(clean_text(x_test))

y_train = preprocess_labels(y_train)
y_test = preprocess_labels(y_test)

Download a word embeddings Matrix (GLOVE)

In [5]:
GLOVE_URL = 'https://drive.google.com/uc?export=download&id=0B30g1WfHiiY-QloxTldQTkxVelU'
if not 'glove.6B.100d.txt' in os.listdir('.'):
    print("word2vec matrix not found. Downloading...")
    wget.download(GLOVE_URL)
else:
    print("Loading word2vec matrix...")

Loading word2vec matrix


In [14]:
embeddings_index = {}
f = open('glove.6B.100d.txt'),'rb')
for line in f:
    values = line.split()
    word = values[0].decode('utf-8')
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 100))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
print("Train Data:")
print("Sentences: {}".format(x_train[0].shape))
print("Labels: {}".format(y_train.shape))

print("Test Data:")
print("Sentences: {}".format(x_test[0].shape))
print("Labels: {}".format(y_train.shape))

Train Data:
Sentences: (8000, 85)
Labels: (8000, 10)
Test Data:
Sentences: (2717, 85)
Labels: (8000, 10)


# LSTM RNN Model with e1 and e2 as aux inputs

The sentence input is being passed along with e1 and e2 as auxillary vectors. 

Eg: 

```arrayed <e1>configuration</e1> of antenna <e2>elements</e2>
``` 

will be sent as 

|  |   |   |   |   |   
|---|---|---|---|---|---
||arrayed|configuration|of|antenna|elements|
|*token*|10|44|5|24|104|
|*e1*|0|1|0|0|0|
|*e2*|0|0|0|0|1|


In [35]:
sentence_input = keras.layers.Input(shape=(MAX_LEN,), name="sentence_input")
e1 = keras.layers.Input(shape=(MAX_LEN,), name="e1_input")
e1r = keras.layers.Reshape((MAX_LEN,1), name="e1_reshape")(e1)
e2 = keras.layers.Input(shape=(MAX_LEN,), name="e2_input")
e2r = keras.layers.Reshape((MAX_LEN,1), name="e2_reshape")(e2)
embed = keras.layers.Embedding(len(word_index)+1,100,weights=[embedding_matrix],input_length=MAX_LEN, name="word2vec")
vector_sentence = embed(sentence_input)
merged = keras.layers.concatenate([vector_sentence,e1r,e2r], name="merge")
x = keras.layers.Dropout(0.2)(merged)
x = keras.layers.LSTM(128,return_sequences=True,name="LSTM_1")(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.LSTM(32)(x)
x = keras.Dropout
prediction = keras.layers.Dense(10, activation='softmax', name="prediction")(x)
model = keras.models.Model([sentence_input,e1,e2],prediction)

In [36]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
sentence_input (InputLayer)      (None, 85)            0                                            
____________________________________________________________________________________________________
e1_input (InputLayer)            (None, 85)            0                                            
____________________________________________________________________________________________________
e2_input (InputLayer)            (None, 85)            0                                            
____________________________________________________________________________________________________
word2vec (Embedding)             (None, 85, 100)       1960600     sentence_input[0][0]             
___________________________________________________________________________________________

In [37]:
model.fit(x_train,y_train, validation_data=(x_test,y_test), epochs=50)

Train on 8000 samples, validate on 2717 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1680dd91f60>

# References
1. http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
2. https://keras.io/getting-started/functional-api-guide/
3. https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [40]:
sentence_input = keras.layers.Input(shape=(MAX_LEN,), name="sentence_input")
e1 = keras.layers.Input(shape=(MAX_LEN,), name="e1_input")
e1r = keras.layers.Reshape((MAX_LEN,1), name="e1_reshape")(e1)
e2 = keras.layers.Input(shape=(MAX_LEN,), name="e2_input")
e2r = keras.layers.Reshape((MAX_LEN,1), name="e2_reshape")(e2)
embed = keras.layers.Embedding(len(word_index)+1,100,weights=[embedding_matrix],input_length=MAX_LEN, name="word2vec")
vector_sentence = embed(sentence_input)
merged = keras.layers.concatenate([vector_sentence,e1r,e2r], name="merge")
x = keras.layers.Dropout(0.2)(merged)
x = keras.layers.LSTM(128,return_sequences=True,name="LSTM_1")(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.LSTM(32)(x)
x = keras.layers.Dropout(0.2)(x)
prediction = keras.layers.Dense(10, activation='softmax', name="prediction")(x)
model = keras.models.Model([sentence_input,e1,e2],prediction)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
sentence_input (InputLayer)      (None, 85)            0                                            
____________________________________________________________________________________________________
e1_input (InputLayer)            (None, 85)            0                                            
____________________________________________________________________________________________________
e2_input (InputLayer)            (None, 85)            0                                            
____________________________________________________________________________________________________
word2vec (Embedding)             (None, 85, 100)       1960600     sentence_input[0][0]             
___________________________________________________________________________________________

In [None]:
model.fit(x_train,y_train, validation_data=(x_test,y_test), epochs=50)

Train on 8000 samples, validate on 2717 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50