In [58]:
import re
import os
import numpy as np
import keras
from bs4 import BeautifulSoup
import sys
from six.moves.urllib.request import urlretrieve

Open up the files and define the labels

In [59]:
with open('SemEval2010_task8_training/TRAIN_FILE.TXT','r') as f:
    s = f.read()
    sentences = s.splitlines()[0::4]
    x_train =[sentence.split('\t')[1].strip('"') for sentence in sentences]
    y_train = s.splitlines()[1::4]

with open('SemEval2010_task8_testing_keys/TEST_FILE_CLEAN.TXT','r') as f:
    s = f.read()
    x_test = []
    for line in s.splitlines():
        d = line.split('\t')[1].strip('"')
        x_test.append(d)

with open('SemEval2010_task8_testing_keys/TEST_FILE_KEY.TXT','r') as f:
    s = f.read()
    y_test = [k.split('\t')[1] for k in s.splitlines()]

label_index = {'Cause-Effect': 0,
 'Component-Whole': 1,
 'Content-Container': 2,
 'Entity-Destination': 3,
 'Entity-Origin': 4,
 'Instrument-Agency': 5,
 'Member-Collection': 6,
 'Message-Topic': 7,
 'Other': 8,
 'Product-Producer': 9}

MAX_LEN = 85

# Preprocessing
We need to convert the data such that it makes more sense for a machine learning algorithm.

## Step 1 : Extract the e1 and e2 words

**Input :** `They saw that the <e1>equipment</e1> was put inside rollout <e2>drawers</e2>`.

**Output :** 

`They saw that the equipment was put inside rollout drawers.
[equipment, drawers]`

The sentence with the markup `<e1>.....</e1>` and `<e2>.....</e2>` are first converted to normal text. The e1 and e2 words are stored for each data sample in a separate array.

In [62]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
tokenizer = Tokenizer()
def clean_text(sentence_list): 
    soups = [BeautifulSoup(sen,"html5lib") for sen in sentence_list]
    clean_text = [soup.text for soup in soups]
    e1_words = [soup.find('e1').text.lower() for soup in soups]
    e2_words = [soup.find('e2').text.lower() for soup in soups]
    clean_text = [' '.join(text_to_word_sequence(clean)) for clean in clean_text]
    return [clean_text,e1_words,e2_words]

## Step 2: Remove all stopwords and punctuation
**Input :**  
`They saw that the equipment was put inside rollout drawers. 
[equipment, drawers]`

**Output :**  
`They saw that the equipment was put inside rollout drawers 
[equipment, drawers]`

## Step 3: Tokenize the words

**Input :**  They saw that the equipment was put inside rollout drawers. [equipment, drawers]

**Output :** 

`[23, 54, 65, 1, 1022, 55, 66, 65, 1156, 502] 
[1022, 502]`

All the words are given an index and are converted to numbers for easier processing

## Step 4: Make a one-hot-encoding of e1 and e2 words (Almost one-hot. There are some e1 and e2 phrases

**Input :** 

`[23, 54, 65, 1, 1022, 55, 66, 65, 1156, 502] 
[1022, 502]`

**Output :** 

`[23, 54, 65, 1, 1022, 55, 66, 65, 1156, 502] 
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, , 0, 0, 0, 0, 1]` 

In [63]:
def preprocess(params, tokenizer = tokenizer):  
    [clean_train, e1, e2] = params
    token_sentences = tokenizer.texts_to_sequences(clean_train)
    token_e1 = tokenizer.texts_to_sequences(e1)
    token_e2 = tokenizer.texts_to_sequences(e2)
    e1_one_hots = []
    for sentence,e1 in zip(token_sentences,token_e1):
        s1 = []
        for word in sentence:
            if word in e1:
                s1.append(1)
            else:
                s1.append(0)
        e1_one_hots.append(s1)

    e2_one_hots = []
    for sentence,e2 in zip(token_sentences,token_e2):
        s1 = []
        for word in sentence:
            if word in e2:
                s1.append(1)
            else:
                s1.append(0)
        e2_one_hots.append(s1)
    padded_sentences = pad(token_sentences)
    padded_e1 = pad(token_e1)
    padded_e2 = pad(token_e2)
    return [padded_sentences, padded_e1, padded_e2]

## Step 5: Pad all sequences to equal lengths (85 words)

**Input : **

`[23, 54, 65, 1, 1022, 55, 66, 65, 1156, 502] 
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, , 0, 0, 0, 0, 1]`

**Output :**

`[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 54, 65, 1, 1022, 55, 66, 65, 1156, 502]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]`

In [64]:
def pad(sequence):
    return keras.preprocessing.sequence.pad_sequences(sequence,maxlen=MAX_LEN)

## Step 6: Convert labels into one-hot-encoded vectors

**Input: **
`Cause-Effect`

**Output: **
`[1,0,0,0,0,0,0,0,0,0]
`

In [65]:
def preprocess_labels(target, label_index=label_index):
    reduced_labels = [t.replace('(e2,e1)','') for t in target]
    reduced_labels = [t.replace('(e1,e2)','') for t in reduced_labels]
    labels = [label_index[i] for i in reduced_labels]
    labels = keras.utils.to_categorical(labels)
    return labels

Let's run all this now. This might take some time depending on the performance of your system.

In [66]:
x_train = clean_text(x_train)
tokenizer.fit_on_texts(x_train[0])

x_train = preprocess(x_train)
x_test = preprocess(clean_text(x_test))

y_train = preprocess_labels(y_train)
y_test = preprocess_labels(y_test)

print("Train Data:")
print("Sentences: {}".format(x_train[0].shape))
print("Labels: {}".format(y_train.shape))
print()
print("Test Data:")
print("Sentences: {}".format(x_test[0].shape))
print("Labels: {}".format(y_train.shape))

Train Data:
Sentences: (8000, 85)
Labels: (8000, 10)

Test Data:
Sentences: (2717, 85)
Labels: (8000, 10)


# Data augmentation

In [76]:
np.unique(y_train,return_counts=1, axis=0)

(array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]),
 array([ 717, 1410,  634,  690,  504,  716,  845,  540,  941, 1003], dtype=int64))

# Word Embedding
Word embeddings convert words into vectors that give it semantic meaning. These vectors are dense representation of the words and make more sense to a machine learning program. [Here](https://www.tensorflow.org/tutorials/word2vec) is a great article to get started. 

**Word Embeddings are essential, for our model to perform well**


Let's get the GLOVE word embeddings Matrix as a txt file. Download it [here](https://drive.google.com/uc?export=download&id=0B30g1WfHiiY-MmN2dVVkdnV1S2M). Place it in the same directory as this notebook.

In [15]:
link = "https://drive.google.com/uc?export=download&id=0B30g1WfHiiY-MmN2dVVkdnV1S2M"
file_name = "glove.6B.100d.txt"

if not file_name in os.listdir('.'):
    print("File not found in the directory.\nPlease download %s" % link)
    print("Make sure to place it in the same directory as this notebook")
else:
    print("File Found. Loading word2vec matrix...")

embeddings_index = {}
f = open('glove.6B.100d.txt','rb')
for line in f:
    values = line.split()
    word = values[0].decode('utf-8')
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 100))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print("Done.")

File Found. Loading word2vec matrix...
Done.


# The Model : Deep LSTM RNN Model

## Layer 1: Input
The first layer of our network will take one input : **The main sentence**

In [17]:
sentence_input = keras.layers.Input(shape=(MAX_LEN,), name="sentence_input")

## Layer 2: Word Embeddings
The sentence input will be converted to a vector based on the word2vec matrix that we obtained earlier

In [18]:
embed_layer = keras.layers.Embedding(len(word_index)+1,100,weights=[embedding_matrix],input_length=MAX_LEN, name="word2vec")
vector_sentence = embed_layer(sentence_input)

##  Auxillary Inputs

We will feed the e1 and e2 positional vectors to the word embeddings vectors

In [19]:
e1 = keras.layers.Input(shape=(MAX_LEN,), name="e1_input")
e1r = keras.layers.Reshape((MAX_LEN,1), name="e1_reshape")(e1)
e2 = keras.layers.Input(shape=(MAX_LEN,), name="e2_input")
e2r = keras.layers.Reshape((MAX_LEN,1), name="e2_reshape")(e2)

## Layer 3: Merge Layer
This concatanates the values from the the two e1 and e2 vectors as auxillary input with the word embedding vectors.

`merged layer = e1 +e2 + word embedding`

In [20]:
merged = keras.layers.concatenate([vector_sentence,e1r,e2r], name="merge")

## Layer 4: Dropout
To prevent overfitting, half of the incoming inputs will be sqashed to 0. Yeah. It's savage and it's true.

In [21]:
x = keras.layers.Dropout(0.5)(merged)

## Layer 5: LSTM Neurons 1
We added 128 LSTM neurons that returns requences into the next LSTM array

In [22]:
x = keras.layers.LSTM(128,return_sequences=True,name="LSTM_1")(x)

## Layer 6: Dropout
Oh but before that. Dropout.

In [23]:
x = keras.layers.Dropout(0.5)(x)

## Layer 7: LSTM Neurons 2
Another 32 LSTM neurons that recieve from the first LSTM layer

In [24]:
x = keras.layers.LSTM(32)(x)

## Layer 8: Dropout
Of course

In [25]:
x = keras.layers.Dropout(0.5)(x)

## Layer 9: Softmax
The final layer that outputs our prediction is a softmax layer with 10 neurons. One for each category.

In [26]:
prediction = keras.layers.Dense(10, activation='softmax', name="prediction")(x)

## Compile
Our model is ready. Let's compile it.

In [27]:
model = keras.models.Model([sentence_input,e1,e2],prediction)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
sentence_input (InputLayer)      (None, 85)            0                                            
____________________________________________________________________________________________________
e1_input (InputLayer)            (None, 85)            0                                            
____________________________________________________________________________________________________
e2_input (InputLayer)            (None, 85)            0                                            
____________________________________________________________________________________________________
word2vec (Embedding)             (None, 85, 100)       1960600     sentence_input[0][0]             
___________________________________________________________________________________________

We have 2,099,810 trainable parameters. That's a lot of room for over-fitting.

## Let's train
We can start the training process. All the datasamples from the train folder are being used. 

The test samples are **not** used in the training process.

In [32]:
# Remove validation data
model.fit(x_train,y_train, validation_data=(x_test,y_test), epochs=20)

Train on 8000 samples, validate on 2717 samples
Epoch 1/20
Epoch 2/20
1024/8000 [==>...........................] - ETA: 58s - loss: 1.7887 - acc: 0.3604

KeyboardInterrupt: 

In [67]:
def format_predict(array):
    reverse_label = {v:k for k,v in label_index.items()}
    return reverse_label[np.argmax(prediction)]

In [70]:
a = 8000
for prediction in model.predict(x_test)[:50]:
    a+=1
    print(a,format_predict(prediction))

8001 Other
8002 Entity-Destination
8003 Other
8004 Entity-Destination
8005 Cause-Effect
8006 Component-Whole
8007 Other
8008 Other
8009 Content-Container
8010 Message-Topic
8011 Entity-Destination
8012 Product-Producer
8013 Entity-Destination
8014 Other
8015 Content-Container
8016 Other
8017 Other
8018 Other
8019 Member-Collection
8020 Other
8021 Message-Topic
8022 Content-Container
8023 Other
8024 Member-Collection
8025 Cause-Effect
8026 Entity-Destination
8027 Other
8028 Other
8029 Message-Topic
8030 Other
8031 Cause-Effect
8032 Message-Topic
8033 Component-Whole
8034 Cause-Effect
8035 Component-Whole
8036 Component-Whole
8037 Other
8038 Content-Container
8039 Cause-Effect
8040 Cause-Effect
8041 Message-Topic
8042 Component-Whole
8043 Message-Topic
8044 Entity-Destination
8045 Cause-Effect
8046 Content-Container
8047 Content-Container
8048 Content-Container
8049 Message-Topic
8050 Member-Collection


# Improvements
* Data augmentation to make the classes equal
* 

# References
1. http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
2. https://keras.io/getting-started/functional-api-guide/
3. https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html