In [None]:
import keras
from functools import reduce
import re
import numpy as np
import nltk
import json
from pprint import pprint as pp

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import LSTM
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model

In [2]:
def tokenize(sent):
    """Return the lowercase word tokens of `sent`, split with nltk.word_tokenize."""
    return [token.lower() for token in nltk.word_tokenize(sent)]

In [3]:
def _load_json(path):
    """Read the JSON file at `path` and return the parsed object."""
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which is not UTF-8 on every system.
    with open(path, 'r', encoding='utf-8') as rf:
        return json.load(rf)


# The three dataset splits, each parsed from its own JSON file.
train = _load_json('data/SciQ dataset/train.json')
test = _load_json('data/SciQ dataset/test.json')
valid = _load_json('data/SciQ dataset/valid.json')

In [4]:
from random import shuffle

def preprocess(data_in):
    """Split raw records into question / support / option lists plus labels.

    Each record must provide the keys 'question', 'support', 'distractor1',
    'distractor2', 'distractor3' and 'correct_answer'. The four answer
    options of every record are shuffled so the correct answer does not
    always sit in the same position.

    Returns:
        (X, labels) where
        X is a dict with keys 'questions', 'support' and 'options'; each
        value is a list with one entry per record, and every 'options'
        entry is a 4-tuple of option texts in shuffled order.
        labels is a list of 4-tuples aligned with X['options']: 1 marks the
        correct answer, 0 marks a distractor.
    """
    questions = []
    supports = []
    option_texts = []
    option_labels = []
    for sample in data_in:
        # Labels are one-hot (0/1): they are later used as targets for a
        # softmax output trained with categorical cross-entropy, which
        # expects a one-hot target rather than -1 for the wrong options.
        options = [
            (sample['distractor1'], 0),
            (sample['distractor2'], 0),
            (sample['distractor3'], 0),
            (sample['correct_answer'], 1),
        ]
        shuffle(options)
        questions.append(sample['question'])
        supports.append(sample['support'])
        option_texts.append(tuple(text for text, _ in options))
        option_labels.append(tuple(label for _, label in options))
    X = {'questions': questions, 'support': supports, 'options': option_texts}
    return X, option_labels

In [7]:
def createVocab(input_data):
    """Build the word-to-id vocabulary from a list of raw records.

    Every record must provide the text fields 'support', 'question',
    'distractor1', 'distractor2', 'distractor3' and 'correct_answer'.

    Returns:
        (vocab_list, vocab_size, vocab) where
        vocab_list is the sorted list of distinct tokens,
        vocab_size is len(vocab_list) + 3,
        vocab maps each token to an integer id starting at 2, plus the
        special key 'UNK_ID' mapped to 1.
    """
    text_fields = ('support', 'question',
                   'distractor1', 'distractor2', 'distractor3',
                   'correct_answer')
    vocab_list = set()
    for sample in input_data:
        for field in text_fields:
            vocab_list.update(tokenize(sample[field]))
    vocab_list = sorted(vocab_list)
    vocab_size = len(vocab_list) + 3
    # Word ids start at 2: id 0 is left for padding and id 1 for unknown words.
    vocab = {word: index + 2 for index, word in enumerate(vocab_list)}
    # get_vectors() looks up vocab['UNK_ID'] for out-of-vocabulary tokens, so
    # the key has to exist. tokenize() lowercases every token, so this
    # upper-case key cannot collide with a real word.
    vocab['UNK_ID'] = 1
    print("Vocab ready")
    return vocab_list, vocab_size, vocab

In [8]:
# Build the vocabulary over the train, validation and test records together,
# so that every token occurring in any of the three splits receives an id.
vocab_list, vocab_size, vocab = createVocab(train+valid+test)

Vocab ready


In [9]:
def get_vectors(input_sent, vocab, vocab_list):
    """Convert a sentence into the list of integer ids of its tokens.

    Args:
        input_sent: raw sentence; it is split with tokenize().
        vocab: dict mapping token -> integer id.
        vocab_list: kept only so existing callers keep working. Membership
            is now decided by a constant-time lookup in `vocab` instead of a
            linear scan of this list for every token.

    Returns:
        A list with one id per token. Tokens missing from `vocab` map to
        vocab['UNK_ID'] when that key exists, otherwise to 1.
    """
    # Fallback 1 is the id createVocab leaves unassigned: word ids start at 2
    # and 0 is the default padding value of pad_sequences.
    unk_id = vocab.get('UNK_ID', 1)
    return [vocab.get(token, unk_id) for token in tokenize(input_sent)]

In [10]:
def vectorize_input(X, y, vocab, vocab_size, support_maxlen, query_maxlen):
    """Turn the output of preprocess() into padded integer arrays.

    Args:
        X: dict with keys 'questions', 'support' and 'options'; every entry
            of X['options'] must hold four option texts.
        y: one sequence of four labels per sample.
        vocab: dict mapping token -> integer id.
        vocab_size: accepted for interface compatibility; not used here.
        support_maxlen: padded length of the support sequences.
        query_maxlen: padded length of the question and option sequences.

    Returns:
        A 7-tuple of arrays produced by pad_sequences, in this order:
        questions, support, option 1, option 2, option 3, option 4, labels.
    """
    def encode(sent):
        # Pass `vocab` as the membership container too, instead of reading
        # the module-level `vocab_list`: the function then depends only on
        # its arguments, and `token in vocab` is a dict lookup, not a list scan.
        return get_vectors(sent, vocab, vocab)

    labels = [np.asarray(l_list) for l_list in y]
    qs = [encode(sent) for sent in X['questions']]
    sps = [encode(sent) for sent in X['support']]
    # One list per option position (0..3), each holding one vector per sample.
    option_columns = [
        [encode(sample_options[position]) for sample_options in X['options']]
        for position in range(4)
    ]
    return (
        pad_sequences(qs, maxlen=query_maxlen),
        pad_sequences(sps, maxlen=support_maxlen),
        pad_sequences(option_columns[0], maxlen=query_maxlen),
        pad_sequences(option_columns[1], maxlen=query_maxlen),
        pad_sequences(option_columns[2], maxlen=query_maxlen),
        pad_sequences(option_columns[3], maxlen=query_maxlen),
        pad_sequences(labels, 4),
    )

In [11]:
# Training schedule.
EPOCHS = 40
BATCH_SIZE = 32

# Model dimensions. EMBED_SIZE is used as the embedding width and as the
# LSTM unit count. S_HIDDEN_SIZE and Q_HIDDEN_SIZE are used as the padded
# sequence lengths of the support input and of the question/option inputs.
EMBED_SIZE = 300
S_HIDDEN_SIZE = 300
Q_HIDDEN_SIZE = 100

config_banner = 'LSTM/EMBED/SUPPORT/QUERY={0},{1},{2},{3}'
print(config_banner.format(LSTM, EMBED_SIZE, S_HIDDEN_SIZE, Q_HIDDEN_SIZE))


LSTM/EMBED/SUPPORT/QUERY=<class 'keras.layers.recurrent.LSTM'>,300,300,100


In [12]:
# Support branch: token ids -> embedding -> dropout -> LSTM, whose output
# vector is then repeated S_HIDDEN_SIZE times.
support = layers.Input(name='support_input', dtype='int32', shape=(S_HIDDEN_SIZE,))
encoded_support = layers.Dropout(0.3)(
    layers.Embedding(vocab_size, EMBED_SIZE)(support)
)
support_LSTM = layers.RepeatVector(S_HIDDEN_SIZE)(
    LSTM(EMBED_SIZE)(encoded_support)
)

In [13]:
# Question branch: token ids -> embedding -> dropout -> LSTM. The LSTM output
# is repeated S_HIDDEN_SIZE times, the same repeat count the support branch uses.
question = layers.Input(name='question_input', dtype='int32', shape=(Q_HIDDEN_SIZE,))
encoded_question = layers.Dropout(0.3)(
    layers.Embedding(vocab_size, EMBED_SIZE)(question)
)
question_LSTM = layers.RepeatVector(S_HIDDEN_SIZE)(
    LSTM(EMBED_SIZE)(encoded_question)
)

In [14]:
# First answer-option branch: token ids -> embedding -> dropout -> LSTM.
distractor1 = layers.Input(name='d1_input', dtype='int32', shape=(Q_HIDDEN_SIZE,))
encoded_distractor1 = layers.Dropout(0.3)(
    layers.Embedding(vocab_size, EMBED_SIZE)(distractor1)
)
distractor1_LSTM = LSTM(EMBED_SIZE)(encoded_distractor1)

In [15]:
# Second answer-option branch: token ids -> embedding -> dropout -> LSTM.
# The input is named explicitly, like 'support_input', 'question_input' and
# 'd1_input', so it is identifiable in the model summary and plot.
distractor2 = layers.Input(shape=(Q_HIDDEN_SIZE,), dtype='int32', name='d2_input')
encoded_distractor2 = layers.Embedding(vocab_size, EMBED_SIZE)(distractor2)
encoded_distractor2 = layers.Dropout(0.3)(encoded_distractor2)
distractor2_LSTM = LSTM(EMBED_SIZE)(encoded_distractor2)

In [16]:
# Third answer-option branch: token ids -> embedding -> dropout -> LSTM.
# The input is named explicitly, like 'support_input', 'question_input' and
# 'd1_input', so it is identifiable in the model summary and plot.
distractor3 = layers.Input(shape=(Q_HIDDEN_SIZE,), dtype='int32', name='d3_input')
encoded_distractor3 = layers.Embedding(vocab_size, EMBED_SIZE)(distractor3)
encoded_distractor3 = layers.Dropout(0.3)(encoded_distractor3)
distractor3_LSTM = LSTM(EMBED_SIZE)(encoded_distractor3)

In [17]:
# Fourth answer-option branch: token ids -> embedding -> dropout -> LSTM.
# The input is named explicitly, like 'support_input', 'question_input' and
# 'd1_input', so it is identifiable in the model summary and plot.
distractor4 = layers.Input(shape=(Q_HIDDEN_SIZE,), dtype='int32', name='d4_input')
encoded_distractor4 = layers.Embedding(vocab_size, EMBED_SIZE)(distractor4)
encoded_distractor4 = layers.Dropout(0.3)(encoded_distractor4)
distractor4_LSTM = LSTM(EMBED_SIZE)(encoded_distractor4)

In [18]:
# Combine the repeated support and question encodings element-wise.
MatchLSTM_layer = layers.add([support_LSTM, question_LSTM])

# Add each option encoding to the combined support/question tensor.
# NOTE(review): the option encodings are not repeated like the other two, so
# this presumably relies on the merge layer broadcasting them - confirm.
option1_match, option2_match, option3_match, option4_match = (
    layers.add([MatchLSTM_layer, option_encoding])
    for option_encoding in (distractor1_LSTM, distractor2_LSTM,
                            distractor3_LSTM, distractor4_LSTM)
)

# Sum the four option tensors, run an LSTM with dropout over the result and
# finish with a 4-way softmax (one output per option position).
merged = layers.add([option1_match, option2_match, option3_match, option4_match])
ranker_LSTM = layers.Dropout(0.3)(LSTM(EMBED_SIZE)(merged))
predictions = layers.Dense(4, activation='softmax')(ranker_LSTM)
print(predictions)

Tensor("dense_1/Softmax:0", shape=(?, 4), dtype=float32)


In [19]:
# Redundant debug output: the previous cell already ends with this same
# print of the `predictions` tensor.
print(predictions)

Tensor("dense_1/Softmax:0", shape=(?, 4), dtype=float32)


In [20]:
# Assemble the model: six integer-sequence inputs (support, question and the
# four answer options) mapped to the 4-way softmax `predictions`.
model = Model([support, question, distractor1, distractor2, distractor3, distractor4], predictions)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()
plot_model(model, to_file='Final_Model.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
support_input (InputLayer)      (None, 300)          0                                            
__________________________________________________________________________________________________
question_input (InputLayer)     (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     9234600     support_input[0][0]              
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 300)     9234600     question_input[0][0]             
__________________________________________________________________________________________________
dropout_1 

NameError: name 'plot_model' is not defined

In [21]:
# Configure training: categorical cross-entropy loss on the softmax output,
# Adam optimizer, accuracy reported as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
print('Training')

# Shuffle the options of each training record, then convert to padded id arrays.
X, y = preprocess(train)
q, s, d1, d2, d3, d4, a = vectorize_input(
    X, y, vocab, vocab_size,
    support_maxlen=S_HIDDEN_SIZE,
    query_maxlen=Q_HIDDEN_SIZE,
)

# Input order must match the Model definition: support first, then question,
# then the four options.
model.fit(
    x=[s, q, d1, d2, d3, d4],
    y=a,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Training


In [None]:
# Evaluate on the validation split, prepared exactly like the training data.
X_valid, y_valid = preprocess(valid)
vq, vs, vd1, vd2, vd3, vd4, va = vectorize_input(
    X_valid, y_valid, vocab, vocab_size, S_HIDDEN_SIZE, Q_HIDDEN_SIZE)
# Input order must match the Model definition: support first, then question,
# then the four options.
loss, acc = model.evaluate([vs, vq, vd1, vd2, vd3, vd4], va,
                           batch_size=BATCH_SIZE)
# The numbers come from the validation split, so the message says so; the
# previous text labelled them as test results.
print('Validation loss / validation accuracy = {:.4f} / {:.4f}'.format(loss, acc))