In [71]:
import keras
from functools import reduce
import re
import numpy as np
import nltk
import simplejson as json
from pprint import pprint as pp

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import LSTM
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

In [72]:
def tokenize(sent):
    """Returns the tokens of a sequece"""
    tokens = nltk.word_tokenize(sent)
    tokens = [w.lower() for w in tokens]
    return tokens

In [73]:
with open('data/SciQ dataset/train.json', 'r') as rf:
    train = json.load(rf)
with open('data/SciQ dataset/test.json', 'r') as rf:
    test = json.load(rf)
with open('data/SciQ dataset/valid.json', 'r') as rf:
    valid = json.load(rf)

In [74]:
from random import shuffle

def preprocess(data_in):
    q = []
    s = []
    o = []
    for sample in data_in:
        question = sample['question']
        support = sample['support']
        option1 = (sample['distractor1'], 0)
        option2 = (sample['distractor2'], 0)
        option3 = (sample['distractor3'], 0)
        option4 = (sample['correct_answer'], 1)
        options = [option1, option2, option3, option4]
        shuffle(options)
        print(options)
        q.append(question)
        s.append(support)
        o.append(options)
    return q, s, o

In [75]:
preprocess(train[:3])

[('viruses', 0), ('mesophilic organisms', 1), ('protozoa', 0), ('gymnosperms', 0)]
[('muon effect', 0), ('centrifugal effect', 0), ('tropical effect', 0), ('coriolis effect', 1)]
[('reactive', 0), ('unbalanced', 0), ('exothermic', 1), ('endothermic', 0)]


(['What type of organism is commonly used in preparation of foods such as cheese and yogurt?',
  'What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?',
  'Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always what?'],
 ['Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.',
  'Without Coriolis Effect the global winds would blow north to south or south to north. But Coriolis makes them blow northeast to southwest or the reverse in the Northern Hemisphere. The winds blow northwest to sout

In [76]:
def createVocab(input_data):
    vocab_list = set()
    for sample in input_data:
        s_t = tokenize(sample['support'])
        q_t = tokenize(sample['question'])
        d1_t = tokenize(sample['distractor1'])
        d2_t = tokenize(sample['distractor2'])
        d3_t = tokenize(sample['distractor3'])
        a_t = tokenize(sample['correct_answer'])
        vocab_list |= set(s_t+q_t+d1_t+d2_t+d3_t+a_t)
    vocab_list=sorted(vocab_list)
    vocab_size = len(vocab_list)+3
    vocab = dict((c,i+2) for i,c in enumerate(vocab_list))
    print("Vocab ready")
    return vocab_list, vocab_size, vocab

In [77]:
def get_vectors(input_sent, vocab, vocab_list):
    tokenized = tokenize(input_sent)
    vectorized = []
    for w in tokenized:
        if w in vocab_list:
            vectorized.append(vocab[w])
        else:
            vectorized.append(vocab['UNK_ID'])
    return vectorized

In [78]:
def vectorize_input(data, vocab, vocab_size, support_maxlen, query_maxlen):
    qs = []
    sps = []
    ans = []
    dis1s = []
    dis2s = []
    dis3s = []
    dis4s = []
    for _,sample in enumerate(data):
        q_vect = get_vectors(sample['question'], vocab, vocab_list)
        support_vect = get_vectors(sample['support'], vocab , vocab_list)
        ans_tokens = tokenize(sample['correct_answer'])
        answer_vect = np.zeros(vocab_size)
        for w in ans_tokens:
            if w in vocab_list:
                answer_vect[vocab[w]]=1
            else:
                answer_vect[vocab['UNK_ID']]=1
                
        answer_vect = np.transpose(answer_vect)
                    
                                
        distractor1_vect = get_vectors(sample['distractor1'], vocab , vocab_list)
        distractor2_vect = get_vectors(sample['distractor2'], vocab , vocab_list)
        distractor3_vect = get_vectors(sample['distractor3'], vocab , vocab_list)
        distractor4_vect = get_vectors(sample['correct_answer'], vocab , vocab_list)
        qs.append(q_vect)
        sps.append(support_vect)
        ans.append(answer_vect)
        dis1s.append(distractor1_vect)
        dis2s.append(distractor2_vect)
        dis3s.append(distractor3_vect)
        dis4s.append(distractor4_vect)
    return(pad_sequences(qs, maxlen=query_maxlen),\
           pad_sequences(sps, maxlen=support_maxlen),\
           pad_sequences(dis1s, maxlen=query_maxlen),\
           pad_sequences(dis2s, maxlen=query_maxlen),\
           pad_sequences(dis3s, maxlen=query_maxlen),\
           pad_sequences(dis4s, maxlen=query_maxlen),\
           np.array(ans)
          )

In [79]:
#vectorize_input(train[:5], vocab, vocab_list, 500, 100)

In [80]:
EMBED_SIZE = 300
Q_HIDDEN_SIZE = 100
S_HIDDEN_SIZE = 300
BATCH_SIZE = 32
EPOCHS = 40
print('LSTM/EMBED/SUPPORT/QUERY={0},{1},{2},{3}'.format(LSTM,
                                                    EMBED_SIZE,
                                                    S_HIDDEN_SIZE,
                                                    Q_HIDDEN_SIZE))


LSTM/EMBED/SUPPORT/QUERY=<class 'keras.layers.recurrent.LSTM'>,300,300,100


In [81]:
vocab_list, vocab_size, vocab = createVocab(train+valid+test)

Vocab ready


In [82]:
q,s,d1,d2,d3, d4,a = vectorize_input(train[:3], vocab, vocab_size, S_HIDDEN_SIZE, Q_HIDDEN_SIZE)

In [83]:
support = layers.Input(shape=(S_HIDDEN_SIZE,), dtype='int32')
encoded_support = layers.Embedding(vocab_size, EMBED_SIZE)(support)
encoded_support = layers.Dropout(0.3)(encoded_support)
support_LSTM = LSTM(EMBED_SIZE)(encoded_support)
support_LSTM = layers.RepeatVector(S_HIDDEN_SIZE)(support_LSTM)

In [84]:
question = layers.Input(shape=(Q_HIDDEN_SIZE,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_SIZE)(question)
encoded_question = layers.Dropout(0.3)(encoded_question)
question_LSTM = LSTM(EMBED_SIZE)(encoded_question)
question_LSTM = layers.RepeatVector(S_HIDDEN_SIZE)(question_LSTM)

In [85]:
distractor1 = layers.Input(shape=(Q_HIDDEN_SIZE,), dtype='int32')
encoded_distractor1 = layers.Embedding(vocab_size, EMBED_SIZE)(distractor1)
encoded_distractor1 = layers.Dropout(0.3)(encoded_distractor1)
distractor1_LSTM = LSTM(EMBED_SIZE)(encoded_distractor1)
distractor1_LSTM = layers.RepeatVector(S_HIDDEN_SIZE)(distractor1_LSTM)

In [86]:
distractor2 = layers.Input(shape=(Q_HIDDEN_SIZE,), dtype='int32')
encoded_distractor2 = layers.Embedding(vocab_size, EMBED_SIZE)(distractor2)
encoded_distractor2 = layers.Dropout(0.3)(encoded_distractor2)
distractor2_LSTM = LSTM(EMBED_SIZE)(encoded_distractor2)
distractor2_LSTM = layers.RepeatVector(S_HIDDEN_SIZE)(distractor2_LSTM)

In [87]:
distractor3 = layers.Input(shape=(Q_HIDDEN_SIZE,), dtype='int32')
encoded_distractor3 = layers.Embedding(vocab_size, EMBED_SIZE)(distractor3)
encoded_distractor3 = layers.Dropout(0.3)(encoded_distractor3)
distractor3_LSTM = LSTM(EMBED_SIZE)(encoded_distractor3)
distractor3_LSTM = layers.RepeatVector(S_HIDDEN_SIZE)(distractor3_LSTM)

In [88]:
distractor4 = layers.Input(shape=(Q_HIDDEN_SIZE,), dtype='int32')
encoded_distractor4 = layers.Embedding(vocab_size, EMBED_SIZE)(distractor4)
encoded_distractor4 = layers.Dropout(0.3)(encoded_distractor4)
distractor4_LSTM = LSTM(EMBED_SIZE)(encoded_distractor4)
distractor4_LSTM = layers.RepeatVector(S_HIDDEN_SIZE)(distractor4_LSTM)

In [89]:
MatchLSTM_layer = layers.add([support_LSTM, question_LSTM])
rankerLSTM_layer = layers.add([distractor1_LSTM, distractor2_LSTM, distractor3_LSTM, distractor4_LSTM])
merged = layers.add([MatchLSTM_layer , rankerLSTM_layer])
match_LSTM = LSTM(EMBED_SIZE)(merged)
match_LSTM = layers.Dropout(0.3)(match_LSTM)
predictions = layers.Dense(vocab_size, activation='softmax')(match_LSTM)
print(predictions)

TypeError: softmax() got an unexpected keyword argument 'axis'

In [37]:
model = Model([support, question], predictions)
model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

NameError: name 'predictions' is not defined

In [None]:
print('Training')
model.fit([s,q], a,
         batch_size=BATCH_SIZE,
         epochs=EPOCHS)