In [1]:
from matplotlib import pyplot as plt
from keras.models import model_from_json
import json

import keras
from functools import reduce
import re
import numpy as np
import nltk
import json
from pprint import pprint as pp

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import LSTM
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model

Using TensorFlow backend.


In [8]:
# Rebuild the network architecture from its JSON description. The context
# manager guarantees the file handle is closed even if read() raises.
with open('Model_Files/model_2_epochs_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("Model_Files/model_2_epochs.h5")
print("Loaded model from disk")

Loaded model from disk


In [9]:
def tokenize(sent):
    """Return the lower-cased NLTK word tokens of a sentence."""
    return [token.lower() for token in nltk.word_tokenize(sent)]

In [10]:
# Load the three SciQ splits from JSON. Later cells read the fields
# 'question', 'support', 'distractor1'..'distractor3' and 'correct_answer'
# from each element of these containers.
with open('data/SciQ dataset/train.json', 'r') as rf:
    train = json.load(rf)
with open('data/SciQ dataset/test.json', 'r') as rf:
    test = json.load(rf)
with open('data/SciQ dataset/valid.json', 'r') as rf:
    valid = json.load(rf)

In [11]:
from random import shuffle

def preprocess(data_in):
    """Turn raw samples into model inputs plus per-option labels.

    For every sample the three distractors (label -1) and the correct
    answer (label 1) are put in a random order with ``shuffle``; the
    option texts and their labels are stored as parallel 4-tuples.

    Args:
        data_in: iterable of dicts with the keys 'question', 'support',
            'distractor1', 'distractor2', 'distractor3', 'correct_answer'.

    Returns:
        (X, labels) where X is a dict with the keys 'questions', 'support'
        and 'options' (one list entry per sample) and labels is a list of
        4-tuples aligned with X['options'].
    """
    questions, supports, option_texts, option_labels = [], [], [], []
    for sample in data_in:
        question = sample['question']
        support = sample['support']
        candidates = [
            (sample['distractor1'], -1),
            (sample['distractor2'], -1),
            (sample['distractor3'], -1),
            (sample['correct_answer'], 1),
        ]
        # Randomise the position of the correct answer among the options.
        shuffle(candidates)
        # Unzip the (text, label) pairs into two aligned tuples.
        texts, marks = zip(*candidates)
        questions.append(question)
        supports.append(support)
        option_texts.append(texts)
        option_labels.append(marks)
    X = {'questions': questions, 'support': supports, 'options': option_texts}
    return X, option_labels

In [12]:
def get_vectors(input_sent, vocab, vocab_list):
    """Convert a sentence into a list of integer token ids.

    Args:
        input_sent: raw sentence; it is split with ``tokenize``.
        vocab: dict mapping token -> integer id.
        vocab_list: retained for signature compatibility only. Membership is
            tested against ``vocab`` itself (constant-time dict lookup)
            instead of scanning this list for every token; it is assumed to
            hold the same tokens as the keys of ``vocab``.

    Returns:
        List with one id per token. Tokens missing from ``vocab`` map to
        ``vocab['UNK_ID']`` when that key exists, otherwise to 1.
    """
    # createVocab in this notebook never adds an 'UNK_ID' entry, so the old
    # vocab['UNK_ID'] lookup raised KeyError on the first unknown token.
    # createVocab numbers tokens from 2, leaving ids 0 and 1 unassigned, so 1
    # is used as the fallback.
    # NOTE(review): confirm id 1 is the out-of-vocabulary id the model was
    # trained with.
    unk_id = vocab.get('UNK_ID', 1)
    return [vocab.get(w, unk_id) for w in tokenize(input_sent)]

In [13]:
def vectorize_input(X, y, vocab, vocab_size, support_maxlen, query_maxlen):
    """Vectorize preprocessed samples into padded id sequences and label arrays.

    Args:
        X: dict with the keys 'questions', 'support' and 'options'; each
            entry of 'options' holds (at least) four option strings.
        y: per-sample label sequences with (at least) four entries each.
        vocab: dict mapping token -> integer id.
        vocab_size: not used by this function; kept so existing calls work.
        support_maxlen: ``maxlen`` given to pad_sequences for the support text.
        query_maxlen: ``maxlen`` given to pad_sequences for the question and
            for each of the four option columns.

    Returns:
        Tuple ``(questions, support, option1, option2, option3, option4,
        labels)``. The first six items are the pad_sequences results; labels
        is a list of four numpy arrays, one per option position.
    """
    n_options = 4

    def encode(sentence):
        # Pass ``vocab`` itself as the membership container: this removes the
        # dependency on a global ``vocab_list`` defined in another cell and
        # makes the ``in`` test a dict lookup rather than a list scan.
        return get_vectors(sentence, vocab, vocab)

    # One label array per option position (column-wise view of y).
    labels = [np.array([label_list[pos] for label_list in y])
              for pos in range(n_options)]

    qs = [encode(sent) for sent in X['questions']]
    sps = [encode(sent) for sent in X['support']]

    # One list of id sequences per option position, filled in a single pass.
    option_columns = [[] for _ in range(n_options)]
    for sample_options in X['options']:
        for pos in range(n_options):
            option_columns[pos].append(encode(sample_options[pos]))

    return (pad_sequences(qs, maxlen=query_maxlen),
            pad_sequences(sps, maxlen=support_maxlen),
            pad_sequences(option_columns[0], maxlen=query_maxlen),
            pad_sequences(option_columns[1], maxlen=query_maxlen),
            pad_sequences(option_columns[2], maxlen=query_maxlen),
            pad_sequences(option_columns[3], maxlen=query_maxlen),
            labels
            )

In [14]:
def createVocab(input_data):
    """Build the token vocabulary over every text field of the samples.

    Args:
        input_data: iterable of dicts with the keys 'support', 'question',
            'distractor1', 'distractor2', 'distractor3', 'correct_answer'.

    Returns:
        (vocab_list, vocab_size, vocab): the sorted list of distinct tokens,
        ``len(vocab_list) + 3``, and a dict mapping each token to its
        position in vocab_list plus 2 (ids 0 and 1 are left unassigned).
    """
    text_fields = ('support', 'question', 'distractor1', 'distractor2',
                   'distractor3', 'correct_answer')
    seen_tokens = set()
    for sample in input_data:
        for field in text_fields:
            seen_tokens.update(tokenize(sample[field]))
    vocab_list = sorted(seen_tokens)
    vocab_size = len(vocab_list) + 3
    vocab = {token: position + 2 for position, token in enumerate(vocab_list)}
    print("Vocab ready")
    return vocab_list, vocab_size, vocab
# Build the vocabulary over the concatenation of all three splits, so tokens
# from train, valid and test all receive an id.
vocab_list, vocab_size, vocab = createVocab(train+valid+test)

Vocab ready


In [15]:
# EMBED_SIZE, BATCH_SIZE and EPOCHS are not referenced by any other visible
# cell; presumably left over from the training notebook.
EMBED_SIZE = 300
# NOTE(review): despite the names, the two *_HIDDEN_SIZE constants are passed
# below as query_maxlen / support_maxlen, i.e. as padded sequence lengths.
Q_HIDDEN_SIZE = 100
S_HIDDEN_SIZE = 300
BATCH_SIZE = 32
EPOCHS = 40
# preprocess() shuffles option order with an unseeded RNG, so y_valid differs
# between runs.
X_valid, y_valid = preprocess(valid)
# va receives the list of four per-option label arrays.
vq,vs,vd1,vd2,vd3,vd4,va = vectorize_input(X_valid, y_valid, vocab, vocab_size, S_HIDDEN_SIZE, Q_HIDDEN_SIZE)
# Inputs are fed support first, then question, then the four options;
# presumably this matches the loaded model's input order (verify).
p_op1,p_op2, p_op3, p_op4 = model.predict([vs,vq, vd1,vd2,vd3,vd4])

In [16]:
# Regroup the four per-option outputs into one [o1, o2, o3, o4] list per sample.
predictions = [list(sample_scores) for sample_scores in zip(p_op1, p_op2, p_op3, p_op4)]

In [17]:
def threshold_acc(predictions, y_valid, threshold=0):
    """Percentage of samples whose correct option scores above ``threshold``.

    Args:
        predictions: per-sample sequences of option scores, indexed as
            ``predictions[sample][option]``.
        y_valid: per-sample label sequences; an option labelled 1 is the
            correct answer.
        threshold: value the correct option's score must strictly exceed to
            count as a hit. Defaults to 0, the previously hard-coded value.

    Returns:
        ``100 * hits / len(y_valid)`` as a float, or 0.0 when ``y_valid`` is
        empty (previously a ZeroDivisionError).
    """
    if len(y_valid) == 0:
        return 0.0
    correct_count = 0
    for sample_ind, sample_y in enumerate(y_valid):
        for ind, op in enumerate(sample_y):
            # Only the score at the position of the correct answer matters.
            if op == 1 and predictions[sample_ind][ind] > threshold:
                correct_count += 1
    return 100 * correct_count / float(len(y_valid))
# Share of validation samples whose correct option received a score above 0.
print(threshold_acc(predictions,y_valid))

0.0


In [18]:
def get_predicted_labels(predictions):
    """Convert per-option scores into 1 / -1 label tuples.

    For each sample the highest-scoring option is labelled 1 and every other
    option -1. On ties the first highest-scoring option wins.

    Args:
        predictions: iterable of per-sample score sequences. Any indexable
            sequence with a length works (list, tuple, numpy row), not only
            lists, and the number of options is taken from each sequence
            rather than fixed at four.

    Returns:
        List of label tuples, one per sample, each as long as its scores.

    Raises:
        ValueError: if a sample has no scores.
    """
    predicted_labels = []
    for pred in predictions:
        n_options = len(pred)
        # max() returns the first maximal index, so ties resolve to the
        # earliest option.
        best = max(range(n_options), key=lambda i: pred[i])
        predicted_labels.append(
            tuple(1 if i == best else -1 for i in range(n_options)))
    return predicted_labels

In [19]:
def highest_val_correct_acc(predictions, y_valid):
    """Percentage of samples whose top-scoring option is the correct one.

    The scores are converted to label tuples with ``get_predicted_labels``
    and compared for equality with the true label tuples in ``y_valid``.

    Args:
        predictions: per-sample sequences of option scores.
        y_valid: per-sample true label tuples.

    Returns:
        ``100 * matches / len(y_valid)`` as a float.
    """
    y_pred = get_predicted_labels(predictions)
    matches = sum(1 for truth, guess in zip(y_valid, y_pred) if truth == guess)
    return 100 * matches / float(len(y_valid))
# Share of validation samples where the highest-scoring option is the correct one.
print(highest_val_correct_acc(predictions, y_valid))

25.8


In [None]:
# Repeat the preprocessing / vectorization / prediction steps on the training split.
X, y = preprocess(train)
q,s,d1,d2,d3,d4,a = vectorize_input(X, y, vocab, vocab_size, S_HIDDEN_SIZE, Q_HIDDEN_SIZE)
# NOTE: this rebinds p_op1..p_op4, which held the validation outputs; the
# `predictions` list built earlier is not refreshed by this cell.
p_op1,p_op2, p_op3, p_op4 = model.predict([s,q,d1,d2,d3,d4])