In [1]:
import json
import nltk
import numpy as np

import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
with open('data/SciQ dataset/train.json', 'r') as rf:
    train = json.load(rf)

In [3]:
def tokenize(sequence):
    tokens = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sequence)]
#     return list(map(lambda x:x.encode('utf8'), tokens))
    return tokens

In [4]:
## Glove stuff - Pass pretrained embeddings as input
filename = 'data/glove.6B.50d.txt'
def loadGloVe(filename):
    rev_vocab = []
    embd = []
    file = open(filename,'r')
    for line in file.readlines():
        row = line.strip().split(' ')
        rev_vocab.append(row[0])
        embd.append(row[1:])
    print('Loaded GloVe!')
    file.close()
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab,rev_vocab,embd
# vocab,rev_vocab,embd = loadGloVe(filename)
# vocab_size = len(vocab)
# embedding_dim = len(embd[0])
# embedding = np.asarray(embd)


In [5]:
def _pad_sequences(sequences, pad_tok, max_length):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
    Returns:
        a list of list where each sublist has same length
    """
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
        sequence_padded +=  [seq_]
        sequence_length += [min(len(seq), max_length)]

    return np.array(sequence_padded), np.array(sequence_length)

def pad_sequences(sequences, pad_tok):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
    Returns:
        a list of list where each sublist has same length
    """
    max_length = max([len(x) for x in sequences])
    sequence_padded, sequence_length = _pad_sequences(sequences, 
                                            pad_tok, max_length)

    return sequence_padded, sequence_length

In [9]:
supportTokens = []
questionTokens = []
distractor1Tokens = []
distractor2Tokens = []
distractor3Tokens = []
question_embed = []
UNK_ID = 2


class Pipeline:
    
    def __init__(self):
        self.vocab,self.rev_vocab,embd = loadGloVe(filename)
        self.vocab_size = len(self.vocab)
        self.embedding_dim = len(embd[0])
        self.embedding = np.asarray(embd)
        self.setup_placeholders()
    
    def setup_placeholders(self):
        self.question_ids = tf.placeholder(tf.int32, shape = [None, None], name = "question_ids")
        self.support_ids = tf.placeholder(tf.int32, shape = [None, None], name = "support_ids")
        self.distractor1_ids = tf.placeholder(tf.int32, shape = [None, None], name = "distractor1_ids")
        self.distractor2_ids = tf.placeholder(tf.int32, shape = [None, None], name = "distractor2_ids")
        self.distractor3_ids = tf.placeholder(tf.int32, shape = [None, None], name = "distractor3_ids")
        self.answer_ids = tf.placeholder(tf.int32, shape = [None, None], name = "answer_ids")

        self.question_lengths = tf.placeholder(tf.int32, shape=[None], name="question_lengths")
        self.support_lengths = tf.placeholder(tf.int32, shape = [None], name = "support_lengths")
        self.distractor1_lengths = tf.placeholder(tf.int32, shape = [None], name = "distractor1_lengths")
        self.distractor2_lengths = tf.placeholder(tf.int32, shape = [None], name = "distractor2_lengths")
        self.distractor3_lengths = tf.placeholder(tf.int32, shape = [None], name = "distractor3_lengths")
        self.answer_lengths = tf.placeholder(tf.int32, shape = [None], name = "answer_lengths")
        
        self.dropout = tf.placeholder(tf.float32, shape=[], name = "dropout")
    
    def get_feed_dict(self, input_data, dropout_val = 1.0):
        """ Convert input data to list of ids for each sample."""
        _support_ids = []
        _question_ids = []
        _distractor1_ids = []
        _distractor2_ids = []
        _distractor3_ids = []
        _answer_ids = []
        
        for index in range (len(input_data)):
            supportTokens = tokenize(train[index]['support'])
            questionTokens = tokenize(train[index]['question'])
            distractor1Tokens = tokenize(train[index]['distractor1'])
            distractor2Tokens = tokenize(train[index]['distractor2'])
            distractor3Tokens = tokenize(train[index]['distractor3'])
            answerTokens = tokenize(train[index]['correct_answer'])

            _support_ids.append([str(self.vocab.get(w, UNK_ID)) for w in supportTokens])
            _question_ids.append([str(self.vocab.get(w, UNK_ID)) for w in questionTokens])
            _distractor1_ids.append([str(self.vocab.get(w, UNK_ID)) for w in distractor1Tokens])
            _distractor2_ids.append([str(self.vocab.get(w, UNK_ID)) for w in distractor2Tokens])
            _distractor3_ids.append([str(self.vocab.get(w, UNK_ID)) for w in distractor3Tokens])
            _answer_ids.append([str(self.vocab.get(w, UNK_ID)) for w in answerTokens])
        
        ## Padding
        padded_questions, question_lengths = pad_sequences(_question_ids, 0)
        padded_support, support_lengths = pad_sequences(_support_ids, 0)
        padded_distractor1, distractor1_lengths = pad_sequences(_distractor1_ids, 0)
        padded_distractor2, distractor2_lengths = pad_sequences(_distractor2_ids, 0)
        padded_distractor3, distractor3_lengths = pad_sequences(_distractor3_ids, 0)
        padded_answer, answer_lengths = pad_sequences(_answer_ids, 0)

        feed = {
            self.question_ids:padded_questions, self.question_lengths:question_lengths,
            self.support_ids:padded_support, self.support_lengths:support_lengths,
            self.distractor1_ids:padded_distractor1, self.distractor1_lengths:distractor1_lengths,
            self.distractor2_ids:padded_distractor1, self.distractor2_lengths:distractor2_lengths,
            self.distractor3_ids:padded_distractor1, self.distractor3_lengths:distractor3_lengths,
            self.answer_ids:padded_answer, self.answer_lengths:answer_lengths
        }
        
        return feed
    
    def setup_word_embeddings(self):
        '''
            Create an embedding matrix (initialised with pretrained glove vectors and updated only if self.config.train_embeddings is true)
            lookup into this matrix and apply dropout (which is 1 at test time and self.config.dropout at train time)
        '''
        with tf.variable_scope("vocab_embeddings"):
            question_emb = tf.nn.embedding_lookup(self.embedding, self.question_ids, name = "question") # (-1, Q, D)
            support_emb = tf.nn.embedding_lookup(self.embedding, self.support_ids, name = "support") # (-1, P, D)
            distractor1_emb = tf.nn.embedding_lookup(self.embedding, self.distractor1_ids, name = "distractor1") # (-1, P, D)
            distractor2_emb = tf.nn.embedding_lookup(self.embedding, self.distractor2_ids, name = "distractor2")
            distractor3_emb = tf.nn.embedding_lookup(self.embedding, self.distractor3_ids, name = "distractor3")
            answer_emb = tf.nn.embedding_lookup(self.embedding, self.answer_ids, name = "answer")
            # Apply dropout
            self.question = tf.nn.dropout(question_emb, self.dropout)
            self.support = tf.nn.dropout(support_emb, self.dropout)
            self.distractor1 = tf.nn.dropout(distractor1_emb, self.dropout)
            self.distractor2 = tf.nn.dropout(distractor2_emb, self.dropout)
            self.distractor3 = tf.nn.dropout(distractor3_emb, self.dropout)
            self.answer = tf.nn.dropout(answer_emb, self.dropout)
    
    

In [10]:
pipe = Pipeline()

Loaded GloVe!


In [11]:
feed = pipe.get_feed_dict(train[:1])
print(feed)

{<tf.Tensor 'question_ids_1:0' shape=(?, ?) dtype=int32>: array([['2', '1554', '3', '18181', '14', '3659', '180', '6', '4789', '3',
        '5008', '125', '19', '5795', '5', '19302', '188']], dtype='<U5'), <tf.Tensor 'question_lengths_1:0' shape=(?,) dtype=int32>: array([17]), <tf.Tensor 'support_ids_1:0' shape=(?, ?) dtype=int32>: array([['2', '2274', '254', '6', '3230', '3987', '1', '2681', '118', '2',
        '5', '2', '23', '2', '5', '2', '24', '2', '2', '32', '456',
        '238', '756', '6', '46', '13', '0', '1741', '3', '3790', '46',
        '68', '2430', '2', '2', '17665', '552', '3987', '3', '109',
        '33598', '2', '14', '2', '23', '2', '24', '1', '0', '1973',
        '473', '719', '3987', '2', '2', '12290', '33', '481', '2054',
        '6', '565', '4789', '1', '144', '5795', '1', '19302', '1',
        '4178', '5', '2760', '2']], dtype='<U5'), <tf.Tensor 'support_lengths_1:0' shape=(?,) dtype=int32>: array([72]), <tf.Tensor 'distractor1_ids_1:0' shape=(?, ?) dtype=int32>:

In [None]:
pipe.setup_word_embeddings()

  if params is None or params in ((), []):


In [None]:

question_embed.append(tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(question_ids)))))
#     support_embed.append(tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(support_ids)))))
#     answer_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(answerTokens[index]))))
#     distractor1_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(distractor1Tokens[index]))))
#     distractor2_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(distractor2Tokens[index]))))
#     distractor3_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(distractor3Tokens[index]))))

In [67]:
some_list = tokenize(train[0]['support'])
print(some_list)

['Mesophiles', 'grow', 'best', 'in', 'moderate', 'temperature', ',', 'typically', 'between', '25°C', 'and', '40°C', '(', '77°F', 'and', '104°F', ')', '.', 'Mesophiles', 'are', 'often', 'found', 'living', 'in', 'or', 'on', 'the', 'bodies', 'of', 'humans', 'or', 'other', 'animals', '.', 'The', 'optimal', 'growth', 'temperature', 'of', 'many', 'pathogenic', 'mesophiles', 'is', '37°C', '(', '98°F', ')', ',', 'the', 'normal', 'human', 'body', 'temperature', '.', 'Mesophilic', 'organisms', 'have', 'important', 'uses', 'in', 'food', 'preparation', ',', 'including', 'cheese', ',', 'yogurt', ',', 'beer', 'and', 'wine', '.']


In [None]:
from pprint import pprint as pp
sess = tf.Session()
W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]),
                trainable=False, name="W")
embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
embedding_init = W.assign(embedding_placeholder)
sess.run(embedding_init, feed_dict={embedding_placeholder: embedding})

In [None]:
# 
from tensorflow.contrib import learn
#init vocab processor
vocab_processor = learn.preprocessing.VocabularyProcessor(len(train))
#fit the vocab from glove
pretrain = vocab_processor.fit(rev_vocab)
#transform inputs
for index in range (len(train)):
    question_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(questionTokens[index]))))
    support_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(supportTokens[index]))))
    answer_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(answerTokens[index]))))
    distractor1_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(distractor1Tokens[index]))))
    distractor2_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(distractor2Tokens[index]))))
    distractor3_embed = tf.nn.embedding_lookup(W, np.array(list(vocab_processor.transform(distractor3Tokens[index]))))

In [13]:
## Pass tokens (question/answer/support) as input

In [8]:
import tensorflow as tf
dataset = tf.data.Dataset

SyntaxError: invalid syntax (<ipython-input-8-3ea2b6e15d03>, line 2)