In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Dataset splits to process. Each name is used below both as a dict key and as
# the suffix of the SNLI .jsonl file name that gets loaded.
keys = ['train', 'test']

In [3]:
import json

In [4]:
# Read every SNLI split into memory as a list of parsed JSON records
# (the files are JSON-lines: one JSON object per line).
data = {k: [] for k in keys}
for k in keys:
    path = 'SNLI/snli_1.0/snli_1.0_' + k + '.jsonl'
    # The context manager closes the handle deterministically; an explicit
    # encoding keeps decoding independent of the platform default.
    with open(path, encoding='utf-8') as f:
        # Iterate the file lazily rather than via readlines(), so the raw
        # text of the whole file is not held alongside the parsed records.
        for line in f:
            if line.strip():  # tolerate blank / trailing empty lines
                data[k].append(json.loads(line))

In [5]:
from vectorizer import cleaner

In [6]:
# Per-split containers: p <- cleaned 'sentence1', q <- cleaned 'sentence2',
# a <- cleaned 'gold_label'.
p, q, a = {}, {}, {}

for k in keys:
    # Records whose gold label is '-' are dropped from all three lists, so
    # p[k], q[k] and a[k] stay index-aligned.
    labelled = [x for x in data[k] if x['gold_label'] != '-']
    p[k] = [cleaner(x['sentence1']) for x in labelled]
    q[k] = [cleaner(x['sentence2']) for x in labelled]
    a[k] = [cleaner(x['gold_label']) for x in labelled]

In [7]:
from vectorizer import Vectorizer

In [8]:
# Build the project-local Vectorizer.
# NOTE(review): min_df=3 is presumably a minimum-frequency cutoff for keeping a
# word in the vocabulary — confirm against vectorizer.py.
vec = Vectorizer(min_df=3)

In [9]:
# Fit on the training split only (cleaned sentence1 texts followed by cleaned
# sentence2 texts); the test split is not passed to fit().
vec.fit(p['train'] + q['train'])

In [10]:
# Display the vocabulary size reported by the vectorizer after fitting.
vec.vocab_size

20982

In [11]:
# Assign every gold label a stable integer id, and build the reverse lookup.
#
# The labels are sorted before numbering: iterating a bare set of strings
# follows hash order, which can differ from one interpreter run to the next,
# so un-sorted ids would make the pickled vectorizer non-reproducible.
# Labels are gathered from every split in `a` (not just 'test') so that a
# label occurring in only one split still receives an id.
labels = sorted({label for split_labels in a.values() for label in split_labels})

vec.entity2idx = {label: idx for idx, label in enumerate(labels)}
vec.idx2entity = {idx: label for label, idx in vec.entity2idx.items()}

In [12]:
def generate_label_and_filter(a):
    """Return ``(mask, label_id)`` for a single label string.

    ``mask`` is a fresh 1-D array of ones with one entry per label known to
    ``vec.entity2idx`` (i.e. no label is masked out). ``label_id`` is the
    integer that ``vec.entity2idx`` maps ``a`` to; the lookup fails if ``a``
    is not a key of that mapping.
    """
    label_to_id = vec.entity2idx
    all_labels_mask = np.ones((len(label_to_id),))
    return all_labels_mask, label_to_id[a]

In [13]:
# Per-split outputs stored on the vectorizer itself, so that they are saved
# together with it when the object is pickled.
vec.paragraphs, vec.questions = {}, {}
vec.entity_masks, vec.answers = {}, {}

for k in keys:
    # Convert the cleaned texts of this split with the fitted vectorizer.
    vec.paragraphs[k] = vec.texts_to_sequences(p[k])
    vec.questions[k] = vec.texts_to_sequences(q[k])

    # One (mask, label id) pair per example, in the same order as a[k].
    masks, answers = [], []
    for label in a[k]:
        mask, answer = generate_label_and_filter(label)
        masks.append(mask)
        answers.append(answer)
    vec.entity_masks[k] = masks
    vec.answers[k] = answers

In [14]:
from torchtext.vocab import Vectors, GloVe, CharNGram, FastText
# Load the glove.840B.300d word vectors through torchtext's Vectors class.
# NOTE(review): presumably torchtext fetches the archive from `url` when the
# named file is not already available locally — confirm against the installed
# torchtext version.
url = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
vectors = Vectors('glove.840B.300d.txt', url=url)

In [15]:
import numpy as np

# Build the embedding matrix: row i holds the pretrained vector looked up for
# the word that vec.idx2word maps index i to.
vec.word_dim = vectors.dim
n_words = len(vec.idx2word)
vec.embeddings = np.zeros((n_words, vec.word_dim))
for idx, token in vec.idx2word.items():
    vec.embeddings[idx] = vectors[token].numpy()

In [16]:
import pickle

# Persist the populated vectorizer (including the attributes attached to it
# above). The context manager guarantees the file is flushed and closed even
# if pickling raises; the bare open() handle was never closed before.
with open('SNLI/vec_snli.p', 'wb') as f:
    pickle.dump(vec, f)