In [1]:
import os
import json
import torch
import torchtext as text
import sys
import tqdm
from datasets import load_dataset
import pandas as pd
import string
from torchtext.data import get_tokenizer
from vocab import *
from utils import *
from constants import *
import pickle

from tqdm.notebook import tqdm

In [2]:
dataset = load_dataset("squad")

train_dic = {'passage': [], 'question': [], 'answer': []}
for i in range(len(dataset['train'])):
    datum = dataset['train'][i]
    for j in range(len(datum['answers']['text'])):
        train_dic['passage'].append(datum['context'])
        train_dic['question'].append(datum['question'])
        train_dic['answer'].append(datum['answers']['text'][j])

train = pd.DataFrame(train_dic)

val_dic = {'passage': [], 'question': [], 'answer': []}
for datum in dataset['validation']:
    for elem in datum['answers']['text']:
        ans_id = 0
        val_dic['passage'].append(datum['context'])
        val_dic['question'].append(datum['question'])
        val_dic['answer'].append(elem)

val = pd.DataFrame(val_dic)

Reusing dataset squad (/tmp/xdg-cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
for i in tqdm(range(len(train))):
    row = train.iloc[i]
    passage = clean_text(row['passage'].lower())
    question = clean_text(row['question'].lower())
    answer = clean_text(row['answer'].lower())
    
    train.iloc[i]['passage'] = passage
    train.iloc[i]['question'] = question
    train.iloc[i]['answer'] = answer
    
for i in tqdm(range(len(val))):
    row = val.iloc[i]
    passage = clean_text(row['passage'].lower())
    question = clean_text(row['question'].lower())
    answer = clean_text(row['answer'].lower())
    
    val.iloc[i]['passage'] = passage
    val.iloc[i]['question'] = question
    val.iloc[i]['answer'] = answer

  0%|          | 0/87599 [00:00<?, ?it/s]

  0%|          | 0/34726 [00:00<?, ?it/s]

In [4]:
#Split Val into val and test

val = val.sample(frac=1).reset_index(drop=True)
test = val[:10000]
val = val[10000:]

In [5]:
train.to_csv('./data/train.csv', index=False)
val.to_csv('./data/val.csv', index=False)
test.to_csv('./data/test.csv', index=False)

In [6]:
vocab = build_vocab()

Building Vocabulary
Saved the vocab.


In [7]:
vocab = load_vocab()

loaded vocab


In [9]:
vocab.word2idx

{'<pad>': 0,
 '<start>': 1,
 '<end>': 2,
 '<unk>': 3,
 'architecturally': 4,
 ',': 5,
 'the': 6,
 'school': 7,
 'has': 8,
 'a': 9,
 'catholic': 10,
 'character': 11,
 '.': 12,
 'atop': 13,
 'main': 14,
 'building': 15,
 "'": 16,
 's': 17,
 'gold': 18,
 'dome': 19,
 'is': 20,
 'golden': 21,
 'statue': 22,
 'of': 23,
 'virgin': 24,
 'mary': 25,
 'immediately': 26,
 'in': 27,
 'front': 28,
 'and': 29,
 'facing': 30,
 'it': 31,
 'copper': 32,
 'christ': 33,
 'with': 34,
 'arms': 35,
 'upraised': 36,
 'legend': 37,
 'venite': 38,
 'ad': 39,
 'me': 40,
 'omnes': 41,
 'next': 42,
 'to': 43,
 'basilica': 44,
 'sacred': 45,
 'heart': 46,
 'behind': 47,
 'grotto': 48,
 'marian': 49,
 'place': 50,
 'prayer': 51,
 'reflection': 52,
 'replica': 53,
 'at': 54,
 'lourdes': 55,
 'france': 56,
 'where': 57,
 'reputedly': 58,
 'appeared': 59,
 'saint': 60,
 'bernadette': 61,
 'soubirous': 62,
 '1858': 63,
 'end': 64,
 'drive': 65,
 '(': 66,
 'direct': 67,
 'line': 68,
 'that': 69,
 'connects': 70,
 'thr

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [18]:
embed_mat = np.random.randn(len(vocab), 50)
with open('glove.6B.50d.txt', 'r') as f:
    for line in f.readlines():
        info = line.split(' ')
        word = info[0]
        
        if word not in vocab.word2idx.keys():
            continue
        idx = vocab.word2idx[word]
        embed_mat[idx] = np.array(info[1:], dtype='float')

In [19]:
embed_mat

array([[-0.04179372,  0.14245786,  0.85440594, ...,  0.78135183,
         1.72925882,  1.06683433],
       [-0.6771004 ,  0.65312005, -0.48709166, ..., -0.36167298,
         1.05446974,  0.83227104],
       [ 0.12267943,  0.37359626,  0.98606022, ...,  1.57750448,
         1.50833715,  0.2324562 ],
       ...,
       [ 0.38266   , -1.0635    ,  0.88956   , ...,  0.2639    ,
        -0.48329   ,  0.32257   ],
       [-0.01372   , -0.68474   , -0.39111   , ...,  0.3543    ,
         0.52935   ,  0.63621   ],
       [-0.34231483, -0.05806565,  0.51640498, ...,  3.08573788,
        -0.25297485,  0.10809747]])

In [22]:
np.save('embeddings.npy', embed_mat)

In [None]:
# def get_processed_data_glove(df, tokenizer):
#         data = []
#         for idx in tqdm(range(len(df))):
#             pass_tokens = ['<start>'] + tokenizer(df.iloc[idx]["passage"]) + ['<end>']
#             ans_tokens = ['<start>'] + tokenizer(df.iloc[idx]["answer"]) + ['<end>']
#             q_tokens = ['<start>'] + tokenizer(df.iloc[idx]["question"]) + ['<end>']

#             pass_len = MAX_PASSAGE_LEN + 2 # +2 for start and end tokens
#             ans_len = MAX_ANSWER_LEN + 2
#             q_len = MAX_QUESTION_LEN + 2
            
#             pass_pad = ['<pad>'] * (MAX_PASSAGE_LEN + 2)
#             ans_pad = ['<pad>'] * (MAX_ANSWER_LEN + 2)
#             q_pad = ['<pad>'] * (MAX_QUESTION_LEN + 2)
            
#             pass_pad[:len(pass_tokens)] = pass_tokens
#             ans_pad[:len(ans_tokens)] = ans_tokens
#             q_pad[:len(q_tokens)] = q_tokens
            
#             passage_idxs = [vocab(word) for word in pass_pad]
#             answer_idxs = [vocab(word) for word in ans_pad]
#             question_idxs = [vocab(word) for word in q_pad]
            
#             passage_embed = torch.FloatTensor([vocab.embed_idx(elem) for elem in passage_idxs])
#             ans_embed = torch.FloatTensor([vocab.embed_idx(elem) for elem in answer_idxs])
#             q_embed = torch.FloatTensor([vocab.embed_idx(elem) for elem in question_idxs])
            
#             passage_idxs = torch.FloatTensor(passage_idxs)
#             answer_idxs = torch.FloatTensor(answer_idxs)
#             question_idxs = torch.FloatTensor(question_idxs)
            
#             data.append((passage_embed, ans_embed, q_embed, passage_idxs, answer_idxs, question_idxs))
#         return data

In [None]:
# tokenizer = get_tokenizer("basic_english")
# train_processed = get_processed_data_glove(train, tokenizer)
# val_processed = get_processed_data_glove(val, tokenizer)
# test_processed = get_processed_data_glove(test, tokenizer)

In [24]:
def get_processed_data(df, tokenizer):
        data = []
        for idx in tqdm(range(len(df))):
                pass_tokens = ['<start>'] + tokenizer(df.iloc[idx]["passage"]) + ['<end>']
                ans_tokens = ['<start>'] + tokenizer(df.iloc[idx]["answer"]) + ['<end>']
                q_tokens = ['<start>'] + tokenizer(df.iloc[idx]["question"]) + ['<end>']
                # pass_tokens = ['<start>'] + list(map(tokenizer, df.iloc[idx]["passage"])) + ['<end>']
                # ans_tokens = ['<start>'] + list(map(tokenizer, df.iloc[idx]["answer"])) + ['<end>']
                # q_tokens = ['<start>'] + list(map(tokenizer, df.iloc[idx]["question"])) + ['<end>']

                pass_len = MAX_PASSAGE_LEN + 2 # +2 for start and end tokens
                ans_len = MAX_ANSWER_LEN + 2
                q_len = MAX_QUESTION_LEN + 2

                passage = [vocab(word) for word in pass_tokens]
                answer = [vocab(word) for word in ans_tokens]
                question = [vocab(word) for word in q_tokens]

                # padding to same length
                pass_idxs = torch.zeros(pass_len)
                ans_idxs = torch.zeros(ans_len)
                q_idxs = torch.zeros(q_len)

                pass_idxs[:len(passage)] = torch.FloatTensor(passage)
                ans_idxs[:len(answer)] = torch.FloatTensor(answer)
                q_idxs[:len(question)] = torch.FloatTensor(question)

                data.append((pass_idxs, ans_idxs, q_idxs))
        return data

In [25]:
tokenizer = get_tokenizer("basic_english")
train_processed = get_processed_data(train, tokenizer)
val_processed = get_processed_data(val, tokenizer)
test_processed = get_processed_data(test, tokenizer)

  0%|          | 0/87599 [00:00<?, ?it/s]

  0%|          | 0/24726 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [26]:
with open('./data/train_processed.pickle', 'wb') as train_file:
    pickle.dump(train_processed, train_file)

with open('./data/val_processed.pickle', 'wb') as val_file:
    pickle.dump(val_processed, val_file)

with open('./data/test_processed.pickle', 'wb') as test_file:
    pickle.dump(test_processed, test_file)