In [1]:
import json
import os
import pickle
import string
import sys

import numpy as np
import pandas as pd
import torch
import torchtext as text
import tqdm
from datasets import load_dataset
from torchtext.data import get_tokenizer

from vocab import *
from utils import *
from constants import *

# Kept last on purpose: this rebinds `tqdm` from the module imported above
# (and from anything the star imports may export under that name) to the
# notebook progress-bar class used throughout the cells below.
from tqdm.notebook import tqdm

In [2]:
dataset = load_dataset("squad")


def _flatten_answers(split):
    """Expand one SQuAD split into column lists with one row per answer text.

    Every entry of ``datum['answers']['text']`` produces its own
    (passage, question, answer) row, so an example with several reference
    answers is repeated once per answer, in the original order.

    Args:
        split: iterable of SQuAD examples (dicts with 'context', 'question'
            and 'answers' -> 'text').

    Returns:
        dict with keys 'passage', 'question', 'answer', each a list of str.
    """
    columns = {'passage': [], 'question': [], 'answer': []}
    # Iterate the split directly instead of indexing it row by row.
    for datum in split:
        for answer_text in datum['answers']['text']:
            columns['passage'].append(datum['context'])
            columns['question'].append(datum['question'])
            columns['answer'].append(answer_text)
    return columns


train_dic = _flatten_answers(dataset['train'])
train = pd.DataFrame(train_dic)

val_dic = _flatten_answers(dataset['validation'])
val = pd.DataFrame(val_dic)

Reusing dataset squad (/tmp/xdg-cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

## Regular Data

In [None]:
# Lower-case and clean every text column of both frames.
#
# The columns are replaced as a whole. Writing through `df.iloc[i][col] = value`
# is chained indexing: `df.iloc[i]` can be a temporary copy of the row, in which
# case the cleaned text is silently discarded and the frame stays unchanged.
text_columns = ('passage', 'question', 'answer')
for frame in tqdm((train, val)):
    for column in text_columns:
        frame[column] = frame[column].map(lambda raw: clean_text(raw.lower()))

In [None]:
# Split val into val and test: shuffle the rows, take the first 10000 as the
# test set and keep the remainder as the validation set.
#
# NOTE(review): the shuffle is row-level. Validation questions were expanded to
# one row per reference answer when `val` was built, so the same
# passage/question pair can land in both val and test -- confirm this is
# acceptable for evaluation.
# NOTE(review): this cell rebinds `val`, so running it twice removes another
# 10000 rows; re-run the loading cell first.

# Fixed seed so the val/test membership is the same on every run.
val = val.sample(frac=1, random_state=0).reset_index(drop=True)
test = val[:10000]
val = val[10000:]

# to_csv does not create missing directories.
os.makedirs('./data', exist_ok=True)

train.to_csv('./data/train.csv', index=False)
val.to_csv('./data/val.csv', index=False)
test.to_csv('./data/test.csv', index=False)

In [None]:
# Bind `vocab` to the result of build_vocab(). The function is not defined in
# this notebook; presumably it comes from one of the star imports
# (vocab / utils / constants) -- TODO confirm what data it reads.
vocab = build_vocab()

In [None]:
# Rebind `vocab` to the result of load_vocab(); presumably this reloads a
# previously built vocabulary so the build cell can be skipped -- TODO confirm.
# Later cells call `vocab(word)`, read `vocab.word2idx` and take `len(vocab)`.
vocab = load_vocab()

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

## Building Embedding Matrix

In [None]:
# Build the (len(vocab), 50) embedding matrix.
# Every row starts as a standard-normal draw; rows whose word has a GloVe
# vector are then overwritten with that vector, so only words missing from
# GloVe keep their random initialisation.
embed_dim = 50  # width of the vectors in glove.6B.50d.txt
embed_rng = np.random.default_rng(0)  # seeded so the random rows are reproducible
embed_mat = embed_rng.standard_normal((len(vocab), embed_dim))

# Explicit UTF-8: the platform default encoding is not guaranteed to decode
# the file. Lines are streamed instead of being loaded with readlines().
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        info = line.rstrip('\n').split(' ')
        word = info[0]

        if word not in vocab.word2idx:
            continue
        idx = vocab.word2idx[word]
        embed_mat[idx] = np.array(info[1:], dtype='float')

In [None]:
# Display the embedding matrix as the cell output for a quick visual check.
embed_mat

In [None]:
# Write the embedding matrix to embeddings.npy in the current working directory.
np.save('embeddings.npy', embed_mat)

## Masked-passage data: a mask marks the passage tokens that also occur in the answer

In [None]:
def get_processed_data_mask(df, tokenizer):
    """Encode every row of ``df`` as padded index tensors plus an answer mask.

    Each text is tokenized, cut to its configured maximum number of tokens,
    wrapped in '<start>' / '<end>', mapped to indices through the module-level
    ``vocab`` callable and right-padded with zeros.

    Texts longer than MAX_PASSAGE_LEN / MAX_ANSWER_LEN / MAX_QUESTION_LEN are
    truncated (the '<end>' token is still appended). Without the truncation an
    over-long text made the padded assignment, or the mask index, fail.

    Args:
        df: DataFrame with string columns "passage", "answer" and "question".
        tokenizer: callable mapping a string to a list of string tokens.

    Returns:
        list with one tuple per row:
        ``(pass_idxs, ans_idxs, q_idxs, pass_mask)``. The first three are 1-D
        tensors created with ``torch.zeros`` (indices are stored as floats) of
        length MAX_*_LEN + 2. ``pass_mask`` is a float tensor as long as
        ``pass_idxs`` holding 1 at the position of every passage token that
        also occurs among the answer tokens, and 0 elsewhere. The mask is
        token-membership based: it is not restricted to the answer span.
    """
    pass_len = MAX_PASSAGE_LEN + 2  # +2 for start and end tokens
    ans_len = MAX_ANSWER_LEN + 2
    q_len = MAX_QUESTION_LEN + 2

    def encode(tokens, max_tokens, padded_len):
        # Truncate, add the boundary tokens, look up indices, zero-pad.
        wrapped = ['<start>'] + tokens[:max_tokens] + ['<end>']
        idxs = torch.zeros(padded_len)
        idxs[:len(wrapped)] = torch.FloatTensor([vocab(word) for word in wrapped])
        return idxs

    data = []
    rows = zip(df["passage"], df["answer"], df["question"])
    for passage_text, answer_text, question_text in tqdm(rows, total=len(df)):
        # Tokenize each text once and reuse the tokens for mask and indices.
        pass_tokens = tokenizer(passage_text)
        ans_tokens = tokenizer(answer_text)
        q_tokens = tokenizer(question_text)

        answer_vocab = set(ans_tokens)  # constant-time membership tests
        pass_mask = torch.zeros(pass_len)
        for i, token in enumerate(pass_tokens[:MAX_PASSAGE_LEN]):
            if token in answer_vocab:
                pass_mask[i + 1] = 1  # shift by 1 for start token
        pass_mask = pass_mask.float()

        data.append((
            encode(pass_tokens, MAX_PASSAGE_LEN, pass_len),
            encode(ans_tokens, MAX_ANSWER_LEN, ans_len),
            encode(q_tokens, MAX_QUESTION_LEN, q_len),
            pass_mask,
        ))
    return data

In [None]:
tokenizer = get_tokenizer("basic_english")

# Masked processing of the train split is currently disabled:
# train_processed = get_processed_data_mask(train, tokenizer)
# with open('./data/train_processed_mask.pickle', 'wb') as train_file:
#     pickle.dump(train_processed, train_file)

# keep_default_na=False: by default read_csv turns empty cells and literal
# strings such as "null", "nan" or "n/a" into float NaN, which the tokenizer
# cannot handle. Every column here is text, so keep all values as strings.
val = pd.read_csv('./data/val.csv', keep_default_na=False)
val_processed = get_processed_data_mask(val, tokenizer)
with open('./data/val_processed_mask.pickle', 'wb') as val_file:
    pickle.dump(val_processed, val_file)


test = pd.read_csv('./data/test.csv', keep_default_na=False)
test_processed = get_processed_data_mask(test, tokenizer)
with open('./data/test_processed_mask.pickle', 'wb') as test_file:
    pickle.dump(test_processed, test_file)

## Save Pickle File for Regular Data

In [None]:
def get_processed_data(df, tokenizer):
    """Encode every row of ``df`` as zero-padded index tensors.

    Each text is tokenized, cut to its configured maximum number of tokens,
    wrapped in '<start>' / '<end>', mapped to indices through the module-level
    ``vocab`` callable and right-padded with zeros.

    Texts longer than MAX_PASSAGE_LEN / MAX_ANSWER_LEN / MAX_QUESTION_LEN are
    truncated (the '<end>' token is still appended). Without the truncation an
    over-long text made the padded assignment fail with a size mismatch.

    Args:
        df: DataFrame with string columns "passage", "answer" and "question".
        tokenizer: callable mapping a string to a list of string tokens.

    Returns:
        list with one tuple ``(pass_idxs, ans_idxs, q_idxs)`` per row; each
        element is a 1-D tensor created with ``torch.zeros`` (indices are
        stored as floats) of length MAX_*_LEN + 2.
    """
    pass_len = MAX_PASSAGE_LEN + 2  # +2 for start and end tokens
    ans_len = MAX_ANSWER_LEN + 2
    q_len = MAX_QUESTION_LEN + 2

    def encode(text, max_tokens, padded_len):
        # Tokenize, truncate, add boundary tokens, look up indices, zero-pad.
        wrapped = ['<start>'] + tokenizer(text)[:max_tokens] + ['<end>']
        idxs = torch.zeros(padded_len)
        idxs[:len(wrapped)] = torch.FloatTensor([vocab(word) for word in wrapped])
        return idxs

    data = []
    rows = zip(df["passage"], df["answer"], df["question"])
    for passage_text, answer_text, question_text in tqdm(rows, total=len(df)):
        data.append((
            encode(passage_text, MAX_PASSAGE_LEN, pass_len),
            encode(answer_text, MAX_ANSWER_LEN, ans_len),
            encode(question_text, MAX_QUESTION_LEN, q_len),
        ))
    return data

In [None]:
# Encode the three splits with the same tokenizer. `train`, `val` and `test`
# are whatever frames the kernel currently holds under those names.
tokenizer = get_tokenizer("basic_english")
train_processed, val_processed, test_processed = (
    get_processed_data(split_frame, tokenizer)
    for split_frame in (train, val, test)
)

In [None]:
# Serialize each processed split to its own pickle file under ./data.
for payload, target_path in (
    (train_processed, './data/train_processed.pickle'),
    (val_processed, './data/val_processed.pickle'),
    (test_processed, './data/test_processed.pickle'),
):
    with open(target_path, 'wb') as out_file:
        pickle.dump(payload, out_file)