In [None]:
# Install the third-party packages this notebook imports. No versions are
# pinned, so pip resolves whatever is current when the cell runs.
!pip install icecream
!pip install datasets
!pip install spacy
# Download the small English spaCy pipeline that is imported below as
# `en_core_web_sm`.
!python3 -m spacy download en_core_web_sm

Collecting icecream
  Downloading icecream-2.1.1-py2.py3-none-any.whl (8.1 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting executing>=0.3.1
  Downloading executing-0.8.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: executing, colorama, asttokens, icecream
Successfully installed asttokens-2.0.5 colorama-0.4.4 executing-0.8.2 icecream-2.1.1
Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 5.2 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 48.8 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 46.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingfac

In [None]:
from icecream import ic
import spacy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
import matplotlib.pyplot as plt
import random
from tqdm.notebook import tqdm
import en_core_web_sm

In [None]:
# Load the spaCy pipeline once at module level; Dataprep.vectorise calls
# `nlp(...)` on every text field it processes.
nlp = en_core_web_sm.load()

In [None]:
# Shared sizes used by the data pipeline and the encoder further down.
MAXLEN = 64          # every vectorised sequence is cut to this many token rows
BATCHSIZE = 32       # batch size passed to both DataLoaders
VECTORLEN = 98       # per-token feature width given to the encoder; the batches printed below are 98 wide (token vector plus the two prepended dependency/POS indices)
EMBEDDING_SIZE = 64  # not referenced by any cell shown in this part of the notebook
HIDDEN_SIZE = 128    # hidden size of the encoder LSTM

In [None]:
# Training hyperparameters collected in one mapping. None of the cells visible
# here read CONFIG; batch_size, hidden_size and embedding_size repeat the
# values of the standalone constants defined in the previous cell.
CONFIG = dict(
    epochs=100,
    batch_size=32,
    learning_rate=0.1,
    hidden_size=128,
    n_layers=2,
    drop_prob=0.3,
    embedding_size=64,
)

In [None]:
# Load the SQuAD splits through the Hugging Face `datasets` library.
# Note that the 'validation' split is bound to the name `test`.
train = load_dataset('squad', split='train')
test = load_dataset('squad', split='validation')

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [None]:
train[1]

{'answers': {'answer_start': [188], 'text': ['a copper statue of Christ']},
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f4190066117f',
 'question': 'What is in front of the Notre Dame Main Building?',
 'title': 'University_of_Notre_Dame'}

In [None]:
class Convert:
    """Two-way lookup between tokens and consecutive integer ids.

    Ids are assigned in insertion order starting at 0. ``vocab`` holds the
    number of distinct tokens registered so far, which is also the next id
    that will be handed out.
    """

    def __init__(self):
        self.word2index = {}
        self.index2word = {}
        self.vocab = 0

    def addWord(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2index:
            return
        new_id = self.vocab
        self.word2index[word] = new_id
        self.index2word[new_id] = word
        self.vocab = new_id + 1

    def getWord(self, index):
        """Return the token stored under ``index``, or ``'UNK'`` if there is none."""
        return self.index2word.get(index, 'UNK')

    def getIndex(self, word):
        """Return the id stored for ``word``, or ``-1`` if it was never added."""
        return self.word2index.get(word, -1)

In [None]:
# Module-level vocabulary shared by every Dataprep instance: vectorise()
# registers token texts, POS tags and dependency labels here.
c = Convert()

In [None]:
# Register the padding token first. On a freshly created Convert it gets id 0,
# which matches the zeros that fill the index columns of padded rows.
c.addWord('PAD')

In [None]:
class Dataprep(torch.utils.data.Dataset):
    """Map-style dataset that turns each record into fixed-length float tensors.

    The four text fields of a record (first answer text, context, question,
    title) are each run through the module-level spaCy pipeline ``nlp``; every
    token becomes one feature row. Each sequence is then truncated or
    zero-padded to exactly ``MAXLEN`` rows.

    Side effect: vectorising registers every token text, POS tag and
    dependency label in the module-level ``Convert`` instance ``c``, so the
    vocabulary grows as items are read.
    """

    def __init__(self, data):
        # `data` must support len() and integer indexing, and yield records
        # with 'answers', 'context', 'question' and 'title' entries.
        self.data = data

    def __len__(self):
        return len(self.data)

    def vectorise(self, data):
        """Return one feature row (a list of numbers) per token of ``data``.

        Row layout is ``[dep_index, pos_index, *token_vector]``: the POS index
        is prepended to the token vector first, then the dependency index is
        prepended in front of it. An empty text yields an empty list.
        """
        doc = nlp(data)
        vectors = []
        for w in doc:
            # Register before looking up so getIndex never returns -1 here.
            c.addWord(w.text)
            c.addWord(w.pos_)
            c.addWord(w.dep_)
            vector = np.append(c.getIndex(w.pos_), w.vector)
            vector = np.append(c.getIndex(w.dep_), vector)
            vectors.append(vector.tolist())
        return vectors

    @staticmethod
    def _pad_or_truncate(vectors, length, width):
        """Return exactly ``length`` rows: ``vectors`` cut short or zero-padded.

        Padding rows are fresh lists of ``width`` zeros, so callers never share
        one row object between positions.
        """
        length = max(length, 0)
        rows = vectors[:length]
        missing = length - len(rows)
        return rows + [[0] * width for _ in range(missing)]

    def _to_tensor(self, vectors):
        """Pad/truncate ``vectors`` to ``MAXLEN`` rows and convert to a float tensor."""
        # Take the width from the sequence itself; an empty sequence (text with
        # no tokens) has no first row, so fall back to the notebook-wide
        # feature width instead of raising IndexError.
        width = len(vectors[0]) if vectors else VECTORLEN
        return torch.FloatTensor(self._pad_or_truncate(vectors, MAXLEN, width))

    def __getitem__(self, index):
        """Return ``((answer, context, title), question)`` for record ``index``.

        Each element is a float tensor with ``MAXLEN`` rows. Only the first
        answer text of the record is used.
        """
        values = self.data[index]

        # Vectorise in the original order (answer, context, question, title)
        # so vocabulary ids are assigned in the same sequence as before.
        answer = self._to_tensor(self.vectorise(values['answers']['text'][0]))
        context = self._to_tensor(self.vectorise(values['context']))
        question = self._to_tensor(self.vectorise(values['question']))
        title = self._to_tensor(self.vectorise(values['title']))

        return (answer, context, title), question

In [None]:
# Wrap both splits. The constructor only stores the split; spaCy processing
# happens later, each time an item is indexed.
train_prep = Dataprep(train)
test_prep = Dataprep(test)

In [None]:
type(train_prep)

__main__.Dataprep

In [None]:
# Batch both datasets with the shared batch size. No `shuffle` argument is
# given, so DataLoader's default (iterate in dataset order) applies.
train_loader = torch.utils.data.DataLoader(train_prep, batch_size=BATCHSIZE)
test_loader = torch.utils.data.DataLoader(test_prep, batch_size=BATCHSIZE)

In [None]:
type(train_loader)

torch.utils.data.dataloader.DataLoader

In [None]:
# Batch counter used by the loop in the next cell to stop early.
count = 0

In [None]:
# Walk the first batches of the training loader. Reading an item runs
# Dataprep.vectorise, which registers tokens, POS tags and dependency labels
# in `c`, so this pass is what fills the vocabulary measured afterwards.
# The counter is reset inside this cell so that re-running it always stops at
# the 100th batch; with a stale value >= 100 from an earlier run the
# `count == 100` check would never fire and the whole dataset would be read.
count = 0
for x, y in tqdm(train_loader):
    count += 1
    if count == 100:
        break

  0%|          | 0/2738 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Snapshot of the vocabulary size. `c` only grows while dataset items are
# being vectorised, so this number depends on how many batches were read
# before this cell ran.
vocab_len = c.vocab

In [None]:
vocab_len

12097

In [None]:
class Encoder(nn.Module):
    """Single-layer bidirectional LSTM over the concatenated source sequences.

    Args:
        input_size: number of features per time step.
        hidden_size: hidden units per LSTM direction.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, context, answer, title):
        """Encode context, answer and title as one long sequence.

        All three inputs are batch-first tensors ``(batch, seq_len, input_size)``.
        They are joined along the time axis in the order context, answer,
        title before being fed to the LSTM.

        Returns:
            ``(output, hidden, cell)`` where ``output`` is
            ``(batch, total_seq_len, 2 * hidden_size)`` and ``hidden`` / ``cell``
            are ``(2, batch, hidden_size)`` (one slice per direction).
            With the notebook's batches of three ``[32, 64, 98]`` inputs and
            ``hidden_size=128`` that is ``[32, 192, 256]`` and ``[2, 32, 128]``.
        """
        # One concatenation along dim=1 (time) instead of two chained ones.
        source = torch.cat((context, answer, title), dim=1)

        output, state = self.lstm(source)
        hidden, cell = state

        return output, hidden, cell

In [None]:
# Build the encoder from the shared feature width and hidden size. The bare
# name on the last line makes the cell display the module's repr.
encoder = Encoder(input_size=VECTORLEN, hidden_size=HIDDEN_SIZE)
encoder

Encoder(
  (lstm): LSTM(98, 128, batch_first=True, bidirectional=True)
)

In [None]:
# Smoke test: take only the first training batch, print the input shapes and
# push the batch through the encoder (its return value is discarded).
for x, y in train_loader:
    # Dataprep yields the inputs as an (answer, context, title) triple.
    answer, context, title = x
    question = y
    ic(answer.shape, context.shape, title.shape, question.shape)
    encoder(context, answer, title)
    break

ic| answer.shape: torch.Size([32, 64, 98])
    context.shape: torch.Size([32, 64, 98])
    title.shape: torch.Size([32, 64, 98])
    question.shape: torch.Size([32, 64, 98])
ic| output.shape: torch.Size([32, 192, 256])
    hidden.shape: torch.Size([2, 32, 128])
    cell.shape: torch.Size([2, 32, 128])
