In [1]:
!pip install icecream
!pip install datasets
!pip install spacy
!python3 -m spacy download en_core_web_sm

Collecting icecream
  Downloading icecream-2.1.1-py2.py3-none-any.whl (8.1 kB)
Collecting executing>=0.3.1
  Downloading executing-0.8.2-py2.py3-none-any.whl (16 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: executing, colorama, asttokens, icecream
Successfully installed asttokens-2.0.5 colorama-0.4.4 executing-0.8.2 icecream-2.1.1
Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 5.3 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 48.7 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 485 kB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m

In [2]:
from icecream import ic
import spacy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
import matplotlib.pyplot as plt
import random
from tqdm.notebook import tqdm
import en_core_web_sm

In [3]:
nlp = en_core_web_sm.load()

In [53]:
MAXLEN = 64
BATCHSIZE = 32
VECTORLEN = 99
EMBEDDING_SIZE = 64
HIDDEN_SIZE = 128
LR = 0.01
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [5]:
CONFIG ={
    "epochs": 100,
    "batch_size": 32,
    "learning_rate": 0.1,
    "hidden_size": 128,
    "n_layers": 2,
    "drop_prob": 0.3,
    "embedding_size": 64
}

In [6]:
train = load_dataset('squad', split='train')
test = load_dataset('squad', split='validation')

Downloading:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [7]:
train[1]

{'answers': {'answer_start': [188], 'text': ['a copper statue of Christ']},
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f4190066117f',
 'question': 'What is in front of the Notre Dame Main Building?',
 'title': 'University_of_Notre_Dame'}

In [8]:
class Convert:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}
        self.vocab = 0

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.vocab
            self.index2word[self.vocab] = word
            self.vocab += 1
    
    def getWord(self, index):
        if index in self.index2word:
            return self.index2word[index]
        else:
            return 'UNK'
    
    def getIndex(self, word):
        if word in self.word2index:
            return self.word2index[word]
        else:
            return -1

In [9]:
c = Convert()

In [10]:
c.addWord('PAD')

In [46]:
class Dataprep(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)

    def indice(self, data):
        doc = nlp(data)
        indices = []
        for w in doc:
            index = c.getIndex(w.text)
            indices.append(index)
        return indices

    def vectorise(self, data):
        doc = nlp(data)
        vectors = []
        for w in doc:
            vector = w.vector
            c.addWord(w.text)
            c.addWord(w.pos_)
            c.addWord(w.dep_)
            vector = np.append(c.getIndex(w.text), vector)
            vector = np.append(c.getIndex(w.pos_), vector)
            vector = np.append(c.getIndex(w.dep_), vector)
            vector = vector.tolist()
            vectors.append(vector)
        return vectors

    def __getitem__(self, index):
        values = self.data[index]
        answer = values['answers']['text'][0]
        context = values['context']
        question = values['question']
        title = values['title']

        answer = self.vectorise(answer)
        context = self.vectorise(context)
        question = self.vectorise(question)
        title = self.vectorise(title)
        
        answer = answer + [[0]*len(answer[0])]*64
        context = context + [[0]*len(answer[0])]*64
        question = question + [[0]*len(answer[0])]*64
        title = title + [[0]*len(answer[0])]*64

        answer = answer[:MAXLEN]
        context = context[:MAXLEN]
        question = question[:MAXLEN]
        title = title[:MAXLEN]

        answer = torch.FloatTensor(answer)
        context = torch.FloatTensor(context)
        question = torch.FloatTensor(question)
        title = torch.FloatTensor(title)

        return (answer, context, title), question

In [47]:
train_prep = Dataprep(train)
test_prep = Dataprep(test)

In [48]:
type(train_prep)

__main__.Dataprep

In [49]:
train_loader = torch.utils.data.DataLoader(train_prep, batch_size=BATCHSIZE)
test_loader = torch.utils.data.DataLoader(test_prep, batch_size=BATCHSIZE)

In [50]:
type(train_loader)

torch.utils.data.dataloader.DataLoader

In [51]:
count = 0

In [52]:
for x, y in tqdm(train_loader):
    count += 1
    if count == 100:
        break
    pass

  0%|          | 0/2738 [00:00<?, ?it/s]

In [54]:
vocab_len = c.vocab

In [55]:
vocab_len

11757

In [72]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, bidirectional=True, batch_first=True)

    def forward(self, context, answer, title):

        # answer.shape: torch.Size([32, 64, 98])
        # context.shape: torch.Size([32, 64, 98])
        # title.shape: torch.Size([32, 64, 98])

        source = torch.cat((context, answer), dim=1).to(DEVICE)
        source = torch.cat((source, title), dim=1).to(DEVICE)
        # source.shape: torch.Size([32, 192, 98])

        output, (hidden, cell) = self.lstm(source)
        # output.shape: torch.Size([32, 192, 256])
        # hidden.shape: torch.Size([2, 32, 128])
        # cell.shape: torch.Size([2, 32, 128])
        
        return output, hidden, cell

In [73]:
encoder = Encoder(
    input_size = VECTORLEN,
    hidden_size = HIDDEN_SIZE
).to(DEVICE)
encoder

Encoder(
  (lstm): LSTM(99, 128, batch_first=True, bidirectional=True)
)

In [74]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        
        self.linear = nn.Linear(3*self.hidden_size, self.hidden_size)
        self.energy = nn.Linear(self.hidden_size, 1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, e_context, hidden):
        seq = e_context.shape[1]
        hidden = hidden.to(DEVICE)
        e_context = e_context.to(DEVICE)

        hidden = hidden[-1, :, :]
        hidden = hidden.unsqueeze(1)
        hidden = hidden.repeat(1, seq, 1)
        # hidden.shape: torch.Size([32, 192, 128])
        
        source = torch.cat((hidden, e_context), dim=2).to(DEVICE)
        # source.shape: torch.Size([32, 192, 384])

        attention = self.linear(source)
        # attention.shape: torch.Size([32, 192, 128])

        energies = self.energy(attention)
        energies.squeeze(2)
        # energies.shape: torch.Size([32, 192, 1])

        probs = self.softmax(energies)
        e_context = e_context.permute(0, 2, 1)
        # probs.shape: torch.Size([32, 192, 1])
        # e_context.shape: torch.Size([32, 256, 192])

        weights = (e_context@probs).to(DEVICE)
        weights = weights.permute(0, 2, 1)
        # weights.shape: torch.Size([32, 1, 256])

        return weights

In [75]:
attention = Attention(
    hidden_size = HIDDEN_SIZE
).to(DEVICE)
attention

Attention(
  (linear): Linear(in_features=384, out_features=128, bias=True)
  (energy): Linear(in_features=128, out_features=1, bias=True)
  (softmax): Softmax(dim=-1)
)

In [76]:
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, num_layers=2, batch_first=True)
        self.fc_out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.Softmax(dim=-1)
        self.sigmoid = nn.Sigmoid()
        self.represent = nn.Linear(6*self.hidden_size, self.output_size)

    def forward(self, question, e_context, hidden, cell):

        # question.shape: torch.Size([32, 98])
        # e_context.shape: torch.Size([32, 192, 256])
        # hidden.shape: torch.Size([2, 32, 128])
        # cell.shape: torch.Size([2, 32, 128])
        question = question.unsqueeze(1)
        d_context, (hidden, cell) = self.lstm(question, (hidden, cell))

        # d_context.shape: torch.Size([32, 1, 128])
        # hidden.shape: torch.Size([2, 32, 128])
        # cell.shape: torch.Size([2, 32, 128])

        weights = attention(e_context, hidden)
        context_vector = weights.squeeze(1)
        d_hidden = hidden.reshape(hidden.shape[1], hidden.shape[0]*hidden.shape[2])
        combined = torch.cat((context_vector, d_hidden), dim=1)
        combined = self.sigmoid(combined)
        d_hidden = self.sigmoid(d_hidden)
        
        total = torch.cat((combined, d_hidden), dim=1).to(DEVICE)
        total = self.represent(total)
        total = self.softmax(total)
        return total, hidden, cell

In [77]:
decoder = Decoder(
    input_size = VECTORLEN,
    hidden_size = HIDDEN_SIZE,
    output_size = vocab_len
).to(DEVICE)
decoder

Decoder(
  (lstm): LSTM(99, 128, num_layers=2, batch_first=True)
  (fc_out): Linear(in_features=128, out_features=11757, bias=True)
  (softmax): Softmax(dim=-1)
  (sigmoid): Sigmoid()
  (represent): Linear(in_features=768, out_features=11757, bias=True)
)

In [78]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, context, answer, title, question):
        context = context.to(DEVICE)
        answer = answer.to(DEVICE)
        title = title.to(DEVICE)
        question = question.to(DEVICE)

        e_context, hidden, cell = self.encoder(context, answer, title)
        token = question[:, 0]
        
        predictions = torch.zeros(
            MAXLEN - 1,
            BATCHSIZE,
            vocab_len
        ).to(DEVICE)

        for t in range(1, MAXLEN):
            represent, hidden, cell = decoder(token, e_context, hidden, cell)
            predictions[t-1] = represent
            token = question[:, t]
        
        return predictions

In [79]:
s2s = Seq2Seq(
    encoder = encoder,
    decoder = decoder
).to(DEVICE)
s2s

Seq2Seq(
  (encoder): Encoder(
    (lstm): LSTM(99, 128, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (lstm): LSTM(99, 128, num_layers=2, batch_first=True)
    (fc_out): Linear(in_features=128, out_features=11757, bias=True)
    (softmax): Softmax(dim=-1)
    (sigmoid): Sigmoid()
    (represent): Linear(in_features=768, out_features=11757, bias=True)
  )
)

In [80]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
s2s.apply(init_weights)
s2s.train()
optimizer = optim.Adam(s2s.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=c.getIndex('PAD'))

In [84]:
for x, y in tqdm(train_loader):
    answer = x[0].to(DEVICE)
    context = x[1].to(DEVICE)
    title = x[2].to(DEVICE)
    question = y.to(DEVICE)
    optimizer.zero_grad()
    predictions = s2s(context, answer, title, question)
    predictions = predictions.permute(1, 2, 0)
    question = question[:, 1:, 0]
    question = question.long()
    # ic(predictions.shape, question.shape)
    loss = criterion(predictions, question)
    loss.backward()
    optimizer.step()
    if len(losses) == 100:
        break

  0%|          | 0/2738 [00:00<?, ?it/s]