# Mounting Drive for dataset

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV files stored there
# can be read from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries, Config and Utility Functions

## Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import random
import math
import spacy
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data import Field, TabularDataset, BucketIterator


## Config

In [None]:
# !python -m spacy download en     ## If tokenization fails, try updating the spacy package

# Folder on the mounted Drive that holds the preprocessed CSV files.
DATASET_BASE_PATH = '/content/drive/My Drive/csv_for_nqg'
TRAIN_FILENAME =  'train-v2.csv'
VALID_FILENAME = 'dev-v2.csv'
# pandas copies of the CSVs; torchtext reads the same files again on its own
# further down when the datasets are built.
train_df = pd.read_csv(os.path.join(DATASET_BASE_PATH, TRAIN_FILENAME))
valid_df = pd.read_csv(os.path.join(DATASET_BASE_PATH, VALID_FILENAME))

# spaCy English pipeline; only its tokenizer is used, via the tokenizer() helper.
# NOTE(review): the 'en' shortcut name is presumably only valid on spaCy 2.x -
# confirm the installed spaCy version before re-running.
spacy_en = spacy.load('en')
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

NameError: ignored

## Setting seeds for deterministic results

In [None]:
# One seed shared by every random number generator used in this notebook.
SEED = 1234

random.seed(SEED)              # Python RNG (drives the teacher-forcing coin flip in Seq2Seq)
np.random.seed(SEED)           # NumPy RNG
torch.manual_seed(SEED)        # PyTorch RNG
torch.cuda.manual_seed(SEED)   # PyTorch RNG for the current CUDA device
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

## Utils

In [None]:
def tokenizer(text):
  """Split ``text`` into a list of token strings.

  Only the tokenizer of the module-level ``spacy_en`` pipeline is called;
  the text of each resulting token is returned in order.
  """
  doc = spacy_en.tokenizer(text)
  return [token.text for token in doc]

# Prepare Data

## Analysing Data

In [None]:
# Quick sanity check: (rows, columns) of the training CSV, then its first rows.
print(train_df.shape)
train_df.head()

(18222, 5)


Unnamed: 0,context,sentence,question,answer,sent_ans
0,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Beyonce would take a break from music in which...,2010,Beyoncé announced a hiatus from her music care...
1,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Which year did Beyonce and her father part bus...,2010,Beyoncé announced a hiatus from her music care...
2,Beyoncé announced a hiatus from her music care...,Beyoncé 's musical break lasted nine months an...,Which famous landmark did Beyonce see in China ?,the Great Wall of China,Beyoncé 's musical break lasted nine months an...
3,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,In what year did Beyonce have her hiatus ?,2010,Beyoncé announced a hiatus from her music care...
4,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Who inspired this hiatus ?,her mother,Beyoncé announced a hiatus from her music care...


## Creating Fields for Data
We only need one type of preprocessing for all required columns so we need one Field with
* sequential = **True** Since our input is text and it is sequential
* tokenize = We are tokenizing using a function called **tokenizer** which is built using a **spaCy English language model**
* init_token => Need to append initial token to every input
* eos_token =>  Need to append end token to every input
* lower => We need to lower every tokens as a preprocessing step.

In [None]:
# One Field shared by the source (sent_ans) and the target (question): both are
# tokenized with tokenizer(), lower-cased, wrapped in <sos>/<eos>, and, because
# the Field object is shared, indexed with a single common vocabulary.
FIELD = Field(sequential=True,
                   tokenize=tokenizer,
                   init_token='<sos>',
                   eos_token='<eos>',
                   lower=True)

# (column name, Field) pairs. A column paired with None is not loaded into the examples.
TRAIN_VALID_FIELDS = [('context', None), ## Not used in this phase of the project, so no Field is assigned
                    ('sentence', None), ## Not loaded; the sent_ans column is used as the model input instead
                    ('question', FIELD), ## Target sequence: the question the model learns to generate
                    ('answer', None), ## Not loaded as a separate column in this phase
                    ('sent_ans', FIELD) ## Source sequence fed to the encoder (in the sample output: the sentence tokens followed by 'answer' and the answer tokens)
                    ]

## Creating Dataset from fields

In [None]:
# Read both CSV files from DATASET_BASE_PATH into torchtext datasets.
# With a list of (name, Field) tuples, torchtext matches fields to CSV columns
# by position, so TRAIN_VALID_FIELDS must follow the column order of the files.
# skip_header=True keeps the header row from being parsed as an example.
train_data, valid_data = TabularDataset.splits(path=DATASET_BASE_PATH,
                                        format='csv',
                                        train = TRAIN_FILENAME,
                                        validation = VALID_FILENAME,
                                        fields = TRAIN_VALID_FIELDS,
                                        skip_header=True)

## Checking data from dataset

In [None]:
# Inspect what TabularDataset produced: the dataset types and sizes, the
# name -> Field mapping, and the tokenized content of the first training example.
print(f"Type of train data => {type(train_data)} and valid data => {type(valid_data)}")
print(f"Length of train data => {len(train_data)} and valid data => {len(valid_data)}")
print(train_data.fields.items())
example = train_data[0]
print(type(example))
# Only the columns that were given a Field exist as attributes on an example.
print(example.sent_ans, example.question)
print(vars(train_data.examples[0]))

Type of train data => <class 'torchtext.data.dataset.TabularDataset'> and valid data => <class 'torchtext.data.dataset.TabularDataset'>
Length of train data => 18222 and valid data => 871
dict_items([('context', None), ('sentence', None), ('question', <torchtext.data.field.Field object at 0x7fd1e9a19dd8>), ('answer', None), ('sent_ans', <torchtext.data.field.Field object at 0x7fd1e9a19dd8>)])
<class 'torchtext.data.example.Example'>
['beyoncé', 'announced', 'a', 'hiatus', 'from', 'her', 'music', 'career', 'in', 'january', '2010', ',', 'heeding', 'her', 'mother', "'s", 'advice', ',', '"', 'to', 'live', 'life', ',', 'to', 'be', 'inspired', 'by', 'things', 'again', '"', '.', 'answer', '2010'] ['beyonce', 'would', 'take', 'a', 'break', 'from', 'music', 'in', 'which', 'year', '?']
{'question': ['beyonce', 'would', 'take', 'a', 'break', 'from', 'music', 'in', 'which', 'year', '?'], 'sent_ans': ['beyoncé', 'announced', 'a', 'hiatus', 'from', 'her', 'music', 'career', 'in', 'january', '2010', 

## Building Vocabulary from training_data

In [None]:
# Build the vocabulary from the training split only. Including the validation
# split would leak validation tokens into the vocabulary and make the reported
# validation loss optimistic. Tokens seen fewer than min_freq times in the
# training data are left out of the vocabulary and map to the unknown token.
FIELD.build_vocab(train_data, min_freq=2)

print(f"Total length of train vocabulary is {len(FIELD.vocab)}")


Total length of train vocabulary is 24953


## Prepare Batches of Data using Bucket Iterator
This also takes care of padding.

In [None]:
# Number of examples per batch for both iterators.
BATCH_SIZE = 128

# sort_key gives BucketIterator the length (number of sent_ans tokens) it uses to
# put examples of similar length into the same batch, which keeps padding small.
# Batches are created directly on DEVICE.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.sent_ans),
    device = DEVICE
)

## Understanding Iterator
* Here we can see each iteration will give us a batch of text data.
* How to access data in batch 
* Fields associated in each batch

In [None]:
# len() of an iterator is its number of batches per epoch.
print(f"Size of the train and valid iterators =>>> {len(train_iterator)}, {len(valid_iterator)}")
# Pull a single batch to look at its structure.
batch = next(iter(train_iterator))
print(type(batch))
# Each field is exposed as an attribute holding a [seq len, batch size] index tensor.
print(batch.sent_ans, batch.sent_ans.shape)
print(batch.question, batch.question.shape)
print(batch.dataset.fields)

Size of the train and valid iterators =>>> 143, 7
<class 'torchtext.data.batch.Batch'>
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [  21,  172,   54,  ...,  247,  212,   41],
        [3384,   45,    5,  ..., 1153,    5,   15],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0') torch.Size([89, 128])
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   7,   13,   13,  ...,   35,  354,  256],
        [3384,  565,   16,  ..., 2284,   40,  123],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0') torch.Size([21, 128])
{'context': None, 'sentence': None, 'question': <torchtext.data.field.Field object at 0x7fd1e9a19dd8>, 'answer': None, 'sent_ans': <torchtext.data.field.Field object at 0x7fd1e9a19dd8

**NOTE**
* You can see that BucketIterator returns a batch object instead of sentence tensors and question tensors
* Also we can't iterate through batch object 
- We can overcome this problem by writing a wrapper around Iterator object which will return required data or we can write some extra code for the same. In this tutorial we will move with the second approach

# Recap

### Let's see what all things we have done until now.
### **Initial Steps**
1. Downloaded the SQUAD dataset
2. Preprocessed the dataset and converted it into a CSV with the following columns
  - context 
  - sentence
  - question
  - answer  
3. Uploaded train and test csv files to google drive.
4. Mounted Drive in colab such that we don't have to upload data again in each session.
### **Preparing Data**
5. Installed spacy and english model of spacy for tokenization.
6. Created **Fields** for train and test data. Since both needed same type of preprocessing and steps we used the same field for both.
7. Created the datasets from the CSV files using **TabularDataset** from the torchtext library, which handles loading the data and returns it preprocessed according to the given fields.
8. Created data iterators for batching the data with torchtext's **BucketIterator**. This also takes care of the padding. Since it works on mini-batches, we also specified a sort key so that each batch groups examples of similar length.

**Note** : The first points are not mentioned in the notebook. Kindly check my github repo for it.


# Building Model Architecture

## Encoder

In the forward method, we pass in the source sentence,  𝑋 , which is converted into dense vectors using the embedding layer, and then dropout is applied. These embeddings are then passed into the RNN. As we pass a whole sequence to the RNN, it will automatically do the recurrent calculation of the hidden states over the whole sequence for us! Notice that we do not pass an initial hidden or cell state to the RNN. This is because, as noted in the documentation, if no hidden/cell state is passed to the RNN, it will automatically create an initial hidden/cell state as a tensor of all zeros.

In [None]:
class Encoder(nn.Module):
    """LSTM encoder that maps a batch of token-index sequences to final LSTM states.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: number of features in the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as attributes so Seq2Seq can check encoder/decoder compatibility.
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hid_dim,
                           num_layers=n_layers,
                           dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return the final ``(hidden, cell)`` states.

        Args:
            src: token indices, shaped [src len, batch size].

        Returns:
            Tuple ``(hidden, cell)``, each shaped [n layers, batch size, hid dim].
            The per-step LSTM outputs are discarded.
        """
        token_vectors = self.embedding(src)        # [src len, batch size, emb dim]
        rnn_input = self.dropout(token_vectors)

        # No initial state is passed, so the LSTM starts from all-zero states.
        _, final_states = self.rnn(rnn_input)
        hidden, cell = final_states

        return hidden, cell

## Decoder Model

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next token for every batch element.

    Args:
        output_dim: target vocabulary size (embedding rows and number of output scores).
        emb_dim: size of each token embedding.
        hid_dim: number of features in the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as attributes so Seq2Seq can size its output buffer and check compatibility.
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hid_dim,
                           num_layers=n_layers,
                           dropout=dropout)
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Decode one time step.

        Args:
            input: current token indices, shaped [batch size].
            hidden: previous hidden state, shaped [n layers, batch size, hid dim].
            cell: previous cell state, shaped [n layers, batch size, hid dim].

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` is shaped
            [batch size, output dim] and the states keep their input shapes.
        """
        # The LSTM expects a leading sequence dimension; here it has length 1.
        step_tokens = input.unsqueeze(0)                           # [1, batch size]
        step_vectors = self.dropout(self.embedding(step_tokens))   # [1, batch size, emb dim]

        output, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))

        # Drop the length-1 sequence dimension before projecting to vocabulary scores.
        prediction = self.fc_out(output.squeeze(0))                # [batch size, output dim]

        return prediction, hidden, cell

## Seq2Seq Model

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence one step at a time.

    Args:
        encoder: module returning ``(hidden, cell)`` for a source batch; must
            expose ``hid_dim`` and ``n_layers``.
        decoder: module mapping ``(input, hidden, cell)`` to
            ``(prediction, hidden, cell)``; must expose ``hid_dim``,
            ``n_layers`` and ``output_dim``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final states are fed straight into the decoder, so their
        # shapes have to line up.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Run the encoder once, then the decoder for every target position after the first.

        Args:
            src: source token indices, shaped [src len, batch size].
            trg: target token indices, shaped [trg len, batch size]; row 0 holds
                the <sos> tokens.
            teacher_forcing_ratio: probability, per time step, of feeding the
                ground-truth token as the next decoder input instead of the
                decoder's own top prediction.

        Returns:
            Tensor shaped [trg len, batch size, output dim]. Row 0 is never
            written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer collecting the decoder scores of every time step.
        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        # The encoder's final states initialise the decoder.
        hidden, cell = self.encoder(src)

        # Decoding starts from the <sos> row of the target.
        decoder_input = trg[0, :]

        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = step_scores

            # One coin flip per time step, shared by the whole batch.
            use_ground_truth = random.random() < teacher_forcing_ratio
            predicted_tokens = step_scores.argmax(1)

            if use_ground_truth:
                decoder_input = trg[t]
            else:
                decoder_input = predicted_tokens

        return outputs

# Training and Evaluation

## Initialize model and its parameters

In [None]:
# Source and target were numericalized with the same FIELD, so the encoder and
# decoder vocabularies have the same size.
INPUT_DIM = len(FIELD.vocab)
OUTPUT_DIM = len(FIELD.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# Seq2Seq asserts that encoder and decoder agree on hidden size and layer count,
# so a single value of each is shared.
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# DEVICE is passed in for the model's output buffer, and .to(DEVICE) moves the parameters.
model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

### Weight Initialization

In [None]:
def init_weights(m):
    """Overwrite every parameter reachable from ``m`` with samples from U(-0.5, 0.5).

    Parameters are visited in the order reported by the module, including those
    of nested submodules.
    """
    for param in m.parameters():
        nn.init.uniform_(param.data, a=-0.5, b=0.5)
        
# Module.apply calls init_weights on every submodule and on the model itself.
# init_weights walks its module's parameters recursively, so nested parameters
# are re-sampled more than once; the final values are still drawn from U(-0.5, 0.5).
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(24953, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(24953, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=24953, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

### Counting trainable parameters

In [None]:
def count_parameters(model):
    """Return the total number of elements in the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 32,933,241 trainable parameters


## Train Function

In [None]:
def train(model, iterator, optimizer, criterion, clip):
  """Run one training epoch and return the mean loss per batch.

  Args:
    model: callable as ``model(src, trg)`` returning scores shaped
      [trg len, batch size, output dim].
    iterator: sized iterable of batches exposing ``sent_ans`` (source) and
      ``question`` (target) index tensors shaped [seq len, batch size].
    optimizer: optimizer over ``model.parameters()``.
    criterion: loss taking ``(scores [N, output dim], targets [N])``.
    clip: maximum total gradient norm allowed before each optimizer step.

  Returns:
    Sum of the per-batch losses divided by ``len(iterator)``, as a float.
  """
  model.train()
  epoch_loss = 0
  for batch in iterator:
    sentence = batch.sent_ans
    question = batch.question

    ## Zeroing optimizer
    optimizer.zero_grad()
    ## One pass to model with single batch
    outputs = model(sentence, question)
    output_dim = outputs.shape[-1]
    ## Position 0 is dropped on both sides: it is the <sos> slot, which the
    ## model never predicts. The rest is flattened to [N, output dim] / [N].
    predictions = outputs[1:].view(-1, output_dim)
    targets = question[1:].view(-1)
    ## finding loss
    loss = criterion(predictions, targets)
    ## Finding gradients
    loss.backward()
    ## Clip BEFORE stepping: clipping after optimizer.step() has no effect on
    ## the update, because the unclipped gradients have already been applied.
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    ## Updating weights
    optimizer.step()
    epoch_loss += loss.item()

  return epoch_loss/len(iterator)

## Validation Function

In [None]:
def evaluate(model, iterator, criterion):
  """Compute the mean loss per batch without updating the model.

  The model is put in eval mode, gradients are disabled, and teacher forcing
  is turned off so the decoder is always fed its own predictions. This makes
  the result deterministic and free of ground-truth input tokens.

  Args:
    model: callable as ``model(src, trg, teacher_forcing_ratio=...)`` returning
      scores shaped [trg len, batch size, output dim].
    iterator: sized iterable of batches exposing ``sent_ans`` (source) and
      ``question`` (target) index tensors shaped [seq len, batch size].
    criterion: loss taking ``(scores [N, output dim], targets [N])``.

  Returns:
    Sum of the per-batch losses divided by ``len(iterator)``, as a float.
  """
  model.eval()
  epoch_loss = 0
  with torch.no_grad():
    for batch in iterator:
      sentence = batch.sent_ans
      question = batch.question
      # A ratio of 0 disables teacher forcing during evaluation.
      outputs = model(sentence, question, teacher_forcing_ratio=0)
      output_dim = outputs.shape[-1]
      # Position 0 (the <sos> slot) is dropped on both sides, then flattened.
      predictions = outputs[1:].view(-1, output_dim)
      targets = question[1:].view(-1)
      loss = criterion(predictions, targets)
      # .item() keeps the running total a plain float, as in train().
      epoch_loss += loss.item()
  return epoch_loss/len(iterator)


## Hyper parameters

In [None]:
# Adam with its default settings over all model parameters.
optimizer = optim.Adam(model.parameters())
# Vocabulary index of the padding token; the loss skips targets equal to it,
# so padded positions do not contribute.
PAD_IDX = FIELD.vocab.stoi[FIELD.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

## Function for time

In [None]:
def elapsed_time(start_time, end_time):
  """Split the interval between two timestamps into whole minutes and seconds.

  Args:
    start_time: start timestamp in seconds.
    end_time: end timestamp in seconds.

  Returns:
    Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
  """
  duration = end_time - start_time
  minutes = int(duration / 60)
  seconds = int(duration - minutes * 60)
  return minutes, seconds

## Start Training

In [None]:
N_EPOCHS = 20
# Maximum total gradient norm handed to train().
CLIP = 1
# Best validation loss seen so far. float("inf") is used rather than np.Inf
# because that alias was removed in NumPy 2.0.
BEST_VALID_LOSS = float("inf")
# Despite its name, MODEL_DIR is the path of the checkpoint file itself.
MODEL_DIR = os.path.join(DATASET_BASE_PATH, 'model.pt')
for epoch in range(N_EPOCHS):
  start_time = time.time()
  train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
  valid_loss = evaluate(model, valid_iterator, criterion)
  # Keep only the weights of the epoch with the lowest validation loss.
  if valid_loss < BEST_VALID_LOSS:
    BEST_VALID_LOSS = valid_loss
    torch.save(model.state_dict(), MODEL_DIR)
  end_time = time.time()
  epoch_min, epoch_sec = elapsed_time(start_time, end_time)
  print(f"Epoch:{epoch}")
  print(f"TIME: {epoch_min} minutes : {epoch_sec} seconds")
  # Perplexity is exp(mean cross-entropy loss).
  print(f"Training loss {train_loss:.3f} ||  Perplexity {math.exp(train_loss):7.3f}")
  print(f"Validation loss {valid_loss:.3f} ||  Perplexity {math.exp(valid_loss):7.3f}")



Epoch:0
TIME: 0 minutes : 53 seconds
Training loss 10.191 ||  Perplexity 26649.339
Validation loss 8.235 ||  Perplexity 3770.117
Epoch:1
TIME: 0 minutes : 54 seconds
Training loss 7.110 ||  Perplexity 1224.240
Validation loss 6.927 ||  Perplexity 1019.037
Epoch:2
TIME: 0 minutes : 54 seconds
Training loss 6.392 ||  Perplexity 597.259
Validation loss 6.670 ||  Perplexity 788.066
Epoch:3
TIME: 0 minutes : 54 seconds
Training loss 6.130 ||  Perplexity 459.511
Validation loss 6.553 ||  Perplexity 701.328
Epoch:4
TIME: 0 minutes : 54 seconds
Training loss 6.005 ||  Perplexity 405.520
Validation loss 6.493 ||  Perplexity 660.677
Epoch:5
TIME: 0 minutes : 54 seconds
Training loss 5.936 ||  Perplexity 378.295
Validation loss 6.491 ||  Perplexity 659.335
Epoch:6
TIME: 0 minutes : 54 seconds
Training loss 5.890 ||  Perplexity 361.346
Validation loss 6.470 ||  Perplexity 645.485
Epoch:7
TIME: 0 minutes : 55 seconds
Training loss 5.853 ||  Perplexity 348.344
Validation loss 6.449 ||  Perplexity 63

## Final Evaluation

In [None]:
# Path of the checkpoint file written by the training loop.
MODEL_DIR = os.path.join(DATASET_BASE_PATH, 'model.pt')
# map_location remaps the saved tensors onto the current DEVICE, so a checkpoint
# saved on a GPU runtime can still be loaded when only a CPU is available.
model.load_state_dict(torch.load(MODEL_DIR, map_location=DEVICE))

# No separate test split is loaded in this notebook, so the figure reported as
# "Test" below is computed on the validation iterator.
test_loss = evaluate(model, valid_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 6.428 | Test PPL: 619.131 |
