# Neural Question Generation using Attention.
**Updating**
* Masking - For the model to ignore certain values
* Packed Padded Sequences - RNN to skip padded sequences
### And
- Interference is taken <> Given a sentence => Question is Generated
- BLEU Scores for evaluation

# Step-1 : Initial Setup


## Mounting Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing Libraries

In [None]:
import torch
import torchtext
import torch.nn as nn
from torchtext.data import  Field, BucketIterator, TabularDataset
import torch.nn.functional as F
import torch.optim as optim

import os
import random
from tqdm import tqdm
import spacy
import math
import time
import numpy as np
import pandas as pd

## Config

In [None]:
# !python -m spacy download en
spacy_en = spacy.load('en')

DEVICE = torch.device ('cuda' if torch.cuda.is_available() else 'cpu')

DATASET_BASE_PATH = '/content/drive/My Drive/csv_for_nqg'
TRAIN_FILENAME =  'train-v2.csv'
VALID_FILENAME = 'dev-v2.csv'
train_df = pd.read_csv(os.path.join(DATASET_BASE_PATH, TRAIN_FILENAME))
valid_df = pd.read_csv(os.path.join(DATASET_BASE_PATH, VALID_FILENAME))

## Setting Seeds for deterministic Values

In [None]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Utility Functions

In [None]:
def tokenizer(text):
  return [tok.text for tok in spacy_en.tokenizer(text)]

# Step-2 : Preparing Data

## Analysing dataframe

In [None]:
train_df.head()

Unnamed: 0,context,sentence,question,answer,sent_ans
0,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Beyonce would take a break from music in which...,2010,Beyoncé announced a hiatus from her music care...
1,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Which year did Beyonce and her father part bus...,2010,Beyoncé announced a hiatus from her music care...
2,Beyoncé announced a hiatus from her music care...,Beyoncé 's musical break lasted nine months an...,Which famous landmark did Beyonce see in China ?,the Great Wall of China,Beyoncé 's musical break lasted nine months an...
3,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,In what year did Beyonce have her hiatus ?,2010,Beyoncé announced a hiatus from her music care...
4,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Who inspired this hiatus ?,her mother,Beyoncé announced a hiatus from her music care...


## Creating Fields

In [None]:
TRAIN_FIELD = Field(sequential = True,
                          init_token = '<sos>',
                          tokenize = tokenizer,
                          lower = True,
                          eos_token = '<eos>',
                          # include_lengths = True,
                          pad_token = '<pad>'
                          )

VALID_FIELD = Field(sequential = True,
                          init_token = '<sos>',
                          tokenize = tokenizer,
                          lower = True,
                          eos_token = '<eos>',
                          pad_token = '<pad>'
                          )

TRAIN_VAL_FIELDS = [('context', None),
                    ('sentence', None),
                    ('question', VALID_FIELD),
                    ('answer', None),
                    ('sent_ans', TRAIN_FIELD)]

## Creating Datasets

In [None]:
train_data, valid_data = TabularDataset.splits(path=DATASET_BASE_PATH,      
                                          format='csv',
                                          train=TRAIN_FILENAME,
                                          validation=VALID_FILENAME,
                                          fields=TRAIN_VAL_FIELDS,
                                          skip_header=True
                                          )

## Building Vocabulary

In [None]:
TRAIN_FIELD.build_vocab(train_data, valid_data, min_freq=2)
VALID_FIELD.build_vocab(train_data, valid_data, min_freq=2)

## Creating DataIterators

In [None]:
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 128, 128

train_iter, valid_iter = BucketIterator.splits(datasets = (train_data, valid_data),
                                               batch_sizes = (TRAIN_BATCH_SIZE, VALID_BATCH_SIZE),
                                               sort_within_batch = True,
                                               sort_key = lambda x : len(x.sent_ans),
                                               device = DEVICE
                                               )

## Checking Datasets, Iterators and Fields

In [None]:
print(f"Lenght of train_data is {len(train_data)}, valid_data is {len(valid_data)}")
print(f"Type of train_data is {type(train_data)}, valid_data is {type(valid_data)}")
print(train_data.fields.items())
example = train_data[0]
print(type(example))
print(example.sent_ans, example.question)
print(vars(train_data.examples[0]))

Lenght of train_data is 18222, valid_data is 871
Type of train_data is <class 'torchtext.data.dataset.TabularDataset'>, valid_data is <class 'torchtext.data.dataset.TabularDataset'>
dict_items([('context', None), ('sentence', None), ('question', <torchtext.data.field.Field object at 0x7ff27ee027b8>), ('answer', None), ('sent_ans', <torchtext.data.field.Field object at 0x7ff27ee02710>)])
<class 'torchtext.data.example.Example'>
['beyoncé', 'announced', 'a', 'hiatus', 'from', 'her', 'music', 'career', 'in', 'january', '2010', ',', 'heeding', 'her', 'mother', "'s", 'advice', ',', '"', 'to', 'live', 'life', ',', 'to', 'be', 'inspired', 'by', 'things', 'again', '"', '.', 'answer', '2010'] ['beyonce', 'would', 'take', 'a', 'break', 'from', 'music', 'in', 'which', 'year', '?']
{'question': ['beyonce', 'would', 'take', 'a', 'break', 'from', 'music', 'in', 'which', 'year', '?'], 'sent_ans': ['beyoncé', 'announced', 'a', 'hiatus', 'from', 'her', 'music', 'career', 'in', 'january', '2010', ',', '

In [None]:
print(f"Size of batch  of the train_data is {len(train_iter)} and valid_data is {len(valid_iter)}")
batch = next(iter(train_iter))
print(type(batch))
print(batch.sent_ans, batch.sent_ans[0].shape)
print(batch.question, batch.question.shape)
print(batch.dataset.fields)
print(TRAIN_FIELD.vocab.stoi[TRAIN_FIELD.pad_token], TRAIN_FIELD.vocab.itos[2])
print(TRAIN_FIELD.pad_token, TRAIN_FIELD.pad_first)

Size of batch  of the train_data is 143 and valid_data is 7
<class 'torchtext.data.batch.Batch'>
tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [11078,     4,    10,  ...,    10,    10,   911],
        [  549,  1897,    65,  ...,   198,   198,  7229],
        ...,
        [   40,    89,     8,  ...,     8,     8,   742],
        [13561,  1259,  7752,  ...,   278,   198,   234],
        [    3,     3,     3,  ...,     3,     3,     3]], device='cuda:0') torch.Size([128])
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [  77,    6,    8,  ...,    8,    8,    0],
        [  33,  690,  163,  ...,    6,    6, 4932],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0') torch.Size([22, 128])
{'context': None, 'sentence': None, 'question': <torchtext.data.field.Field object at 0x7ff27ee027b8>, 'answer': None, 'sent_ans': <torchtext

# Step-3 : Building Model

## Encoder

In [None]:
class Encoder(nn.Module):
  def __init__(self, input_dim, embed_dim, enc_hid_dim, dec_hid_dim, dropout):
    super(Encoder, self).__init__()
    
    self.embedding = nn.Embedding(input_dim, embed_dim)
    self.rnn = nn.GRU(embed_dim, enc_hid_dim, bidirectional = True)
    self.fc = nn.Linear(enc_hid_dim*2, dec_hid_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    # src = [src_len, batch_size]
    embedded = self.embedding(src)
    # embedded = [src_len, batch_size, embed_dim]

    output, hidden = self.rnn(embedded)
    # output = [src_len, batch_size, n_direction*hid_dim]
    # hidden = [num_dir * num*layers, batch_size, hid_dim]

    #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
    #outputs are always from the last layer

    # hidden[-2, :, :] is the last layer of the forward RNN
    # hidden[-1, :, :] is the last layer of the backward RNN

    hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
    # hidden = [batch_size, dec_hid_dim]
    # output = [src_len, batch_size, 2*enc_hid_dim]
    return output, hidden

## Attention

* Attention will take in the previous hidden state of the decoder and all of the stacked forward and backward hidden states from the encoder,  𝐻 . The layer will output an attention vector,  𝑎𝑡 , that is the length of the source sentence, each element is between 0 and 1 and the entire vector sums to 1.
* First, we calculate the energy between the previous decoder hidden state and the encoder hidden states. As our encoder hidden states are a sequence of  𝑇  tensors, and our previous decoder hidden state is a single tensor, the first thing we do is repeat the previous decoder hidden state  𝑇  times.
* We currently have a [dec hid dim, src len] tensor for each example in the batch. We want this to be [src len] for each example in the batch as the attention should be over the length of the source sentence. This is achieved by multiplying the energy by a [1, dec hid dim] tensor,  𝑣 .
* Finally, we ensure the attention vector fits the constraints of having all elements between 0 and 1 and the vector summing to 1 by passing it through a  softmax  layer.

In [None]:
class Attention(nn.Module):
  def __init__(self, enc_hid_dim, dec_hid_dim):
    super(Attention, self).__init__()
    
    self.attn = nn.Linear((enc_hid_dim*2)+dec_hid_dim, dec_hid_dim)
    self.v = nn.Linear(dec_hid_dim, 1, bias=False)

  def forward(self, enc_outputs, hidden):
    # enc_outputs = [src_len, batch_size, 2*enc_hid_dim]
    # hidden = [batch_size, dec_hid_dim]

    batch_size = enc_outputs.shape[1]
    src_len = enc_outputs.shape[0]
    hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
    enc_outputs = enc_outputs.permute(1, 0, 2)

    # hidden = [batch_size, src_len, dec_hid_dim]
    # enc_outputs = [batch_size, src_len, enc_hid_dim*2]
    combined = torch.cat((hidden, enc_outputs), dim=2)
    energy = torch.tanh(self.attn(combined))

    # energy = [batch_size, src_len, dec_hid_dim]
    attention = self.v(energy).squeeze(2)

    # attention = [batch_size, src_len]

    return F.softmax(attention, dim=1)
    

## Decoder

In [None]:
class Decoder(nn.Module):
  def __init__(self, output_dim, embed_dim, enc_hid_dim, dec_hid_dim, attention, dropout):
    super(Decoder, self).__init__()
    self.output_dim = output_dim

    self.embedding = nn.Embedding(output_dim, embed_dim)
    self.attention = attention
    self.rnn = nn.GRU((enc_hid_dim*2) + embed_dim, dec_hid_dim)
    self.fc = nn.Linear(((enc_hid_dim*2) + dec_hid_dim + embed_dim), output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input, hidden, encoder_outputs):
    # input = [batch_size]
    # hidden = [batch_size, dec_hid_dim]
    # encoder_outputs = [src_len, batch_size, 2*enc_hid_dim]
    input = input.unsqueeze(0)
    # input = [1, batch_size]
    embedded = self.dropout(self.embedding(input))
    # embedded = [1, batch_size, embed_dim]

    a = self.attention(encoder_outputs, hidden)
    # attention = [batch_size, src_len]
    a = a.unsqueeze(1)
    # attention = [batch_size, 1, src_len]
    encoder_outputs = encoder_outputs.permute(1, 0, 2)
    # encoder_outputs = [batch_size, src_len, 2*enc_hid_dim]

    weighted = torch.bmm(a, encoder_outputs)
    # weighted = [batch_size, 1, enc_hid_dim*2]
    weighted = weighted.permute(1, 0, 2)
    # weighted = [1, batch_size, enc_hid_dim*2]

    rnn_input = torch.cat((weighted, embedded), dim=2)

    output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

    #output = [1, batch size, dec hid dim]
    #hidden = [1, batch size, dec hid dim]
    
    assert (output==hidden).all()
    embedded = embedded.squeeze(0)
    output = output.squeeze(0)
    weighted = weighted.squeeze(0)
    
    prediction = self.fc(torch.cat((output, weighted, embedded), dim = 1))
    
    #prediction = [batch size, output dim]
    
    return prediction, hidden.squeeze(0)

## Seq2Seq

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

# Step-4 : Train, Test and Helper Functions

## Train Function

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.sent_ans
        trg = batch.question
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

## Validation Function

In [None]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.sent_ans
            trg = batch.question

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

## Time Function

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Step-5 : Initialize models & Start Training

## Parameters and Creating Models

In [None]:
INPUT_DIM = len(TRAIN_FIELD.vocab)
OUTPUT_DIM = len(VALID_FIELD.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5


attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, attn, DEC_DROPOUT)

model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRAIN_FIELD.vocab.stoi[TRAIN_FIELD.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

## Weights Initialization and Count Parameters

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)

The model has 30,788,347 trainable parameters


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(23462, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(8955, 256)
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (rnn): GRU(1280, 512)
    (fc): Linear(in_features=1792, out_features=8955, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

## Start Training

In [None]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 44s
	Train Loss: 5.960 | Train PPL: 387.534
	 Val. Loss: 6.016 |  Val. PPL: 409.930
Epoch: 02 | Time: 1m 44s
	Train Loss: 5.265 | Train PPL: 193.396
	 Val. Loss: 5.966 |  Val. PPL: 390.027
Epoch: 03 | Time: 1m 43s
	Train Loss: 5.011 | Train PPL: 150.124
	 Val. Loss: 5.978 |  Val. PPL: 394.501
Epoch: 04 | Time: 1m 43s
	Train Loss: 4.762 | Train PPL: 116.925
	 Val. Loss: 6.023 |  Val. PPL: 412.640
Epoch: 05 | Time: 1m 44s
	Train Loss: 4.543 | Train PPL:  93.985
	 Val. Loss: 6.027 |  Val. PPL: 414.474
Epoch: 06 | Time: 1m 45s
	Train Loss: 4.273 | Train PPL:  71.763
	 Val. Loss: 6.137 |  Val. PPL: 462.549
Epoch: 07 | Time: 1m 44s
	Train Loss: 3.953 | Train PPL:  52.076
	 Val. Loss: 6.291 |  Val. PPL: 539.734
Epoch: 08 | Time: 1m 44s
	Train Loss: 3.640 | Train PPL:  38.109
	 Val. Loss: 6.345 |  Val. PPL: 569.440
Epoch: 09 | Time: 1m 44s
	Train Loss: 3.333 | Train PPL:  28.027
	 Val. Loss: 6.510 |  Val. PPL: 671.944


# Step-6 : Inference and Analysis

## Calulcating Validation loss

In [None]:
model.load_state_dict(torch.load('tut3-model.pt'))

test_loss = evaluate(model, valid_iter, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')