# Neural Seq2Seq Question Generation with Phrase Representation

# Mounting Drive for Dataset

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset CSVs stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Step 1: Initial Setup

## Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import random
import math
import spacy
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data import Field, TabularDataset, BucketIterator

## Configuration

In [None]:
# !python -m spacy download en     ## If tokenization fails, try updating the spaCy package/model

# Folder on the mounted Drive that holds the train/dev CSV files.
DATASET_BASE_PATH = '/content/drive/My Drive/csv_for_nqg'
TRAIN_FILENAME =  'train-v2.csv'
VALID_FILENAME = 'dev-v2.csv'
# pandas copies of the CSVs; in the cells shown here they are only used for a quick look at the data.
train_df = pd.read_csv(os.path.join(DATASET_BASE_PATH, TRAIN_FILENAME))
valid_df = pd.read_csv(os.path.join(DATASET_BASE_PATH, VALID_FILENAME))

# spaCy English pipeline; tokenizer() below uses its tokenizer component.
spacy_en = spacy.load('en')
# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Setting up the seed for deterministic results

In [None]:
# Fixed seed shared by every random number generator the notebook relies on.
SEED = 1234

# Seed Python, NumPy and PyTorch (CPU and CUDA) so runs are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# The flag lives on the `cudnn` sub-module. The previous spelling
# (`torch.backends.cudnndeterministic`) only created a new, unused attribute,
# so deterministic cuDNN kernels were never actually requested.
torch.backends.cudnn.deterministic = True
# Keep the cuDNN auto-tuner off so it cannot pick different kernels between runs.
torch.backends.cudnn.benchmark = False

## Utility Functions

In [None]:
def tokenizer(text):
    '''
    Split ``text`` into a list of token strings.

    Only the tokenizer of the module-level spaCy pipeline ``spacy_en`` is run;
    the text of each resulting token is collected in order.
    '''
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Step-2 : Preparing Data
* Fields
* Dataset
* Iterators
* Build Vocabulary

## Check data

In [None]:
# Sanity check of the raw training CSV: its (rows, columns) shape, then the first few rows.
print(train_df.shape)
train_df.head()

(18222, 5)


Unnamed: 0,context,sentence,question,answer,sent_ans
0,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Beyonce would take a break from music in which...,2010,Beyoncé announced a hiatus from her music care...
1,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Which year did Beyonce and her father part bus...,2010,Beyoncé announced a hiatus from her music care...
2,Beyoncé announced a hiatus from her music care...,Beyoncé 's musical break lasted nine months an...,Which famous landmark did Beyonce see in China ?,the Great Wall of China,Beyoncé 's musical break lasted nine months an...
3,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,In what year did Beyonce have her hiatus ?,2010,Beyoncé announced a hiatus from her music care...
4,Beyoncé announced a hiatus from her music care...,Beyoncé announced a hiatus from her music care...,Who inspired this hiatus ?,her mother,Beyoncé announced a hiatus from her music care...


## Creating Fields

In [None]:
# One Field is shared by the source (sent_ans) and the target (question), so both
# sides use the same tokenizer, <sos>/<eos> markers, lower-casing and vocabulary.
FIELD = Field(sequential=True,
                   tokenize=tokenizer,
                   init_token='<sos>',
                   eos_token='<eos>',
                   lower=True)

# One (column name, field) pair per CSV column; a None field means the column is not loaded.
TRAIN_VALID_FIELDS = [('context', None), ## Context paragraph: not used in this phase of the project, so no Field is assigned
                    ('sentence', None), ## Plain sentence: not loaded here, because sent_ans is used as the source instead
                    ('question', FIELD), ## Target sequence: the question the model learns to generate
                    ('answer', None), ## Answer on its own: not loaded in this phase
                    ('sent_ans', FIELD) ## Source sequence: the sentence together with its answer
                    ]

## Creating datasets with Fields

In [None]:
# Read the train and dev CSVs from DATASET_BASE_PATH into torchtext datasets,
# processing the columns as described by TRAIN_VALID_FIELDS.
# skip_header=True keeps the CSV header row from being parsed as an example.
train_data, valid_data = TabularDataset.splits(path=DATASET_BASE_PATH,
                                        format='csv',
                                        train = TRAIN_FILENAME,
                                        validation = VALID_FILENAME,
                                        fields = TRAIN_VALID_FIELDS,
                                        skip_header=True)

## Analysing dataset

In [None]:
# Inspect what TabularDataset produced: container types, sizes, the field mapping and one parsed example.
print(f"Type of train data => {type(train_data)} and valid data => {type(valid_data)}")
print(f"Length of train data => {len(train_data)} and valid data => {len(valid_data)}")
print(train_data.fields.items())
example = train_data[0]
print(type(example))
# print(example.sentence, example.question)
# Token lists of the first example: source (sent_ans) followed by target (question).
print(example.sent_ans, example.question)
# All attributes stored on the first example.
print(vars(train_data.examples[0]))

Type of train data => <class 'torchtext.data.dataset.TabularDataset'> and valid data => <class 'torchtext.data.dataset.TabularDataset'>
Length of train data => 18222 and valid data => 871
dict_items([('context', None), ('sentence', None), ('question', <torchtext.data.field.Field object at 0x7f105c4dfa58>), ('answer', None), ('sent_ans', <torchtext.data.field.Field object at 0x7f105c4dfa58>)])
<class 'torchtext.data.example.Example'>
['beyoncé', 'announced', 'a', 'hiatus', 'from', 'her', 'music', 'career', 'in', 'january', '2010', ',', 'heeding', 'her', 'mother', "'s", 'advice', ',', '"', 'to', 'live', 'life', ',', 'to', 'be', 'inspired', 'by', 'things', 'again', '"', '.', 'answer', '2010'] ['beyonce', 'would', 'take', 'a', 'break', 'from', 'music', 'in', 'which', 'year', '?']
{'question': ['beyonce', 'would', 'take', 'a', 'break', 'from', 'music', 'in', 'which', 'year', '?'], 'sent_ans': ['beyoncé', 'announced', 'a', 'hiatus', 'from', 'her', 'music', 'career', 'in', 'january', '2010', 

## Building Vocabulary

In [None]:
# Build the single vocabulary shared by source and target; min_freq=2 is the
# minimum number of occurrences a token needs to get its own entry.
# NOTE(review): valid_data is passed in as well, so validation tokens enter the
# vocabulary even though the message below says "train vocabulary" -- confirm
# whether building from train_data alone was intended.
FIELD.build_vocab(train_data, valid_data, min_freq=2)

print(f"Total length of train vocabulary is {len(FIELD.vocab)}")

Total length of train vocabulary is 24953


## Data Iterator for batches of data

The iterator also takes care of padding the sequences in each batch to a common length.

In [None]:
# Number of examples per batch.
BATCH_SIZE = 128

# Batch iterators for the train and validation sets. sort_key measures the source
# (sent_ans) length of an example, and device=DEVICE places the batch tensors on that device.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size = BATCH_SIZE,
    # sort_key = lambda x: len(x.sentence),
    sort_key = lambda x: len(x.sent_ans),
    device = DEVICE
)

## Understanding Iterator


In [None]:
# Number of batches in each iterator, then a look at a single training batch.
print(f"Size of the train and valid iterators =>>> {len(train_iterator)}, {len(valid_iterator)}")
batch = next(iter(train_iterator))
print(type(batch))
# print(batch.sentence, batch.sentence.shape)   
# Each loaded column is exposed as an attribute holding a tensor of token indices;
# the printed shapes are [sequence length, batch size].
print(batch.sent_ans, batch.sent_ans.shape)
print(batch.question, batch.question.shape)
print(batch.dataset.fields)

Size of the train and valid iterators =>>> 143, 7
<class 'torchtext.data.batch.Batch'>
tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [    7,  1011,     4,  ...,     7,    79, 13501],
        [  823,   315,   130,  ...,  7532,  2290,  1001],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]], device='cuda:0') torch.Size([86, 128])
tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [   13,    19,   507,  ...,    13, 17868,    13],
        [   16,    13,  3222,  ...,   112,  2655,    15],
        ...,
        [    1,   394,     1,  ...,    10,     1,     1],
        [    1,    10,     1,  ...,     3,     1,     1],
        [    1,     3,     1,  ...,     1,     1,     1]], device='cuda:0') torch.Size([21, 128])
{'context': None, 'sentence': None, 'question': <torchtext.data.field.Field object at 0x7f105c4dfa58>, 'answer'

### Note:

* You can see that BucketIterator returns a batch object instead of sentence tensors and question tensors
* Also we can't iterate through batch object
* We can overcome this either by writing a wrapper around the iterator that returns the required tensors, or by reading the needed attributes from each batch with a little extra code. In this tutorial we take the second approach.

# Step-3 : Building Model

We are going to use phrase representation.
* **Why?**
  - In the previous model the decoder RNN struggled because its single hidden state had to hold both the information from the context vector and the history of previously predicted tokens.
* **What is phrase representation?**
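- We pass the context vector, together with the embedded previous token, to the decoder RNN at every time step, and we also feed both of them (alongside the hidden state) to the output linear layer, so the decoder no longer has to carry the context inside its hidden state.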

Compared to the previous notebook we only need to add changes to the decoder model

## Encoder Model


In [None]:
class Encoder(nn.Module):
    """Embeds a batch of source sequences and encodes it with a GRU.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: size of the GRU hidden state.
        dropout: dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        # Kept as an attribute so Seq2Seq can check it against the decoder.
        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        # GRU built with its default layer/direction settings; it has no cell state.
        self.rnn = nn.GRU(emb_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Return the GRU's final hidden state for ``src``.

        ``src`` is [src len, batch size]; the result is
        [n layers * n directions, batch size, hid dim].
        """
        # [src len, batch size] -> [src len, batch size, emb dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)

        # The per-step outputs are not needed, only the final hidden state.
        _, final_hidden = self.rnn(rnn_input)

        return final_hidden

## Decoder Model

In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder that sees the encoder context at every step.

    The context is concatenated to the embedded input token before the GRU and,
    together with the embedding and the new hidden state, before the output layer.

    Args:
        output_dim: number of rows in the embedding table and number of output
            scores per token (target vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: size of the GRU hidden state and of the context vector.
        dropout: dropout probability applied to the embedded token.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # GRU input = embedded token + context.
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)

        # Output layer input = embedded token + new hidden state + context.
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        """Decode one time step.

        Args:
            input: [batch size] token indices for this step.
            hidden: [1, batch size, hid dim] previous decoder hidden state.
            context: [1, batch size, hid dim] encoder context vector.

        Returns:
            (prediction, hidden): scores of shape [batch size, output dim] and
            the updated hidden state of shape [1, batch size, hid dim].
        """
        # [batch size] -> [1, batch size]: a sequence of length one.
        step_tokens = input.unsqueeze(0)

        # [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(step_tokens))

        # [1, batch size, emb dim + hid dim]
        rnn_input = torch.cat((embedded, context), dim=2)

        # The GRU's per-step output is not used; only the new hidden state is kept.
        _, hidden = self.rnn(rnn_input, hidden)

        # Drop the leading length-1 axis of each piece and join them feature-wise:
        # [batch size, emb dim + hid dim * 2]
        pieces = (embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0))
        features = torch.cat(pieces, dim=1)

        # [batch size, output dim]
        prediction = self.fc_out(features)

        return prediction, hidden

## Seq2Seq Model

In [None]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a decoder together and unrolls the decoder over time.

    Args:
        encoder: module mapping ``src`` to a context tensor; must expose ``hid_dim``.
        decoder: module called as ``decoder(input, hidden, context)``; must expose
            ``hid_dim`` and ``output_dim``.
        device: device on which the tensor of collected predictions is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Produce decoder scores for every target position except the first.

        Args:
            src: [src len, batch size] source token indices.
            trg: [trg len, batch size] target token indices; row 0 supplies the
                first decoder input.
            teacher_forcing_ratio: probability, drawn independently at each step,
                of feeding the ground-truth token (rather than the model's own
                top prediction) as the next decoder input.

        Returns:
            Tensor of shape [trg len, batch size, output dim]; row 0 stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Collected per-step predictions.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # The encoder result is both the fixed context and the decoder's initial hidden state.
        context = self.encoder(src)
        decoder_state = context

        # First decoder input: row 0 of the target.
        decoder_input = trg[0, :]

        for t in range(1, trg_len):
            step_scores, decoder_state = self.decoder(decoder_input, decoder_state, context)
            outputs[t] = step_scores

            # One random draw per step decides between ground truth and prediction.
            use_ground_truth = random.random() < teacher_forcing_ratio

            if use_ground_truth:
                decoder_input = trg[t]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs

# Step-4 : Setting up parameters and other functions required for training

## Setting model and parameters

In [None]:
# Source and target share FIELD, so one vocabulary size serves both sides.
INPUT_DIM = OUTPUT_DIM = len(FIELD.vocab)
ENC_EMB_DIM = DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = DEC_DROPOUT = 0.5

# Encoder and decoder are built with the same hidden size, as Seq2Seq requires.
enc = Encoder(input_dim=INPUT_DIM,
              emb_dim=ENC_EMB_DIM,
              hid_dim=HID_DIM,
              dropout=ENC_DROPOUT)
dec = Decoder(output_dim=OUTPUT_DIM,
              emb_dim=DEC_EMB_DIM,
              hid_dim=HID_DIM,
              dropout=DEC_DROPOUT)

# GPU when available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assemble the full model and move its parameters to the chosen device.
model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

## Initializing weights and counting parameters

In [None]:
def init_weights(m):
    """Overwrite every parameter reachable from ``m`` with draws from N(0, 0.01^2).

    All parameters of ``m`` and of its submodules are re-initialised in place;
    nothing is returned.
    """
    for param in m.parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
# Module.apply calls init_weights on every submodule and finally on the model itself.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(24953, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(24953, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=24953, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the trainable parameters of ``model``.

    Parameters whose ``requires_grad`` is False are left out of the count.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 47,892,601 trainable parameters


## Optimizer and loss functions

In [None]:
# Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())
# Vocabulary index of the padding token; targets equal to it are ignored by the loss.
PAD_IDX = FIELD.vocab.stoi[FIELD.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

## Time function for evaluating time

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as integers, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds

## Training and Evaluation function

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: called as ``model(src, trg)``; must return scores of shape
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing ``sent_ans`` (source) and
            ``question`` (target) tensors.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (scores, targets).
        clip: maximum gradient norm used for clipping.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        src = batch.sent_ans
        trg = batch.question

        optimizer.zero_grad()

        # scores = [trg len, batch size, output dim]
        scores = model(src, trg)
        vocab_size = scores.shape[-1]

        # Position 0 is skipped on both sides, then everything is flattened:
        # scores -> [(trg len - 1) * batch size, output dim]
        # targets -> [(trg len - 1) * batch size]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        # Clip the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: called as ``model(src, trg, 0)``, i.e. with teacher forcing off;
            must return scores of shape [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing ``sent_ans`` (source) and
            ``question`` (target) tensors.
        criterion: loss taking (scores, targets).

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.eval()

    running_loss = 0

    # No gradients are tracked during evaluation.
    with torch.no_grad():
        for batch in iterator:
            src = batch.sent_ans
            trg = batch.question

            # Third argument 0 turns teacher forcing off.
            scores = model(src, trg, 0)
            vocab_size = scores.shape[-1]

            # Skip position 0 and flatten, exactly as in training:
            # scores -> [(trg len - 1) * batch size, output dim]
            # targets -> [(trg len - 1) * batch size]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

## Start Training

In [None]:
N_EPOCHS = 20
# Maximum gradient norm handed to train() for gradient clipping.
CLIP = 1

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One full pass over the training data, then a pass over the validation data.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    
    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 57s
	Train Loss: 6.211 | Train PPL: 498.376
	 Val. Loss: 6.348 |  Val. PPL: 571.330
Epoch: 02 | Time: 0m 57s
	Train Loss: 5.545 | Train PPL: 255.894
	 Val. Loss: 6.425 |  Val. PPL: 616.815
Epoch: 03 | Time: 0m 58s
	Train Loss: 5.389 | Train PPL: 219.093
	 Val. Loss: 6.469 |  Val. PPL: 644.576
Epoch: 04 | Time: 0m 59s
	Train Loss: 5.270 | Train PPL: 194.417
	 Val. Loss: 6.527 |  Val. PPL: 683.455
Epoch: 05 | Time: 0m 58s
	Train Loss: 5.165 | Train PPL: 175.047
	 Val. Loss: 6.612 |  Val. PPL: 744.259
Epoch: 06 | Time: 0m 59s
	Train Loss: 5.092 | Train PPL: 162.738
	 Val. Loss: 6.583 |  Val. PPL: 722.838
Epoch: 07 | Time: 0m 59s
	Train Loss: 4.981 | Train PPL: 145.571
	 Val. Loss: 6.664 |  Val. PPL: 783.996
Epoch: 08 | Time: 0m 59s
	Train Loss: 4.898 | Train PPL: 133.959
	 Val. Loss: 6.760 |  Val. PPL: 862.946
Epoch: 09 | Time: 0m 59s
	Train Loss: 4.806 | Train PPL: 122.277
	 Val. Loss: 6.679 |  Val. PPL: 795.670
Epoch: 10 | Time: 0m 59s
	Train Loss: 4.678 | Train PPL

# Results and Evaluation

In [None]:
# Restore the checkpoint that the training loop saved at its best validation loss.
model.load_state_dict(torch.load('model.pt'))

# NOTE(review): this re-evaluates on valid_iterator, so the value printed as
# "Test" below is the validation-set loss; no separate test split is loaded in this notebook.
test_loss = evaluate(model, valid_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 6.348 | Test PPL: 571.330 |
