<a href="https://colab.research.google.com/github/satyajitghana/TSAI-DeepNLP-END2.0/blob/main/07_Seq2Seq/WikiQA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Show which GPU (if any) is attached to this runtime.
! nvidia-smi

Thu Jun 24 19:43:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Wiki QA Model

In [None]:
! pip install colorlog --quiet
! pip install gdown==3.13.0 --quiet
! pip install spacy==3.0.6 --quiet
! pip install pytorch-lightning --quiet
! pip install omegaconf --quiet

In [None]:
# Pin torch and torchtext to fixed versions; this notebook imports
# `torchtext.experimental.functional` and `torchtext.vocab.Vocab`.
! pip install torch==1.8.1+cu102 torchtext==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Download the spaCy English model used by the dataset tokenizer (`en_core_web_sm`).
! python -m spacy download en_core_web_sm

Import Everything

In [None]:
import torch
import torchtext
import gdown
import math

import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy

from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
from torch.nn import functional as F
import torch.optim as optim

from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer

import torchtext.experimental.functional as text_f

from pathlib import Path
from functools import partial

from tqdm.auto import tqdm

from collections import Counter
from torchtext.vocab import Vocab

from sklearn.model_selection import train_test_split

import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from typing import *

import random

from tqdm.auto import tqdm
from omegaconf import OmegaConf

from IPython.display import display, HTML

sns.set()

  "`pytorch_lightning.metrics.*` module has been renamed to `torchmetrics.*` and split off to its own package"


## WikiQA Dataset

In [None]:
def build_vocab_from_iterator(iterator, num_lines=None, *args, **kwargs):
    """
    Count tokens from an iterator and wrap the counts in a ``Vocab``.

    Args:
        iterator: Yields one list (or iterator) of tokens per line.
        num_lines: Optional number of items the iterator will yield; only used
            as the progress bar total. (Default: None)
        *args, **kwargs: Forwarded unchanged to ``Vocab`` after the counter.

    Returns:
        The ``Vocab`` built from the accumulated token frequencies.
    """

    token_counts = Counter()
    with tqdm(unit_scale=0, unit='lines', total=num_lines) as progress:
        for line_tokens in iterator:
            # one progress tick per consumed line, regardless of its length
            token_counts.update(line_tokens)
            progress.update(1)
    return Vocab(token_counts, *args, **kwargs)

class WikiQADataset(Dataset):
    """
    WikiQA question/answer pairs exposed as a seq2seq dataset.

    The CSV is fetched with ``gdown`` (cached under ``root``), split 70/30 into
    train/test with a fixed ``random_state`` and tokenized with spaCy. Every item
    is an ``(answer, question)`` pair of token-index tensors.
    """

    URL = 'https://drive.google.com/uc?id=1FFTtPmxu63Dljelg8YsRRn8Yz475MWyv'
    OUTPUT = 'wikiqa_dataset.csv'

    def __init__(self, root, split='train', vocab=None, vectors=None, text_transforms=None, label_transforms=None, ngrams=1):
        """Download (if needed), split and prepare the dataset.

        Args:
            root: Directory in which the CSV is cached.
            split: ``'train'`` or ``'test'``.
            vocab: Optional pre-built vocabulary. When omitted, one is built from
                the questions and answers of the training split.
            vectors: Pre-trained vectors; not supported yet.
            text_transforms: Optional callable applied to the question tokens
                after tokenization/n-grams and before the vocabulary lookup.
            label_transforms: Currently unused.
            ngrams: n-gram size handed to ``text_f.ngrams_func``.

        Raises:
            NotImplementedError: If ``vectors`` is given.
            ValueError: If ``split`` is neither ``'train'`` nor ``'test'``.
        """

        # Plain super(): `super(self.__class__, self)` recurses forever as soon
        # as this class is subclassed.
        super().__init__()

        if vectors:
            raise NotImplementedError('vectors not supported for this dataset as of now')

        if split not in ['train', 'test']:
            raise ValueError(f'split must be either "train" or "test" unknown split {split}')

        # Currently shadowed by the NotImplementedError guard above; kept so the
        # check is already in place once vectors are supported.
        if vocab and vectors:
            raise ValueError('both vocab and vectors cannot be provided')

        self.vocab = vocab
        self.vectors = vectors

        csv_path = Path(root) / self.OUTPUT
        gdown.cached_download(self.URL, csv_path)

        self.generate_tweet_dataset(csv_path)

        # fixed random_state => every instance sees the same train/test partition
        self.train_dset, self.test_dset = train_test_split(self.full_dataset_, test_size=0.3, random_state=42)

        # `split` was validated above, so only these two cases exist
        self.dataset = self.train_dset if split == 'train' else self.test_dset

        # create the tokenizer, here we use spacy
        tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
        self.tokenizer = tokenizer

        # the text transform can only work at the sentence level
        # the rest of tokenization and vocab is done by this class
        self.text_transform = text_f.sequential_transforms(tokenizer, text_f.ngrams_func(ngrams))

        self.vocab_transforms = text_f.sequential_transforms()
        self.vector_transforms = text_f.sequential_transforms()

        def build_vocab(data, transforms):
            def apply_transforms(data):
                for line in data:
                    yield transforms(line)
            return build_vocab_from_iterator(apply_transforms(data), len(data), specials=['<unk>', '<pad>', '<bos>', '<eos>'])

        if self.vectors:
            self.vector_transforms = text_f.sequential_transforms(
                partial(vectors.get_vecs_by_tokens, lower_case_backup=True)
            )
        elif self.vocab is None:
            # the vocabulary always comes from the training split, whichever
            # split this instance serves
            self.vocab = build_vocab(
                pd.concat([self.train_dset['Question'], self.train_dset['Answer']]),
                self.text_transform
            )

        # if the user is using vocab, instead of vector
        if self.vocab is not None:
            # Special-token indices are needed by the collator whether the
            # vocabulary was built here or supplied by the caller.
            self.PAD_IDX = self.vocab['<pad>']
            self.BOS_IDX = self.vocab['<bos>']
            self.EOS_IDX = self.vocab['<eos>']

            self.vocab_transforms = text_f.sequential_transforms(
                text_f.vocab_func(self.vocab), text_f.totensor(dtype=torch.long)
            )

        # label transform is similar to text_transform for this dataset except this does not have vectors
        self.label_transform = text_f.sequential_transforms(
            self.text_transform, self.vocab_transforms
        )

        if text_transforms is not None:
            self.text_transform = text_f.sequential_transforms(
                self.text_transform, text_transforms, self.vocab_transforms, self.vector_transforms
            )
        else:
            self.text_transform = text_f.sequential_transforms(
                self.text_transform, self.vocab_transforms, self.vector_transforms
            )

    def generate_tweet_dataset(self, dataset_file):
        """Load the full CSV into ``self.full_dataset_`` (a pandas DataFrame)."""
        self.full_dataset_ = pd.read_csv(dataset_file)

    def __getitem__(self, idx):
        """Return ``(answer, question)`` for row ``idx`` of the active split."""
        text = self.text_transform(self.dataset['Question'].iloc[idx])
        label = self.label_transform(self.dataset['Answer'].iloc[idx])
        return label, text

    def __len__(self):
        return len(self.dataset)

    def get_vocab(self):
        return self.vocab

    def get_vectors(self):
        return self.vectors

    def batch_sampler_fn(self):
        """Length-bucketed batch sampler; not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('batch_sampler function WIP')

    def collator_fn(self):
        """Build the ``collate_fn`` to pass to a ``DataLoader``.

        The returned function maps a list of ``(answer, question)`` items to
        ``(targets, sequences, lengths)``. Every answer and question is wrapped in
        ``<bos>`` ... ``<eos>``; when no vectors are in use both are padded with
        ``<pad>`` into ``[seq len, batch size]`` tensors. ``lengths`` holds the
        number of non-pad positions of each question *including* ``<bos>`` and
        ``<eos>``, so it matches ``sequences`` and can be used directly for
        sequence packing.
        """
        def collate_fn(batch):

            targets, sequences = zip(*batch)

            bos = torch.tensor([self.BOS_IDX])
            eos = torch.tensor([self.EOS_IDX])

            targets = [torch.cat([bos, item, eos]) for item in targets]
            sequences = [torch.cat([bos, item, eos]) for item in sequences]

            # Measured after adding <bos>/<eos>: measuring before made every
            # length 2 short of the padded tensor (and 0 for an empty question).
            lengths = torch.LongTensor([len(sequence) for sequence in sequences])

            if not self.vectors:
                pad_idx = self.PAD_IDX
                # the model used for this dataset does not expect batch_first=True,
                # so the padded tensors are laid out as [seq len, batch size]
                sequences = torch.nn.utils.rnn.pad_sequence(sequences, padding_value=pad_idx)
                targets = torch.nn.utils.rnn.pad_sequence(targets, padding_value=pad_idx)

            return targets, sequences, lengths

        return collate_fn

class WikiQADataModule(pl.LightningDataModule):
    """
    LightningDataModule for the WikiQA question/answer dataset.

    Wraps ``WikiQADataset`` and exposes train/val/test dataloaders. Validation
    and test both iterate over the dataset's ``'test'`` split.
    """

    # NOTE(review): this label looks like a leftover from a Quora datamodule;
    # the value is left untouched in case anything reads it.
    name = "quora similar questions"

    def __init__(
        self,
        data_dir: str = '.',
        num_workers: int = 2,
        batch_size: int = 128,
        *args,
        **kwargs,
    ):
        """
        Args:
            data_dir: where to save/load the data
            num_workers: how many workers to use for loading data
            batch_size: desired batch size.
            *args, **kwargs: forwarded to ``pl.LightningDataModule``.
        """
        super().__init__(*args, **kwargs)

        self.data_dir = data_dir
        self.num_workers = num_workers
        self.batch_size = batch_size

        # populated by `setup`
        self.dataset_train = None
        self.dataset_val = None
        self.dataset_test = None
        self.vectors = None

        self.Dataset = WikiQADataset

    def prepare_data(self):
        """Saves Dataset files to `data_dir`"""
        # Download only: instantiating the dataset here would also tokenize the
        # corpus and build a vocabulary that is thrown away immediately.
        gdown.cached_download(self.Dataset.URL, Path(self.data_dir) / self.Dataset.OUTPUT)

    def setup(self, stage: Optional[str] = None):
        """Create the train and test datasets (sharing one vocabulary)."""

        self.dataset_train = self.Dataset(self.data_dir, split='train')
        # The dataset always derives its vocabulary from the training split, so
        # reusing the train vocabulary gives the same indices without a second
        # tokenization pass over the corpus.
        self.dataset_test = self.Dataset(
            self.data_dir, split='test', vocab=self.dataset_train.get_vocab()
        )

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader over `dataset` with this module's shared settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            pin_memory=True,
            collate_fn=self.collator_fn
        )

    def train_dataloader(self):
        return self._make_loader(self.dataset_train, shuffle=True)

    def val_dataloader(self):
        # there is no separate validation split; the test split is used
        return self._make_loader(self.dataset_test, shuffle=False)

    def test_dataloader(self):
        return self._make_loader(self.dataset_test, shuffle=False)

    def get_vocab(self):
        return self.dataset_train.get_vocab()

    def get_vectors(self):
        return self.dataset_train.get_vectors()

    @property
    def default_transforms(self):
        """Default train/test transforms; not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('default_transforms not implemented')

    @property
    def collator_fn(self):
        """Collate function of the training dataset (used by every loader)."""
        return self.dataset_train.collator_fn()

## WikiQA Model

In [None]:
class Encoder(nn.Module):
    """Embedding + multi-layer LSTM that encodes a padded batch of sequences.

    Args:
        input_dim: size of the source vocabulary (number of embeddings).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src, text_length):
        """Encode `src` and return the final LSTM states.

        Args:
            src: token indices, [src len, batch size].
            text_length: 1-D tensor with the number of valid (non-pad) leading
                time steps of each sequence in `src`.

        Returns:
            (hidden, cell), each [n layers, batch size, hid dim], taken at the
            last valid time step of every sequence.
        """

        #src = [src len, batch size]

        embedded = self.dropout(self.embedding(src))

        #embedded = [src len, batch size, emb dim]

        # pack_padded_sequence needs the lengths on the CPU; enforce_sorted=False
        # lets the batch stay in its original order
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, text_length.to('cpu'), enforce_sorted=False)

        # Feed the *packed* batch: running the LSTM over `embedded` would keep
        # stepping through the pad positions and return states polluted by them.
        _, (hidden, cell) = self.rnn(packed)

        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]

        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: one token per sequence in, vocabulary scores out.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability for the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # sizes are kept so the enclosing model can sanity-check them
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        Shapes:
            input:  [batch size]
            hidden: [n layers, batch size, hid dim]
            cell:   [n layers, batch size, hid dim]

        Returns:
            (prediction, hidden, cell), where prediction is
            [batch size, output dim] and hidden/cell are the updated states.
        """
        # add a sequence axis of length one -> [1, batch size]
        step_tokens = input.unsqueeze(0)

        # [1, batch size, emb dim]
        step_embedding = self.dropout(self.embedding(step_tokens))

        # rnn_out is [1, batch size, hid dim] since seq len is 1
        rnn_out, (hidden, cell) = self.rnn(step_embedding, (hidden, cell))

        # drop the sequence axis, then project to vocabulary scores
        prediction = self.fc_out(rnn_out.squeeze(0))

        return prediction, hidden, cell

In [None]:
class Seq2Seq(pl.LightningModule):
    """LSTM encoder-decoder trained with cross-entropy and teacher forcing.

    Hyper-parameters read from ``hparams``: ``input_dim``, ``output_dim``,
    ``enc_emb_dim``, ``dec_emb_dim``, ``hidden_dim``, ``n_layers``,
    ``enc_dropout``, ``dec_dropout``, ``pad_idx``, ``clip`` and ``lr``.
    """

    def __init__(self, hparams, *args, **kwargs):
        super().__init__()

        self.save_hyperparameters(hparams)

        self.encoder = Encoder(
            self.hparams.input_dim,
            self.hparams.enc_emb_dim,
            self.hparams.hidden_dim,
            self.hparams.n_layers,
            self.hparams.enc_dropout
        )

        self.decoder = Decoder(
            self.hparams.output_dim,
            self.hparams.dec_emb_dim,
            self.hparams.hidden_dim,
            self.hparams.n_layers,
            self.hparams.dec_dropout
        )

        assert self.encoder.hid_dim == self.decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert self.encoder.n_layers == self.decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

        # positions holding pad_idx in the target do not contribute to the loss
        self.loss = nn.CrossEntropyLoss(ignore_index = self.hparams.pad_idx)

    def forward(self, sequences, targets, text_lengths, teacher_forcing_ratio = 0.5):
        """Decode `targets` step by step, conditioned on the encoded `sequences`.

        Args:
            sequences: source tokens, [src len, batch size].
            targets: target tokens, [trg len, batch size]; row 0 is the first
                decoder input.
            text_lengths: per-sequence lengths handed to the encoder.
            teacher_forcing_ratio: probability of feeding the ground-truth token
                (instead of the model's own argmax) as the next decoder input,
                e.g. 0.75 uses ground-truth inputs 75% of the time.

        Returns:
            Scores of shape [trg len, batch size, output dim]; row 0 is never
            written and stays zero.
        """

        trg_len, batch_size = targets.shape[0], targets.shape[1]
        trg_vocab_size = self.decoder.output_dim

        #tensor to store decoder outputs, allocated directly on the module's device
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(sequences, text_lengths)

        #first input to the decoder is the first row of the target
        decoder_input = targets[0, :]

        for t in range(1, trg_len):

            #insert input token, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)

            #place predictions in a tensor holding predictions for each token
            outputs[t] = output

            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            #get the highest predicted token from our predictions
            top1 = output.argmax(1)

            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            decoder_input = targets[t] if teacher_force else top1

        return outputs

    def shared_step(self, batch, batch_idx, teacher_forcing_ratio = 0.5):
        """Run one batch through the model and return ``{'loss', 'ppl'}``.

        Args:
            batch: ``(targets, sequences, lengths)``.
            batch_idx: index of the batch (unused).
            teacher_forcing_ratio: forwarded to ``forward``.
        """
        targets, sequences, lengths = batch

        output = self(sequences, targets, lengths, teacher_forcing_ratio)

        #targets = [trg len, batch size]
        #output = [trg len, batch size, output dim]

        output_dim = output.shape[-1]

        # row 0 of `output` is never filled, so it is dropped together with the
        # first target row before flattening
        output = output[1:].reshape(-1, output_dim)
        trg = targets[1:].reshape(-1)

        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = self.loss(output, trg)

        # torch.exp on the detached loss: no host sync, no graph kept alive, and
        # a huge loss yields inf instead of math.exp's OverflowError
        ppl = torch.exp(loss.detach())

        return {'loss': loss, 'ppl': ppl}

    def on_after_backward(self):
        """Clip the gradient norm to ``hparams.clip``.

        Clipping has to run after the backward pass; doing it inside the step
        function (before any gradient of the current batch exists) has no effect
        on the update.
        """
        torch.nn.utils.clip_grad_norm_(self.parameters(), self.hparams.clip)

    def training_step(self, batch, batch_idx):
        return self.shared_step(batch, batch_idx)

    def training_epoch_end(self, outputs):
        ppl = torch.stack([x['ppl'] for x in outputs]).mean()
        loss = torch.stack([x['loss'] for x in outputs]).mean()

        log_metrics = {'train_loss': loss, 'train_ppl': ppl}

        self.log_dict(log_metrics, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        # evaluation never feeds ground-truth tokens back into the decoder,
        # which also makes the reported metrics deterministic
        return self.shared_step(batch, batch_idx, teacher_forcing_ratio=0.0)

    def validation_epoch_end(self, outputs):
        ppl = torch.stack([x['ppl'] for x in outputs]).mean()
        loss = torch.stack([x['loss'] for x in outputs]).mean()

        log_metrics = {'val_loss': loss, 'val_ppl': ppl}

        print(f'Epoch: {self.current_epoch}, Test PPL: {ppl}, Test Loss: {loss}')

        self.log_dict(log_metrics, prog_bar=True)

        return log_metrics

    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)

    def test_epoch_end(self, outputs):
        ppl = torch.stack([x['ppl'] for x in outputs]).mean()

        self.log('hp_metric', ppl)

        self.log_dict({'test_ppl': ppl}, prog_bar=True)

    def configure_optimizers(self):
        """Adam over all parameters with learning rate ``hparams.lr``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        return optimizer

In [None]:
# Build the datamodule and create its train/test datasets right away: the
# hyper-parameter cell below needs the vocabulary size and the pad index.
wikiqa_dataset = WikiQADataModule(data_dir='.', batch_size=128, num_workers=2)
wikiqa_dataset.setup()

Cached Downloading: wikiqa_dataset.csv
Downloading...
From: https://drive.google.com/uc?id=1FFTtPmxu63Dljelg8YsRRn8Yz475MWyv
To: /root/.cache/gdown/tmpdfvm2dq9/dl
100%|██████████| 275k/275k [00:00<00:00, 9.42MB/s]


HBox(children=(FloatProgress(value=0.0, max=4790.0), HTML(value='')))


File exists: wikiqa_dataset.csv


HBox(children=(FloatProgress(value=0.0, max=4790.0), HTML(value='')))




In [None]:
# Hyper-parameters consumed by Seq2Seq (and `epochs` by the Trainer below).
hparams = OmegaConf.create({
    # questions and answers share one vocabulary, hence identical sizes
    'input_dim': len(wikiqa_dataset.get_vocab()),
    'output_dim': len(wikiqa_dataset.get_vocab()),
    'enc_emb_dim': 256,
    'dec_emb_dim': 256,
    'hidden_dim': 512,
    'n_layers': 2,
    'enc_dropout': 0.5,
    'dec_dropout': 0.5,
    # max-norm passed to clip_grad_norm_
    'clip': 1,
    # target positions with this index are ignored by the loss
    'pad_idx': wikiqa_dataset.dataset_train.PAD_IDX,
    'lr': 5e-4,
    'epochs': 10,
    # not read by any code in this notebook
    'use_lr_finder': False
})

In [None]:
# Seed python, numpy and torch before building the model so that weight
# initialisation, dropout and the teacher-forcing coin flips are repeatable.
pl.seed_everything(42)

wikiqa_model = Seq2Seq(hparams)

In [None]:
# Single-GPU trainer; the number of epochs comes from the hyper-parameters.
trainer = pl.Trainer(gpus=1, max_epochs=hparams.epochs, progress_bar_refresh_rate=1, reload_dataloaders_every_epoch=False)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Train the model; the datamodule supplies the train and validation dataloaders.
trainer.fit(wikiqa_model, wikiqa_dataset)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | encoder | Encoder          | 5.1 M 
1 | decoder | Decoder          | 7.8 M 
2 | loss    | CrossEntropyLoss | 0     
---------------------------------------------
12.9 M    Trainable params
0         Non-trainable params
12.9 M    Total params
51.496    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Epoch: 0, Test PPL: 5423.5634765625, Test Loss: 8.59850788116455


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 0, Test PPL: 329.2035827636719, Test Loss: 5.630014896392822


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 1, Test PPL: 323.3686828613281, Test Loss: 5.590897560119629


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 2, Test PPL: 320.0765380859375, Test Loss: 5.55886173248291


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 3, Test PPL: 308.7172546386719, Test Loss: 5.521171569824219


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 4, Test PPL: 328.9159851074219, Test Loss: 5.575353622436523


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 5, Test PPL: 323.0148620605469, Test Loss: 5.544531345367432


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 6, Test PPL: 321.3564147949219, Test Loss: 5.539650917053223


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 7, Test PPL: 338.40802001953125, Test Loss: 5.586311340332031


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 8, Test PPL: 344.0347595214844, Test Loss: 5.59952449798584


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch: 9, Test PPL: 352.4696960449219, Test Loss: 5.6175217628479



## TensorBoard logs can be viewed at: https://tensorboard.dev/experiment/ilMEHBPqQv6Tqh5DWC2SKw/

In [None]:
# ! rm -r lightning_logs/

In [None]:
# ! tensorboard dev upload --logdir lightning_logs \
#     --name "END2 07_Seq2Seq WikiQA - Satyajit" \
#     --description "Experiments on Seq2Seq Architecture with WikiQA Dataset"

TPU Experiment

In [None]:
# ! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl\
# ! pip install pytorch-lightning