<a href="https://colab.research.google.com/github/shivammehta007/NLPResearch/blob/master/Seq2Seq_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sequence to Sequence Machine Translation [TUT]

We will be writing an encoder-decoder model to try to Machine Translate with help of NLP and Pytorch \\
Paper: https://arxiv.org/abs/1409.3215 \\
Tutorial Source: https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

In [0]:
%%capture
!pip install -U tqdm
!python -m spacy download de
# Restart Runtime after this for tqdm to be updated

In [0]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


import torchtext.data as data
from torchtext.datasets import TranslationDataset, Multi30k

import os
import spacy
import math
import random
from tqdm.notebook import tqdm

## Seeding
For duplication of results

In [0]:
def seed_all(seed=1234):
    """Seed the results for duplication"""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

SEED = 1234
seed_all(SEED)

## PreProcessing

### Tokenizing

In [0]:
spacy_en = spacy.load('en')
spacy_de = spacy.load('de')

In [0]:
def tokenize_en(sentence):
    return [word.text for word in spacy_en.tokenizer(sentence)]

def tokenize_de(sentence):
    return [word.text for word in spacy_de.tokenizer(sentence)][::-1]

### Data Loaders
Create two Field Texts for Source and Desitnation

In [0]:
source = data.Field(tokenize=tokenize_de,
               init_token='<sos>',
               eos_token='<eos>',
               lower=True)

destination = data.Field(tokenize=tokenize_en,
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True)

In [7]:
%%time
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(source, destination))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 502kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 176kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 164kB/s]


CPU times: user 18.5 s, sys: 134 ms, total: 18.6 s
Wall time: 24.3 s


In [8]:
len(train_data), len(valid_data), len(test_data)

(29000, 1014, 1000)

In [0]:
source.build_vocab(train_data, min_freq=2)
destination.build_vocab(train_data, min_freq=2)

#### Set up Device, CPU or GPU
To put Iterator onto that device

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(datasets=(train_data,valid_data, test_data),
                                                                           batch_size=BATCH_SIZE, 
                                                                           device=device)

## Building the Model

For starters this is uni-directional

#### Encoder

In [0]:
class Encoder(nn.Module):
    """Encoder RNN for the Seq2Seq Model"""

    def __init__(self, input_dim, embedding_dim, hid_dim, n_layers, dropout):
        super(Encoder, self).__init__()

        self.n_layers = n_layers
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, source_sentence):
        embedding  = self.embedding(source_sentence)
        outputs, (hidden, cell) = self.lstm(embedding)
        return hidden, cell

#### Decoder

In [0]:
class Decoder(nn.Module):
    """Decoder RNN for the Seq2Seq Model"""
    
    def __init__(self, output_dim, embedding_dim, hid_dim, n_layers, dropout):
        super(Decoder, self).__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.embedding(input)
        output, (hidden, cell) = self.lstm(embedded)
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


#### Seq2Seq

In [0]:
class Seq2Seq(nn.Module):
    """Final Sequence to Sequence Model"""

    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hid_dim == decoder.hid_dim, "Hidden Dimensions are not equal"
        assert encoder.n_layers == decoder.n_layers, "Number of Layers are different"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # Storage tensor
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = trg[0,:]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            teacher_forcing = random.random() < teacher_forcing_ratio

            highest_predicted = torch.argmax(output, dim=1)

            input = trg[t] if teacher_forcing else highest_predicted
        
        return outputs

### Training

#### HyperParameters and Model Initialization

In [0]:
INPUT_DIM = len(source.vocab)
OUTPUT_DIM = len(destination.vocab)
ENCODING_EMBEDDING_DIM = 256
DECODING_EMBEDDING_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5


encoder = Encoder(INPUT_DIM, ENCODING_EMBEDDING_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DECODING_EMBEDDING_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(encoder, decoder, device).to(device)

Initializing weights from a normal distribution of -0.08 to 0.08

In [61]:
def init_weights(model):
        for name, parameters in model.named_parameters():
            nn.init.uniform_(parameters.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
  )
)

In [62]:
print('Total Parametrs : {:,}'.format(sum(p.numel() for p in model.parameters())))

Total Parametrs : 13,899,013


#### Initialize Optimizer and tokens

In [0]:
optimizer = optim.Adam(model.parameters())

In [64]:
DST_PAD_TKN = destination.vocab.stoi[destination.pad_token]
print(DST_PAD_TKN)

1


In [0]:
criterion = nn.CrossEntropyLoss(ignore_index=DST_PAD_TKN)

## Training

At each iteration

*   get the source and target sentences from the batch, $X$ and $Y$
zero the gradients calculated from the last batch
*   feed the source and target into the model to get the output, $\hat{Y}$
*   as the loss function only works on 2d inputs with 1d targets we need to flatten each of them with .view
    *   we slice off the first column of the output and target tensors 
*   calculate the gradients with loss.backward()
*   clip the gradients to prevent them from exploding (a common issue in RNNs)
*   update the parameters of our model by doing an optimizer step
*   sum the loss value to a running total




#### Training Process

In [0]:
def train(model, iterator, optimizer, criterion, clip):

    model.train()
    epoch_loss = 0.0

    for i, batch in tqdm(enumerate(iterator), total=len(iterator), leave=False):
        source_sentence, destination_sentence = batch.src, batch.trg
        optimizer.zero_grad()

        output = model(source_sentence, destination_sentence)
        if i == 0:
            print(output.shape)

        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        destination_sentence = destination_sentence[1:].view(-1)

        loss = criterion(output, destination_sentence)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    
    return epoch_loss/len(iterator)


#### Evaluation Process

In [0]:
def evaluate(mode, iterator, criterion):
    model.eval()
    epoch_loss = 0.0

    for i, batch in tqdm(enumerate(iterator), total=len(iterator), leave=False):
        source_sentence, destination_sentence = batch.src, batch.trg
        
        output = model(source_sentence, destination_sentence)

        output_shape = output.shape[-1]

        output = output[1:].view(-1, output_shape)
        destination_sentence = destination_sentence[1:].view(-1)
        loss = criterion(output, destination_sentence)

        epoch_loss += loss
    
    return epoch_loss/len(iterator)

#### Training

In [68]:
EPOCHS = 10
CLIP = 1

for epoch in range(EPOCHS):

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    validation_loss = evaluate(model, valid_iterator, criterion)

    print('EPOCH : {:02} | Train Loss: {:.4f}  Validation Loss: {:.4f}'.format(epoch + 1, train_loss, validation_loss))

HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([35, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 01 | Train Loss: 5.5082  Validation Loss: 4.8954


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([29, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 02 | Train Loss: 4.8482  Validation Loss: 4.5741


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([28, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 03 | Train Loss: 4.6823  Validation Loss: 4.5831


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([34, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 04 | Train Loss: 4.6187  Validation Loss: 4.5491


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([31, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 05 | Train Loss: 4.5814  Validation Loss: 4.4663


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([24, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 06 | Train Loss: 4.5250  Validation Loss: 4.4918


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([32, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 07 | Train Loss: 4.5455  Validation Loss: 4.4351


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([32, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 08 | Train Loss: 4.5075  Validation Loss: 4.4314


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([32, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 09 | Train Loss: 4.4965  Validation Loss: 4.4486


HBox(children=(FloatProgress(value=0.0, max=227.0), HTML(value='')))

torch.Size([37, 128, 5893])


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

EPOCH : 10 | Train Loss: 4.4822  Validation Loss: 4.5766


## Testing

In [70]:
test_loss = evaluate(model, test_iterator, criterion)

print('Train Loss: {:.4f}  Validation Loss: {:.4f}'.format(train_loss, validation_loss))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

Train Loss: 4.4822  Validation Loss: 4.5766
