<div class="alert alert-info">
    <h1 align="center"> Sequence-to-Sequence: Translation</h1>
    <h3 align="center">Deep Learning in Python (HamYad Lab.)</h3>
    <h5 align="center"><a href="http://www.snrazavi.ir">Seyed Naser RAZAVI</a></h5>
</div>

## Introduction

<img src="imgs/Seq2Seq-arch.png" width="90%"/>

### Some Applications

- Machine Translations
- Chatbots
- Question Answering
- Intelligent Word Processors
- Speech Recognition

## Libraries

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import re
import random
from collections import Counter, OrderedDict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

from utils import *
from data_utils import *
from train_utils import *
from wv import WordVector

# setup: remember whether a CUDA device is available; this flag is read later
# when the model and the loss are moved to the GPU
use_cuda = torch.cuda.is_available()

# debugger: an IPython debugger instance kept around for interactive debugging
from IPython.core.debugger import Pdb
pdb = Pdb()

## Data

- A parallel corpus (CSV file) containing pairs of sentences separated by `<TAB>`.
- Every pair contains an English sentence and its corresponding translation in Persian.

<img src="imgs/en2fa_corpus.png" width="50%"/>

In [None]:
# language codes of the two sides of the corpus; also used in the word-vector file names
lang1 = 'en'
lang2 = 'fa'

MAX_LENGTH = 10    # handed to filter_pairs() when sentence pairs are selected
MAX_VOCAB = 30000  # at most this many of the most common words are considered per vocabulary
MIN_COUNT = 3      # words counted fewer times than this are left out of the vocabulary

# ids reserved for the special tokens <PAD>, <UNK>, <SOS> and <EOS>
PAD, UNK, SOS, EOS = 0, 1, 2, 3

### Vocabulary
- The vocabulary object is responsible for tokenization and **numericalization**.
- There is one vocabulary object for each of the source and target languages.

In [None]:
class Vocabulary(object):
    """Word <-> id mapping for one language.

    The ids listed in ``SPECIAL_TOKENS`` are reserved; corpus words are
    numbered after them, most frequent word first.
    """

    SPECIAL_TOKENS = {'<PAD>': PAD, '<UNK>': UNK, '<SOS>': SOS, '<EOS>': EOS}

    def __init__(self, name, counter, min_count=3, max_vocab=30000):
        """Build the vocabulary from word frequencies.

        Args:
            name: language name, stored as ``lang_name``.
            counter: ``collections.Counter`` mapping word -> frequency.
            min_count: words counted fewer times than this are dropped.
            max_vocab: only the ``max_vocab`` most common words are considered.
        """
        self.lang_name = name
        self.word2count = OrderedDict(
            (w, c) for w, c in counter.most_common(max_vocab) if c >= min_count)

        # Corpus words start right after the reserved special-token ids.
        first_id = len(self.SPECIAL_TOKENS)
        self.word2index = {w: i for i, w in enumerate(self.word2count, start=first_id)}
        self.index2word = {i: w for w, i in self.word2index.items()}

        for word, index in self.SPECIAL_TOKENS.items():
            self.word2index[word] = index
            self.index2word[index] = word

    def wtoi(self, word):
        """Return the id of `word`, or the `<UNK>` id for an unknown word."""
        return self.word2index.get(word, self.SPECIAL_TOKENS['<UNK>'])

    def itow(self, index):
        """Return the word with id `index`, or ``'<UNK>'`` for an unknown id.

        A string is always returned so that callers can safely
        ``' '.join(...)`` the results (an int sentinel would raise TypeError).
        """
        return self.index2word.get(index, '<UNK>')

    def __getitem__(self, key):
        """Look up by id when `key` is an int, otherwise by word."""
        if isinstance(key, int):
            return self.itow(key)
        return self.wtoi(key)

    def __len__(self):
        """Number of entries in the word -> id table (special tokens included)."""
        return len(self.word2index)

### Load data

In [None]:
def prepare_data(data_dir, lang1="en", lang2="fa", min_count=3, max_vocab=30000, reverse=False):
    """Load the sentence pairs and build one vocabulary per language.

    The corpus is read with `load_corpus` and filtered with `filter_pairs`
    using the module-level `MAX_LENGTH`. Words are obtained by splitting each
    sentence on single spaces. When `reverse` is true the two language names
    are swapped before the vocabularies are named.

    Returns:
        (src_vocab, tgt_vocab, sentence_pairs)
    """
    pairs = filter_pairs(load_corpus(data_dir, lang1, lang2, reverse), MAX_LENGTH)
    print("{} sentence pairs selected.".format(len(pairs)))

    print("\nBuilding vocabularies for source and target language...")
    src_words, tgt_words = Counter(), Counter()
    for source, target in pairs:
        src_words.update(source.split(' '))
        tgt_words.update(target.split(' '))

    src_name, tgt_name = (lang2, lang1) if reverse else (lang1, lang2)
    src_vocab = Vocabulary(src_name, src_words, min_count, max_vocab)
    tgt_vocab = Vocabulary(tgt_name, tgt_words, min_count, max_vocab)

    print("Number of words in each language:")
    for vocab in (src_vocab, tgt_vocab):
        print(" - [{}]: {}".format(vocab.lang_name, len(vocab)))

    return src_vocab, tgt_vocab, pairs

In [None]:
# build both vocabularies and the filtered sentence pairs from the corpus under 'data'
src_vocab, tgt_vocab, sentence_pairs = prepare_data('data', lang1, lang2, MIN_COUNT, MAX_VOCAB)

In [None]:
# show one randomly chosen (source, target) sentence pair
src_sent, tgt_sent = random.choice(sentence_pairs)
print(src_sent)
print(tgt_sent)

### Split data

In [None]:
# split the pairs into a training and a validation list
# (split() is a project helper; presumably split_ratio is the validation share — confirm)
trn_sentence_pairs, val_sentence_pairs = split(sentence_pairs, split_ratio=0.2)
print(len(trn_sentence_pairs), len(val_sentence_pairs))

### Dataset Class

- Contains source and target sentences
- Provides `(x, y)` pairs for `Dataloader`.

In [None]:
class Seq2SeqDataset(Dataset):
    """Parallel-sentence dataset serving numericalized (source, target) pairs."""

    def __init__(self, sentence_pairs, src_vocab, tgt_vocab):
        """Keep the raw pairs and vocabularies, and numericalize every sentence once."""
        self.sentence_pairs = sentence_pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

        # Numericalize all sentences up front, source side first.
        self.src_ids = [self.encode(src, src_vocab) for src, _ in sentence_pairs]
        self.tgt_ids = [self.encode(tgt, tgt_vocab) for _, tgt in sentence_pairs]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.sentence_pairs)

    def __getitem__(self, index):
        """Return the (x, y) pair of token-id lists stored at `index`."""
        return self.src_ids[index], self.tgt_ids[index]

    def encode(self, sentence, vocab):
        """Turn a space-separated sentence into token ids followed by EOS."""
        token_ids = list(map(vocab.wtoi, sentence.split(' ')))
        token_ids.append(EOS)
        return token_ids

    def decode(self, encoded_sentence, vocab):
        """Turn token ids back into a sentence, leaving out the final id."""
        words = (vocab.itow(token_id) for token_id in encoded_sentence[:-1])
        return ' '.join(words)

In [None]:
# one dataset per split; both share the same source and target vocabularies
train_ds = Seq2SeqDataset(trn_sentence_pairs, src_vocab, tgt_vocab)
valid_ds = Seq2SeqDataset(val_sentence_pairs, src_vocab, tgt_vocab)

print(len(train_ds), len(valid_ds))

In [None]:
# select a random (x, y) pair from training data
idx = random.randint(0, len(train_ds) - 1)
x, y = train_ds[idx]

# print the source token ids and the sentence decoded from them
print(x)
print(train_ds.decode(x, src_vocab))
print()

# print the target token ids and the sentence decoded from them
print(y)
print(train_ds.decode(y, tgt_vocab))

### Dataloader

- Here, we have implemented a custom function named `collate_fn()`.
- The job of this function is to merge a list of samples into a tensor.

In [None]:
batch_size = 128
# collate_fn is a project helper that assembles each batch;
# drop_last=True discards a final batch smaller than batch_size
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=True)

In [None]:
# fetch one training batch (three items per batch) and inspect it
X, Y, lengths = next(iter(train_dl))
print(X.size())
print(Y.size())
print(lengths[:20])

### Load pre-trained word vectors

In [None]:
# word-vector files, one per language
src_wvecs_filename = f'data/wiki.{lang1}.pkl'
tgt_wvecs_filename = f'data/wiki.{lang2}.pkl'

# WordVector is a project class (module `wv`)
src_wvecs = WordVector(lang1, src_wvecs_filename)
tgt_wvecs = WordVector(lang2, tgt_wvecs_filename)

print(len(src_wvecs), len(tgt_wvecs))

In [None]:
def get_embeddings(vocab, wv):
    """Build the embedding matrix for `vocab` from the word vectors `wv`.

    Row ``i`` holds ``wv[vocab[i]]``, i.e. the vector looked up for the word
    that `vocab` returns for id ``i``.

    Returns:
        A ``(len(vocab), wv.vector_size)`` tensor.
    """
    rows, dim = len(vocab), wv.vector_size
    weights = torch.zeros(rows, dim)
    for word_id in tqdm_notebook(range(rows)):
        word = vocab[word_id]
        weights[word_id] = torch.from_numpy(wv[word])
    return weights

In [None]:
# embedding matrix for the source vocabulary
src_embeddings = get_embeddings(src_vocab, src_wvecs)

In [None]:
# embedding matrix for the target vocabulary
tgt_embeddings = get_embeddings(tgt_vocab, tgt_wvecs)

In [None]:
# sanity check: the embedding row for 'the' should equal its pre-trained vector
idx = src_vocab['the']
wv1 = src_embeddings[idx]
wv2 = torch.from_numpy(src_wvecs['the']).float()
print((wv1 == wv2).all())

## Sequence To Sequence Model

<img src='imgs/Seq2Seq-arch.png' width='90%'/>

### Encoder

<img src='imgs/enc.png' width='90%'/>

In [None]:
class EncoderRNN(nn.Module):
    """Embedding + GRU encoder; the GRU is bidirectional when ``ndir == 2``.

    `hidden_size` is the total size of the encoder output; every direction
    gets ``hidden_size // ndir`` units, which is what ``self.hidden_size``
    stores.
    """

    def __init__(self, input_size, embed_size, hidden_size, num_layers=1,
                 ndir=2, dropout_emb=0, dropout_rnn=0, dropout_ctx=0, emb=None):
        """Create the layers.

        Args:
            input_size: number of rows of the embedding table.
            embed_size: embedding width (replaced by ``emb.size(1)`` when `emb` is given).
            hidden_size: total encoder size, must be even.
            num_layers: number of stacked GRU layers.
            ndir: number of directions (2 selects a bidirectional GRU).
            dropout_emb: dropout on the embeddings (layer created only if > 0).
            dropout_rnn: dropout handed to the GRU.
            dropout_ctx: dropout on the GRU outputs (layer created only if > 0).
            emb: optional pre-trained embedding matrix with `input_size` rows.
        """
        super().__init__()
        assert hidden_size % 2 == 0, 'Encoder size must be divisible by 2.'

        self.embed_size = embed_size
        self.hidden_size = hidden_size // ndir  # per-direction size
        self.num_layers = num_layers
        self.ndir = ndir
        self.dropout_emb = dropout_emb
        self.dropout_ctx = dropout_ctx

        if dropout_emb > 0:
            self.emb_dropout = nn.Dropout(dropout_emb)
        if dropout_ctx > 0:
            self.ctx_dropout = nn.Dropout(dropout_ctx)

        pretrained = emb is not None
        if pretrained:
            assert emb.size(0) == input_size, 'Invalid embeddings!'
            self.embed_size = emb.size(1)

        # id 0 is the padding index of the embedding table
        self.embedding = nn.Embedding(input_size, self.embed_size, padding_idx=0)
        self.gru = nn.GRU(self.embed_size, self.hidden_size, num_layers,
                          dropout=dropout_rnn, bidirectional=(ndir == 2))
        if pretrained:
            self.embedding.weight.data = emb

    def forward(self, inputs, src_lenghts, hidden):
        """Encode a batch of token ids.

        Returns:
            (output, hidden, mask): the padded GRU outputs, the final GRU
            hidden state, and a float mask that is 1 where ``inputs != 0``.
        """
        embedded = self.embedding(inputs)
        if self.dropout_emb > 0:
            embedded = self.emb_dropout(embedded)

        # Run the GRU on the packed sequence, then pad the result again.
        packed = pack(embedded, src_lenghts)
        packed_out, hidden = self.gru(packed, hidden)
        output, _ = unpack(packed_out)
        if self.dropout_ctx > 0:
            output = self.ctx_dropout(output)

        mask = (inputs != 0).float()
        return output, hidden, mask

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state wrapped by `to_var`."""
        shape = (self.ndir * self.num_layers, batch_size, self.hidden_size)
        return to_var(torch.zeros(*shape))

### Attention Decoder

#### Attention:
Focus on a different part of the encoder's output at every step of the decoder's own output.
1. Calculate a set of `attention weights`.
2. Compute a weighted sum of encoder outputs.

<img src='imgs/attn-full.png' width='90%'/>

#### Calculating attention weights
Attention weights are computed with another feed-forward layer, using:
- the decoder's current GRU output, and
- the encoder's outputs (the context).

In [None]:
class Attention(nn.Module):
    """Attention over the encoder outputs, scored by a dot product or a small MLP.

    ``forward`` is bound in ``__init__`` to `mlp_forward` or `dot_forward`
    depending on `attn_type`.

    NOTE(review): the two variants return differently shaped results (see
    their docstrings); confirm the shapes expected by the caller before
    switching `attn_type`.
    """

    def __init__(self, hidden_size, context_size, attn_type='mlp'):
        """Create the projection layers.

        Args:
            hidden_size: size of the decoder output vectors.
            context_size: size of the encoder output vectors.
            attn_type: ``'mlp'`` or ``'dot'``.
        """
        super(Attention, self).__init__()

        assert attn_type in ['dot', 'mlp'], f'Unknown attention type {attn_type}'

        if attn_type == 'mlp':
            self.mlp = nn.Linear(context_size, 1, bias=False)
            self.forward = self.mlp_forward
        else:
            self.forward = self.dot_forward

        self.hid2ctx = nn.Linear(hidden_size,  context_size, bias=False)
        self.ctx2ctx = nn.Linear(context_size, context_size, bias=False)
        self.ctx2hid = nn.Linear(context_size, hidden_size,  bias=False)

    def dot_forward(self, hidden, context, mask):
        """ Dot-Product attention.
           Inputs:
              - hidden: decoder current output (1, bs, hs)
              - context: encoder outputs       (sl, bs, cs)
              - mask: 1 for real tokens, 0 for padding (sl, bs); may be None
           Returns:
              - attention weights (1, bs, sl)
              - attended context projected to hidden size (1, bs, hs)
        """
        context_ = self.ctx2ctx(context).permute(1, 2, 0)  # (sl, bs, cs) -> (bs, cs, sl)
        hidden_  = self.hid2ctx(hidden).permute(1, 0, 2)   # (1,  bs, hs) -> (bs,  1, cs)

        # dot product
        scores = torch.bmm(hidden_, context_)              # (bs, 1, sl)
        if mask is not None:
            # Padded positions must not receive any attention weight.
            # A sequence with no real token at all would yield NaN here.
            pad = (mask == 0).transpose(0, 1).unsqueeze(1)  # (sl, bs) -> (bs, 1, sl)
            scores = scores.masked_fill(pad, float('-inf'))
        alpha = F.softmax(scores, dim=-1)                  # (bs, 1, sl)
        output = self.ctx2hid(torch.bmm(alpha, context.transpose(0, 1)))   # (bs, 1, hs)

        return alpha.transpose(0, 1), output.transpose(0, 1)

    def mlp_forward(self, hidden, context, mask):
        """ MLP (additive) attention.
           Inputs:
              - hidden: decoder current output (1,  bs, hs)
              - context: encoder outputs       (sl, bs, cs)
              - mask: 1 for real tokens, 0 for padding (sl, bs)
           Returns:
              - attention weights (sl, bs)
              - attended context projected to hidden size (bs, hs)
        """
        context_ = self.ctx2ctx(context)
        hidden_ = self.hid2ctx(hidden)

        # scores
        scores = self.mlp(torch.tanh(context_ + hidden_)).squeeze(-1)  # (sl, bs)

        # normalize attention scores: a masked softmax over the source positions;
        # the maximum is subtracted before exp() for numerical stability
        alpha = (scores - scores.max(0)[0]).exp().mul(mask)
        alpha = alpha / alpha.sum(0)

        output = self.ctx2hid((alpha.unsqueeze(-1) * context).sum(0))

        return alpha, output

In [None]:
class AttnDecoderRNN(nn.Module):
    """One-step GRU decoder that attends over the encoder outputs.

    Each call embeds the current input tokens, advances the GRU by one step,
    applies attention to the GRU output, and maps the result to
    log-probabilities over the output vocabulary.
    """

    def __init__(self, embed_size, hidden_size, output_size, context_size, num_layers=1,
                 tie_weights=False, attn_type='mlp', dropout_out=0, emb=None, dropout_rnn=0.3):
        """Create the layers.

        Args:
            embed_size: embedding width (replaced by ``emb.size(1)`` when `emb` is given).
            hidden_size: GRU hidden size.
            output_size: size of the output vocabulary.
            context_size: size of the encoder output vectors.
            num_layers: number of stacked GRU layers.
            tie_weights: initialise the output projection from the embedding storage.
            attn_type: attention variant passed to `Attention`.
            dropout_out: dropout before the classifier (layer created only if > 0).
            emb: optional pre-trained embedding matrix with `output_size` rows.
            dropout_rnn: dropout between stacked GRU layers (previously a
                hard-coded 0.3, which remains the default).
        """
        super(AttnDecoderRNN, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.context_size = context_size
        self.tie_weights = tie_weights
        self.attn_type = attn_type
        self.dropout_out = dropout_out
        self.dropout_rnn = dropout_rnn

        if emb is not None:
            assert emb.size(0) == output_size, 'Invalid embeddings!'
            self.embed_size = emb.size(1)

        self.embedding = nn.Embedding(output_size, self.embed_size, padding_idx=0)
        # GRU dropout acts only between stacked layers; passing 0 for a single
        # layer keeps the behaviour and avoids PyTorch's warning.
        self.gru = nn.GRU(self.embed_size, hidden_size, num_layers,
                          dropout=dropout_rnn if num_layers > 1 else 0)
        self.attention = Attention(hidden_size, context_size, attn_type=attn_type)
        self.hid2emb = nn.Linear(hidden_size, self.embed_size)
        self.emb2out = nn.Linear(self.embed_size, output_size)

        if dropout_out > 0:
            self.out_dropout = nn.Dropout(dropout_out)

        if emb is not None:
            self.embedding.weight.data = emb

        if tie_weights:
            # NOTE(review): this shares the underlying storage but keeps two
            # separate Parameters; confirm that this is the intended tying.
            self.emb2out.weight.data = self.embedding.weight.data

    def forward(self, inp, hidden, context, mask):
        """Run one decoding step.

        Args:
            inp: current input token ids, one per batch element.
            hidden: GRU hidden state; its dimension 1 is used as the batch size.
            context: encoder outputs passed to the attention module.
            mask: source padding mask passed to the attention module.

        Returns:
            (output, hidden, attn): log-probabilities over the output
            vocabulary, the new GRU hidden state and the attention weights.
        """
        # Embedding
        embedded = self.embedding(inp).view(1, hidden.size(1), -1)

        # Recurrent layer
        gru_out, hidden = self.gru(embedded, hidden)

        # Attention
        attn, output = self.attention(gru_out, context, mask)

        # from hidden to embeddings
        output = torch.tanh(self.hid2emb(output))
        if self.dropout_out > 0:
            output = self.out_dropout(output)

        # Classifier
        output = self.emb2out(output)
        output = F.log_softmax(output, dim=-1)
        return output, hidden, attn

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state wrapped by `to_var`."""
        return to_var(torch.zeros(self.num_layers, batch_size, self.hidden_size))

<img src='imgs/attn.png' width='90%'/>

```python 
# dot product attention
scores = F.softmax(torch.bmm(hidden_, context_), dim=-1)          # (bs, 1, sl)
output = self.ctx2hid(torch.bmm(scores, context.transpose(0, 1))) # (bs, 1, hs)
```


## Seq2Seq Model with Attention

In [None]:
class Seq2SeqAttn(nn.Module):
    """Encoder-decoder translation model with attention.

    An `EncoderRNN` reads the source batch, the linear layer `enc2dec` turns
    the encoder's final hidden state into the decoder's initial hidden state,
    and an `AttnDecoderRNN` emits one target token per step while attending
    over the encoder outputs.
    """

    def __init__(self, input_size, output_size, enc_embed_size, dec_embed_size,
                 enc_hidden_size, dec_hidden_size, enc_num_layers, dec_num_layers,
                 ndir=2, tie_weights=False, attn_type='mlp', dropout_emb=0,
                 dropout_rnn=0, dropout_ctx=0, dropout_out=0, enc_emb=None, dec_emb=None):
        """Build the encoder, the bridge layer and the decoder.

        `input_size`, the ``enc_*`` sizes, `ndir`, `dropout_emb`,
        `dropout_rnn`, `dropout_ctx` and `enc_emb` configure the encoder;
        `output_size`, the ``dec_*`` sizes, `tie_weights`, `attn_type`,
        `dropout_out` and `dec_emb` configure the decoder.
        """
        super(Seq2SeqAttn, self).__init__()
        self.encoder = EncoderRNN(input_size, enc_embed_size, enc_hidden_size, enc_num_layers, ndir,
                                  dropout_emb, dropout_rnn, dropout_ctx, enc_emb)
        self.enc2dec = nn.Linear(enc_hidden_size, dec_num_layers * dec_hidden_size, bias=True)
        self.decoder = AttnDecoderRNN(dec_embed_size, dec_hidden_size, output_size, enc_hidden_size,
                                      dec_num_layers, tie_weights, attn_type, dropout_out, dec_emb)

    def _init_decoder_hidden(self, enc_hidden, bs):
        """Map the encoder's final hidden state to the decoder's initial one.

        The directions of the top encoder layer are concatenated, projected by
        `enc2dec` to ``dec_num_layers * dec_hidden_size`` values and split
        into one initial state per decoder layer, so encoder and decoder may
        have different depths.
        """
        enc_nl, ndir = self.encoder.num_layers, self.encoder.ndir
        dec_nl, dec_hs = self.decoder.num_layers, self.decoder.hidden_size

        # (nl * ndir, bs, he) -> (nl, bs, ndir * he)
        enc_hidden = (enc_hidden.view(enc_nl, ndir, bs, -1)
                      .permute(0, 2, 1, 3).contiguous().view(enc_nl, bs, -1))
        top = enc_hidden[-1]                                # (bs, ndir * he)
        dec_hidden = torch.tanh(self.enc2dec(top))          # (bs, dec_nl * hd)
        return dec_hidden.view(bs, dec_nl, dec_hs).transpose(0, 1).contiguous()

    def forward(self, inputs, src_lengths, tgt_var=None, teacher_forcing_ratio=0.0):
        """Translate a batch, optionally with teacher forcing.

        Args:
            inputs: source token ids; dimension 1 is the batch.
            src_lengths: source lengths, passed on to the encoder.
            tgt_var: optional target token ids; ``tgt_var.size(0)`` is the
                number of decoding steps. Without it, ``src_lengths[0]``
                steps are decoded.
            teacher_forcing_ratio: probability, per step, of feeding the
                target token instead of the decoder's own prediction.

        Returns:
            (outputs, attentions): per-step decoder outputs and attention
            weights, each stacked along a new first dimension. Decoding stops
            early once every next input is EOS or PAD.
        """
        bs = inputs.size(1)

        # Encoder: keep its final hidden state, which seeds the decoder
        hidden = self.encoder.init_hidden(bs)
        context, hidden, mask = self.encoder(inputs, src_lengths, hidden)

        # Encoder 2 Decoder
        hidden = self._init_decoder_hidden(hidden, bs)

        # Decoder
        dec_input = to_var(torch.LongTensor([[SOS] * bs]))
        dec_outputs, attns = [], []

        if tgt_var is not None:
            tgt_len = tgt_var.size(0)
        else:
            tgt_len = int(src_lengths[0])

        for i in range(tgt_len):
            dec_output, hidden, attn = self.decoder(dec_input, hidden, context, mask)

            dec_outputs += [dec_output]
            attns += [attn]

            # next input to decoder
            if (tgt_var is not None) and (random.random() < teacher_forcing_ratio):
                dec_input = to_var(tgt_var.data[i])
            else:
                topi = dec_output.data.topk(1, dim=1)[1]
                dec_input = to_var(topi.transpose(0, 1))  # (bs, 1) -> (1, bs)

            # stop as soon as every sequence in the batch has ended
            # NOTE(review): `.data[0]` on the summed result relies on the old
            # PyTorch API used throughout this notebook — confirm the target version.
            if torch.sum(dec_input == EOS).data[0] + torch.sum(dec_input == PAD).data[0] == bs:
                break

        return torch.stack(dec_outputs), torch.stack(attns)

    def _set_embeddings_trainable(self, trainable):
        """Switch ``requires_grad`` on both embedding tables."""
        for module in (self.encoder.embedding, self.decoder.embedding):
            for p in module.parameters():
                p.requires_grad = trainable

    def freeze_embeddings(self):
        """Exclude the encoder and decoder embeddings from training."""
        self._set_embeddings_trainable(False)

    def unfreeze_embeddings(self):
        """Make the encoder and decoder embeddings trainable again."""
        self._set_embeddings_trainable(True)

<img src="imgs/attention_paper.png" width="75%"/>

## Training The Model

To train we run the input sentence through the encoder, and keep track of every output and the latest hidden state.

Decoder's inputs:
- The decoder is given the `<SOS>` token as its first input, and 
- The last hidden state of the encoder as its first hidden state.

### Teacher forcing
- “Teacher forcing” is the concept of using the real target outputs as each next input, instead of using the decoder’s guess as the next input. 
- Teacher forcing makes training converge faster, but the trained network may be unstable at inference time, when it has to consume its own predictions.
- So, the best strategy is to use teacher forcing **occasionally**!

In [None]:
def save_model(model_dir, model, val_loss):
    """Save the model's ``state_dict`` as ``<model_dir>/seq2seq-<val_loss>.pth``.

    `val_loss` is formatted with two decimals. The directory is created when
    missing, including any missing parent directories.
    """
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), f'{model_dir}/seq2seq-{val_loss:.2f}.pth')

def load_model(model_dir, input_size, enc_emb_sz, dec_emb_size, enc_hidden_size, dec_hidden_size, output_size, num_layers):
    """Rebuild a `Seq2SeqAttn` and load its weights from `model_dir`.

    ``<model_dir>/seq2seq.pth`` is used when it exists. Otherwise the
    ``seq2seq-<val_loss>.pth`` checkpoint with the lowest loss in its file
    name is picked, which is the naming scheme `save_model` writes.

    Returns:
        The model, moved to the GPU when `use_cuda` is set.

    Raises:
        FileNotFoundError: if `model_dir` contains no usable checkpoint.
    """
    model = Seq2SeqAttn(input_size, output_size, enc_emb_sz, dec_emb_size, enc_hidden_size, dec_hidden_size, num_layers, num_layers)

    path = f'{model_dir}/seq2seq.pth'
    if not os.path.exists(path):
        checkpoints = []
        names = os.listdir(model_dir) if os.path.isdir(model_dir) else []
        for name in names:
            match = re.match(r'seq2seq-(-?\d+(?:\.\d+)?)\.pth$', name)
            if match:
                checkpoints.append((float(match.group(1)), name))
        if not checkpoints:
            raise FileNotFoundError(f'No seq2seq checkpoint found in {model_dir!r}')
        path = f'{model_dir}/{min(checkpoints)[1]}'

    # Load onto the CPU first so that a checkpoint saved on a GPU machine can
    # be restored on a CPU-only one; the model is moved to the GPU below.
    state = torch.load(path, map_location=lambda storage, loc: storage)
    model.load_state_dict(state)
    return model.cuda() if use_cuda else model

In [None]:
def train(model, train_dl, valid_dl, optimizer, scheduler, criterion, start=0, num_epochs=10, save_to='.'):
    """Train for `num_epochs` epochs and restore the best weights at the end.

    After every epoch the model is validated and the scheduler is stepped.
    Whenever the validation loss improves, a snapshot of the weights is kept
    and a checkpoint is written to `save_to` with `save_model`. When training
    ends, the best snapshot is loaded back into `model` and the
    (train, validation) loss pairs are handed to `show_plot`.

    Args:
        model, train_dl, valid_dl, optimizer, criterion: passed on to
            `train_step` / `validate_step`.
        scheduler: learning-rate scheduler, stepped once per epoch.
        start: number of the first epoch (for resumed training).
        num_epochs: number of epochs to run.
        save_to: directory that receives the checkpoints.
    """
    import copy

    best_val = float('inf')
    best_weights = None
    plot_losses = []
    last_epoch = start + num_epochs
    for epoch in range(start, last_epoch):
        # Teacher forcing is switched off; a decaying schedule such as
        # max(1.0 - 0.1 * epoch, 0) / 2.0 could be used here instead.
        tfr = 0.0
        trn_loss = train_step(model, train_dl, optimizer, criterion, tfr, epoch, last_epoch)
        val_loss = validate_step(model, valid_dl, criterion, epoch, last_epoch)
        plot_losses.append((trn_loss, val_loss))
        scheduler.step()

        if val_loss < best_val:
            best_val = val_loss
            # Deep copy: a shallow copy of the state dict would still point at
            # the live parameters and change with every later update.
            best_weights = copy.deepcopy(model.state_dict())
            save_model(save_to, model, val_loss)

    # No snapshot exists if the validation loss never improved (e.g. NaN).
    if best_weights is not None:
        model.load_state_dict(best_weights)
    show_plot(plot_losses)

## Run

In [None]:
# model hyper-parameters
# model hyper-parameters
enc_hidden_size, dec_hidden_size = 256, 256
enc_embed_size, dec_embed_size = 300, 300
enc_emb, dec_emb = src_embeddings, tgt_embeddings
# encoder and decoder use the same depth; `num_layers` is also read when the
# model directory is named and when the model is reloaded
num_layers = 4
enc_num_layers, dec_num_layers = num_layers, num_layers
ndir = 2
model_dir = f'models-{enc_hidden_size}-{dec_hidden_size}-{num_layers}'

# build the model
model = Seq2SeqAttn(len(src_vocab), len(tgt_vocab), enc_embed_size, dec_embed_size,
                    enc_hidden_size, dec_hidden_size, enc_num_layers, dec_num_layers, ndir=ndir,
                    tie_weights=True, attn_type='mlp', dropout_emb=0.2, dropout_rnn=.25,
                    dropout_ctx=0.2, dropout_out=0.2, enc_emb=enc_emb, dec_emb=dec_emb)

# loss function and optimizer
criterion = nn.NLLLoss()

if use_cuda:
    model.cuda()
    criterion.cuda()

optimizer = optim.Adam(model.parameters(), lr=0.002)
# multiply the learning rate by 0.9 every 2 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)

In [None]:
%%time
# train for 10 epochs, numbered from 0; model_dir is passed as save_to
train(model, train_dl, valid_dl, optimizer, scheduler, criterion, start=0, num_epochs=10, save_to=model_dir)

### Resume training

In [None]:
# rebuild the model and load its weights from model_dir
model = load_model(model_dir, len(src_vocab), 
                   enc_embed_size, dec_embed_size,
                   enc_hidden_size, dec_hidden_size, 
                   len(tgt_vocab), num_layers)

# continue with plain SGD; the learning rate is multiplied by 0.95 every 2 epochs
optimizer = optim.SGD(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.95)

In [None]:
# train for 10 more epochs (epoch numbering continues at 10) and pass the
# same checkpoint directory as the first training run
train(model, train_dl, valid_dl, optimizer, scheduler, criterion, start=10, num_epochs=10, save_to=model_dir)

In [None]:
# save_model(model_dir, model)

## Translation

In [None]:
def translate(model, src_var, lengths):
    """Translate a single source sentence with `model`.

    Args:
        model: callable returning ``(outputs, attentions)`` for
            ``(src_var, lengths)``.
        src_var: source token ids; dimension 1 (the batch) must have size 1.
        lengths: source lengths, passed on to the model.

    Returns:
        (translation, attentions): the predicted words joined by spaces, cut
        off at the first `<EOS>` (which is not included), and the attention
        weights returned by the model.
    """
    bs = src_var.size(1)
    assert bs == 1, 'Batch size in translate must be 1.'
    outputs, attentions = model(src_var, lengths)
    preds = torch.max(outputs.data.squeeze(1), dim=1)[1]

    translation = []
    # tolist() yields plain ints, which the vocabulary lookup and the EOS
    # comparison both rely on.
    for pred_id in preds.tolist():
        if pred_id == EOS:
            break
        translation.append(tgt_vocab.itow(pred_id))

    # Stopping before EOS is appended keeps the last real word even when the
    # model never produces EOS.
    return ' '.join(translation), attentions

### Random evaluation

In [None]:
def evaluate_randomly(model, dl, n=10):
    """Print up to `n` examples from `dl` with their reference and model translation.

    For every example the source sentence ('>'), the reference ('=') and the
    model output ('<') are printed. The encoder and the decoder are put into
    evaluation mode first. `dl` is expected to yield batches of size 1,
    because `translate` requires that.
    """
    for part in (model.encoder, model.decoder):
        part.eval()

    for batch_no, (src, tgt, lengths) in enumerate(dl):
        if batch_no >= n:
            break
        print('>', dl.dataset.decode(src.squeeze(1), src_vocab))
        print('=', dl.dataset.decode(tgt.squeeze(1), tgt_vocab))

        # translate src sentence; the attention weights are not needed here
        hypothesis, _ = translate(model, to_var(src, volatile=True), lengths)
        if tgt_vocab.lang_name == 'fa':
            # show unknown words with a Persian question mark
            hypothesis = hypothesis.replace('<UNK>', '<؟>')
        print('<', hypothesis)
        print('')

In [None]:
# evaluation uses batches of a single sentence, as translate() requires
val_dl = DataLoader(valid_ds, batch_size=1, shuffle=True, collate_fn=collate_fn)
evaluate_randomly(model, val_dl)

## Possible Improvements (Data, Model, Algorithm)

- Larger corpus
- Beam Search
- Pre-trained word vectors (`fasttext` or `GloVe`)
- Ensemble decoding
- Handling rare words