# Sequence-to-Sequence Model (Seq2Seq)

# Download the dataset

In [1]:
!gdown --id '1r4px0i-NcrnXy1-tkBsIwvYwbWnxAhcg' --output data.tar.gz
!tar -zxvf data.tar.gz
!mkdir ckpt
!ls

'gdown' 不是內部或外部命令、可執行的程式或批次檔。
tar: Error opening archive: Failed to open 'data.tar.gz'
'ls' 不是內部或外部命令、可執行的程式或批次檔。


# Import the libraries

In [None]:
%%capture
!pip3 install --user nltk

In [3]:
import json
import os
import random
import re

import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.nn import functional as F

## Define the Label Transform
- Pad every label to the same fixed length, because the sentences differ in length.
- This operation is required for training.

In [None]:
class LabelTransform(object):
    """Bring a 1-D label array to a fixed length by padding or truncating.

    size: the length of every label returned by the transform.
    pad: the value appended to labels that are shorter than `size`.
    """

    def __init__(self, size, pad):
        self.size = size
        self.pad = pad

    def __call__(self, label):
        """Return `label` as an array of exactly `size` elements.

        Shorter labels are right-padded with `pad`. Longer labels are cut to
        their first `size` elements (np.pad would otherwise raise ValueError on
        the negative pad width), so anything past `size` is dropped.
        """
        label = np.asarray(label)[:self.size]
        missing = self.size - label.shape[0]
        return np.pad(label, (0, missing), mode='constant', constant_values=self.pad)

In [None]:
class class1(object):
    """Toy class that shows when __init__ and __call__ are executed."""

    def __init__(self, size):
        # executed once, when the instance is created
        print(size)

    def __call__(self, label):
        # executed each time the instance is called like a function
        print('111', label, sep='\n')


class1(1)

In [None]:
# creating the instance runs __init__, which prints the size (1)
a = class1(1)
# calling the instance runs __call__, which prints '111' and then the argument (2)
a(2)

# Define the Dataset

In [None]:
class ENG2CNDataset(data.Dataset):
    """Paired English/Chinese sentences served as padded tensors of word ids."""

    def __init__(self, root, max_output_len, set_name):
        """
        root (string): the root directory of the dataset & dictionary.
        max_output_len (int): the length every encoded sentence is brought to.
        set_name (string): the split to read; the file '<root>/<set_name>.txt' is opened.
        """
        self.root = root
        self.word2int_cn, self.int2word_cn = self.get_dictionary('cn')
        self.word2int_en, self.int2word_en = self.get_dictionary('en')

        # load the dataset, one sentence pair per line
        # the encoding is explicit so that reading does not depend on the
        # platform's default code page
        with open(os.path.join(self.root, f'{set_name}.txt'), 'r', encoding='utf-8') as f:
            self.data = list(f)
        print(f'{set_name} dataset size: {len(self.data)}')  # print the size of dataset

        # get the dictionary size
        self.cn_vocab_size = len(self.word2int_cn)
        self.en_vocab_size = len(self.word2int_en)
        # LabelTransform instances are callable, so self.transform is used like a function
        # for more info, plz refer to: https://www.geeksforgeeks.org/__call__-in-python/
        self.transform = LabelTransform(max_output_len, self.word2int_en['<PAD>'])

    def get_dictionary(self, language):
        """
        Load the word2int and int2word dictionaries of one language.
        language (string): the language suffix of the dictionary files, e.g. 'en' or 'cn'.
        Returns the tuple (word2int, int2word) as parsed from the two JSON files.
        """
        with open(os.path.join(self.root, f'word2int_{language}.json'), 'r', encoding='utf-8') as f:
            word2int = json.load(f)
        with open(os.path.join(self.root, f'int2word_{language}.json'), 'r', encoding='utf-8') as f:
            int2word = json.load(f)
        return word2int, int2word

    def __len__(self):
        return len(self.data)

    def _encode(self, sentence, word2int, bos, eos, unk):
        """Turn a space-separated sentence into the id array [bos, ids..., eos]."""
        ids = [bos]
        # empty strings produced by repeated/trailing spaces are skipped;
        # words missing from the dictionary become `unk`
        ids.extend(word2int.get(word, unk) for word in sentence.split(' ') if word)
        ids.append(eos)
        # torch.LongTensor is 64-bit, so the dtype is fixed here instead of
        # relying on the platform's default integer width
        return np.asarray(ids, dtype=np.int64)

    def __getitem__(self, Index):
        """Return the (english, chinese) pair at `Index` as two LongTensors."""
        # separate the Chinese and English sentence
        # i.e separate the input and the label
        # e.g. 'he is a teacher . \t他 是 老師 。 \n'
        # --> ['he is a teacher . ', '他 是 老師 。 ', ''] # len = 3
        # --> ['he is a teacher . ', '他 是 老師 。 '] # len = 2
        sentences = [part for part in re.split('[\t\n]', self.data[Index]) if part]
        assert len(sentences) == 2

        # the special words are looked up in the English dictionary for both
        # languages, exactly like the padding value in __init__
        # NOTE(review): this assumes both dictionaries give the special words
        # the same ids -- confirm against the json files
        BOS = self.word2int_en['<BOS>']
        EOS = self.word2int_en['<EOS>']
        UNK = self.word2int_en['<UNK>']

        # both sequences start with <BOS> and end with <EOS>
        # e.g. 'he is a teacher . ' --> [1, 12, 11, 9, 215, 4, 2]
        en = self._encode(sentences[0], self.word2int_en, BOS, EOS, UNK)
        cn = self._encode(sentences[1], self.word2int_cn, BOS, EOS, UNK)

        # make padding to the sentence to desired length
        en, cn = self.transform(en), self.transform(cn)
        en, cn = torch.LongTensor(en), torch.LongTensor(cn)

        return en, cn

# Model Structure

## Encoder
- The Encoder is an RNN model.
- For each input token, the **Encoder** outputs **a vector** and **a hidden state**, and passes the hidden state on to the next input.
- In other words, the **Encoder** reads the input sequence step by step and summarises it in a single vector (the final hidden state).

In [None]:
class Encoder(nn.Module):
    """
    input: the sentence that has been represented by integer sequence. i.e. [1, 12, 11, 9, 215, 4, 2]
    output:
      - the upper most layer output of RNN --> for attention
      - the projected final hidden state of the top layer --> initial state for the decoder
      - the hidden state of each layer and direction
    """
    def __init__(self, en_vocab_size, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout):
        """
        en_vocab_size: the size of the dictionary. i.e. the # of the subwords
        emb_dim: the dimension of embedding that is transformed from each word
        enc_hid_dim: the dimension of encoder hidden layer and hidden state
        dec_hid_dim: the dimension of decoder hidden state
        n_layers: the # of layers in RNN
        dropout: the percentage of dropout rate
        """
        super().__init__()
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.n_layers = n_layers  # read by Seq2Seq and by forward() to un-flatten the hidden state
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # maps the concatenated forward/backward state to the decoder's hidden size
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    def forward(self, input):
        """
        input: word ids, shape [batch_size, sentence_length]
        Returns (outputs, s, hidden):
          outputs: [batch_size, sentence_length, enc_hid_dim * 2]
          s: [batch_size, dec_hid_dim]
          hidden: [n_layers * 2, batch_size, enc_hid_dim]
        """
        embedding = self.embedding(input)  # learnable, [batch_size, sentence_length, emb_dim]
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs is the most upper layer output
        batch_size = outputs.shape[0]
        # un-flatten the first axis: [layers, directions, batch_size, enc_hid_dim]
        s = hidden.view(self.n_layers, 2, batch_size, -1)
        # concatenate the 2 directions of the top layer: [batch_size, enc_hid_dim * 2]
        s = torch.cat((s[-1, -2, :, :], s[-1, -1, :, :]), dim=1)
        s = torch.tanh(self.fc(s))  # shape: [batch size, dec_hid_dim]

        return outputs, s, hidden

# Attention

In [9]:
class Attention(nn.Module):
    """Score every encoder position against the decoder state and normalise the scores."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        """
        enc_hid_dim: the dimension of encoder hidden layer and hidden state
        dec_hid_dim: the dimension of decoder hidden state
        """
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, s, enc_output):
        """
        s: the decoder state used as the query, shape [batch_size, dec_hid_dim]
        enc_output: the output of encoder, shape [batch_size, sentence_length, enc_hid_dim * 2]
        Returns the attention weights, shape [batch_size, sentence_length]; each row sums to 1.
        """
        sentence_len = enc_output.shape[1]
        # repeat the decoder state for every encoder position
        # s_new shape: [batch_size, sentence_length, dec_hid_dim]
        s_new = s.unsqueeze(1).expand(-1, sentence_len, -1)
        # energy shape: [batch_size, sentence_length, dec_hid_dim]
        # NOTE(review): additive attention usually applies tanh to this
        # projection before `v`; confirm whether leaving it linear is intended
        energy = self.attn(torch.cat((s_new, enc_output), dim=2))  # concatenate along the feature dimension

        # drop only the trailing size-1 axis; a bare squeeze() would also drop
        # the batch or length axis when that axis has size 1
        # attention shape: [batch_size, sentence_length]
        attention = self.v(energy).squeeze(2)

        return F.softmax(attention, dim=1)

In [7]:
# Demo: unsqueeze(1) inserts a new axis, repeat(1, 2, 1) copies along it
tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])
with_new_axis = tensor.unsqueeze(1)
repeated = with_new_axis.repeat(1, 2, 1)
for shown in (tensor, with_new_axis, repeated):
    print(shown)
    print(shown.shape)
    print()

tensor([[1, 2, 3],
        [4, 5, 6]])
torch.Size([2, 3])

tensor([[[1, 2, 3]],

        [[4, 5, 6]]])
torch.Size([2, 1, 3])

tensor([[[1, 2, 3],
         [1, 2, 3]],

        [[4, 5, 6],
         [4, 5, 6]]])
torch.Size([2, 2, 3])



# Decoder
- Introduction
    - Decoder is built by RNN model.
    - For the Decoder in the simplest Seq2Seq model, we only use the **LAST** hidden output of the Encoder. This hidden output is also called the "Context Vector".
    - The Context Vector can be seen as an encoding of everything the Encoder has read from the previous context.
    - The Context Vector is used as the initial state of the Decoder.
    - The Encoder outputs are used by the Attention mechanism.

- Inputs
    - The previous word, represented as an integer id.
- Outputs
    - hidden: the hidden state, updated from the previous input and the previous hidden state
    - output: the predicted score of every word in the vocabulary for the current position


In [None]:
class Decoder(nn.Module):
    """GRU decoder that predicts one target word per call, optionally with attention."""

    def __init__(self, cn_vocab_size, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout, isatt):
        """
        cn_vocab_size: the size of the target dictionary. i.e. the # of the subwords
        emb_dim: the dimension of embedding that is transformed from each word
        enc_hid_dim: the dimension of encoder hidden layer and hidden state
        dec_hid_dim: the dimension of decoder hidden state
        n_layers: the # of layers in RNN
        dropout: the percentage of dropout rate
        isatt: determine if using Attention or not
        """
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = dec_hid_dim
        self.n_layers = n_layers
        self.isatt = isatt
        # with Attention the weighted encoder output is concatenated to the embedding
        self.input_dim = (enc_hid_dim * 2 + emb_dim) if isatt else emb_dim
        self.attention = Attention(enc_hid_dim, dec_hid_dim)
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, self.n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
        self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
        self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, dec_input, s, enc_output):
        """
        dec_input: the ids of the previous words. [batch_size]
        s: the decoder state. Either [batch_size, dec_hid_dim], which is copied
           to every GRU layer, or [n_layers, batch_size, dec_hid_dim], i.e. the
           second value returned by a previous call.
        enc_output: the output from encoder. [batch_size, sentence_length, enc_hid_dim * 2]
        Returns (prediction, dec_hidden):
          prediction: [batch_size, cn_vocab_size]
          dec_hidden: [n_layers, batch_size, dec_hid_dim]
        """
        # the GRU needs one state per layer
        if s.dim() == 2:
            dec_hidden = s.unsqueeze(0).repeat(self.n_layers, 1, 1)
        else:
            dec_hidden = s.contiguous()

        # the direction in decoder will only be single direction --> directions = 1
        dec_input = dec_input.unsqueeze(1)
        # embedded shape: [batch_size, 1, emb_dim]
        embedded = self.dropout(self.embedding(dec_input))
        if self.isatt:
            # the top layer's state is the attention query
            alpha = self.attention(dec_hidden[-1], enc_output)  # [batch_size, sentence_length]
            alpha = alpha.unsqueeze(1)  # [batch_size, 1, sentence_length]
            c = torch.bmm(alpha, enc_output)  # weighted-sum, [batch_size, 1, enc_hid_dim * 2]
            rnn_input = torch.cat((c, embedded), dim=2)
        else:
            rnn_input = embedded

        dec_output, dec_hidden = self.rnn(rnn_input, dec_hidden)
        # dec_output shape: [batch_size, 1, hid_dim]
        # dec_hidden shape: [num_layers, batch_size, hid_dim]

        # drop only the length-1 step axis so a batch of one keeps its batch axis
        # dec_output shape: [batch_size, hid_dim]
        dec_output = dec_output.squeeze(1)
        # project the hidden vector to one score per target word
        dec_output = self.embedding2vocab1(dec_output)
        dec_output = self.embedding2vocab2(dec_output)
        prediction = self.embedding2vocab3(dec_output)  # [batch_size, vocab_size]
        return prediction, dec_hidden


# Seq2Seq
- Constructed by Encoder and Decoder.

In [4]:
class Seq2Seq(nn.Module):
    """
    encoder: the encoder model
    decoder: the decoder model
    device: CPU or GPU
    """
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # encoder and decoder must have the equal number of layers
        assert encoder.n_layers == decoder.n_layers

    def forward(self, input, target, teacher_forcing_ratio):
        """
        input: the word ids of the input sentences [batch_size, input len]
        target: the word ids of the labels (correct translation) [batch_size, target len]
        teacher_forcing_ratio: the probability of feeding the label, instead of
            the predicted word, to the decoder at each step
        Returns (outputs, preds):
          outputs: [batch_size, target len, vocab_size]; position 0 stays all zero
          preds: [batch_size, target len - 1], the predicted word id of each step
        """
        return self._decode(input, target, teacher_forcing_ratio)

    def inference(self, input, target):
        """
        When testing, not using the label as input for decoder.
        input: the word ids of the input sentences [batch_size, input len]
        target: the word ids of the labels [batch_size, target len]; only its
            first column (the first decoder input) and its length are used
        Returns (outputs, preds) with the same shapes as forward().
        """
        return self._decode(input, target, teacher_forcing_ratio=0.0)

    def _decode(self, input, target, teacher_forcing_ratio):
        """Encode `input`, then decode step by step; shared by forward() and inference()."""
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        # initiate a torch tensor for storing the answer
        outputs = torch.zeros(batch_size, target_len, vocab_size, device=self.device)
        # put the input to the encoder
        encoder_outputs, s, _ = self.encoder(input)
        # the first column of target (the <BOS> token) is the first decoder input
        dec_input = target[:, 0]
        preds = []

        for t in range(1, target_len):
            # the state returned by the decoder is fed back on the next step
            output, s = self.decoder(dec_input, s, encoder_outputs)
            outputs[:, t] = output

            # get the word with the maximum score
            top1 = output.argmax(1)
            preds.append(top1.unsqueeze(1))

            # random.random() is in [0, 1), so a strict '<' gives ratio 0 -> never
            # and ratio 1 -> always; no random number is drawn when the ratio is 0
            teacher_force = teacher_forcing_ratio > 0 and random.random() < teacher_forcing_ratio
            # train with label if teacher forcing else using the predicted word
            dec_input = target[:, t] if teacher_force else top1

        if preds:
            preds = torch.cat(preds, 1)
        else:
            # a target of length 1 has no step to predict
            preds = target.new_zeros((batch_size, 0))
        return outputs, preds

# utils
- Basic operations
    - Save the model
    - Load the model
    - Construct the model
    - Transform the digit sequence to word sentence
    - Calculate the BLEU score
    - Iterate through the DataLoader

## Save the model

In [None]:
def save_model(model, store_model_path, step):
    """Write the model's state_dict to '<store_model_path>/model_<step>.ckpt'."""
    checkpoint_file = f'{store_model_path}/model_{step}.ckpt'
    weights = model.state_dict()
    torch.save(weights, checkpoint_file)

## Load the model

In [None]:
def load_model(model, load_model_path, map_location='cpu'):
    """Load the weights stored in '<load_model_path>.ckpt' into `model`.

    model: the module whose parameters are overwritten in place.
    load_model_path: the checkpoint path without the '.ckpt' suffix.
    map_location: forwarded to torch.load. The default 'cpu' lets a checkpoint
        written on a GPU be read on a machine without one; load_state_dict
        then copies the values into the model's existing parameters.
    Returns the same `model` object.
    """
    print(f'Load the model from {load_model_path}')
    state_dict = torch.load(f'{load_model_path}.ckpt', map_location=map_location)
    model.load_state_dict(state_dict)
    return model

## Construct the model

In [None]:
def build_model(config, en_vocab_size, cn_vocab_size):
    """Create the Seq2Seq model and its Adam optimizer from `config`.

    config: object providing emb_dim, enc_hid_dim, dec_hid_dim, n_layers,
        dropout, attention, learning_rate, load_model and load_model_path.
    en_vocab_size: the size of the source (English) dictionary.
    cn_vocab_size: the size of the target (Chinese) dictionary.
    Returns (model, optimizer); the model is already moved to the chosen device.
    """
    # use the GPU when one is available, otherwise fall back to the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # build the model
    encoder = Encoder(en_vocab_size, config.emb_dim, config.enc_hid_dim, config.dec_hid_dim, config.n_layers, config.dropout)
    decoder = Decoder(cn_vocab_size, config.emb_dim, config.enc_hid_dim, config.dec_hid_dim, config.n_layers, config.dropout, config.attention)
    model = Seq2Seq(encoder, decoder, device)

    # define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    print(optimizer)
    if config.load_model:
        model = load_model(model, config.load_model_path)
    model = model.to(device)

    return model, optimizer

## Transform the digit sequence to word sentence

In [None]:
def tokens2sentence(outpus, int2word):
    """Convert sequences of word ids into lists of words.

    outpus: an iterable of id sequences (the misspelled parameter name is kept
        so that existing keyword calls keep working).
    int2word: dictionary mapping the id, as a string, to its word.
    Returns one list of words per sequence; each list stops before the first '<EOS>'.
    """
    sentences = []
    for tokens in outpus:
        sentence = []
        for token in tokens:
            # the keys of int2word are strings
            word = int2word[str(int(token))]
            if word == '<EOS>':
                break
            sentence.append(word)
        sentences.append(sentence)
    return sentences

## Calculate the BLEU score

In [None]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [None]:
def computebleu(sentences, targets):
    """Return the sum of the sentence-level BLEU scores of `sentences` against `targets`.

    sentences: the predicted sentences, each a list of tokens.
    targets: the reference sentences, each a list of tokens; must have the
        same length as `sentences`.
    """
    smoothing = SmoothingFunction()  # smooth the log function
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        """
        Split multi-byte tokens into single characters; keep the others whole.
        """
        pieces = []
        for token in sentence:
            # no need to cut if the token is digit or alphabet
            keep_whole = token == 'UNK' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1
            if keep_whole:
                pieces.append(token)
            else:
                pieces.extend(token)
        return pieces

    def score_pair(sentence, target):
        hypothesis = cut_token(sentence)
        reference = cut_token(target)
        return sentence_bleu([reference], hypothesis, weights=[0.25, 0.25, 0.25, 0.25], smoothing_function=smoothing.method1)

    return sum(score_pair(sentence, target) for sentence, target in zip(sentences, targets))

## Iterate through the DataLoader

In [None]:
def inifite_iter(data_loader):
    """Yield the items of `data_loader` forever, restarting it whenever it is exhausted.

    data_loader: any re-iterable object, e.g. a DataLoader.
    Raises ValueError if a full pass over `data_loader` produces nothing;
    restarting would otherwise loop forever without ever yielding.
    """
    while True:
        produced = False
        for batch in data_loader:
            produced = True
            yield batch
        if not produced:
            raise ValueError('data_loader yielded no batches; cannot iterate over it forever')

# Training Process

## Training

In [None]:
def train(model, optimizer, train_iter, loss_function, total_steps, summary_steps, teacher_forcing_ratio, log_interval=5):
    """Run `summary_steps` optimisation steps.

    model: called as model(sources, targets, teacher_forcing_ratio) and
        expected to return (outputs, preds).
    optimizer: the optimizer stepping the model's parameters.
    train_iter: iterator whose next() gives a (sources, targets) batch.
    loss_function: called with the flattened outputs and targets.
    total_steps: the number of steps done before this call; only used in the progress line.
    summary_steps: the number of steps to run in this call.
    teacher_forcing_ratio: forwarded to the model.
    log_interval: the number of steps averaged into each reported loss.
    Returns (model, optimizer, losses); `losses` holds one averaged loss per
    completed window of `log_interval` steps.
    """
    model.train()
    model.zero_grad()
    # the batches are moved to wherever the model's parameters live
    device = next(model.parameters()).device
    losses = []
    loss_sum = 0.0

    for step in range(summary_steps):
        sources, targets = next(train_iter)
        sources, targets = sources.to(device), targets.to(device)
        outputs, preds = model(sources, targets, teacher_forcing_ratio)
        # outputs shape: [batch_size, sentence_length, vocab_size]

        # ignore the 1st token of targets since it's <BOS>
        outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
        # outputs shape: [batch_size * (sentence_length - 1), vocab_size]
        targets = targets[:, 1:].reshape(-1)
        # targets shape: [batch_size * (sentence_length - 1)]
        loss = loss_function(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        # avoid gradient explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        loss_sum += loss.item()

        if (step + 1) % log_interval == 0:
            loss_sum = loss_sum / log_interval
            print ('\r', 'train [{}] loss: {:.3f}, Perplexity: {:.3f}      '.format(total_steps + step + 1, loss_sum, np.exp(loss_sum)), end=' ')
            losses.append(loss_sum)
            loss_sum = 0.0
    return model, optimizer, losses

## Testing

In [None]:
def test(model, dataloader, loss_function):
    """Evaluate `model` on every batch of `dataloader`.

    model: must provide inference(sources, targets) returning (outputs, preds).
    dataloader: yields (sources, targets) batches; its dataset must expose
        int2word_en and int2word_cn.
    loss_function: called with the flattened outputs and targets.
    Returns (mean loss per batch, mean BLEU score per sentence, result), where
    `result` is a list of (source, prediction, target) word lists.
    Raises ValueError if the dataloader yields no batch.
    """
    model.eval()  # set to evaluation mode
    # the batches are moved to wherever the model's parameters live
    device = next(model.parameters()).device
    loss_sum, bleu_score = 0.0, 0.0
    n = 0
    result = []

    # no gradient is needed for evaluation
    with torch.no_grad():
        for sources, targets in dataloader:
            sources, targets = sources.to(device), targets.to(device)
            batch_size = sources.size(0)
            outputs, preds = model.inference(sources, targets)

            # trim out the 1st position since it's <BOS>
            outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
            # outputs shape: [batch_size * (sentence_length - 1), vocab_size]
            targets = targets[:, 1:].reshape(-1)
            # targets shape: [batch_size * (sentence_length - 1)]

            loss = loss_function(outputs, targets)
            loss_sum += loss.item()

            # back to one row per sentence: [batch_size, sentence_length - 1]
            targets = targets.view(batch_size, -1)

            # translate the prediction
            preds = tokens2sentence(preds, dataloader.dataset.int2word_cn)

            # the sentence to be translated
            sources = tokens2sentence(sources, dataloader.dataset.int2word_en)

            # the label
            targets = tokens2sentence(targets, dataloader.dataset.int2word_cn)

            result.extend(zip(sources, preds, targets))

            # computebleu returns the sum over the batch
            bleu_score += computebleu(preds, targets)
            n += batch_size

    if n == 0:
        raise ValueError('dataloader produced no batches to evaluate')
    return loss_sum / len(dataloader), bleu_score / n, result

## Training pipeline

In [1]:
def train_process(config):
    """Train on the 'training' split, validating every `config.summary_steps` steps.

    config: object providing data_path, max_output_len, batch_size, num_steps,
        summary_steps, store_steps, store_model_path, teacher_forcing_ratio
        and the fields read by build_model.
    Returns (train_losses, val_losses, bleu_scores, result); `result` is the
    output of the last validation run, or an empty list if no step was run.
    """
    # prepare the training data
    train_dataset = ENG2CNDataset(config.data_path, config.max_output_len, 'training')
    train_loader = data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    train_iter = inifite_iter(train_loader)

    # prepare the validation data
    val_dataset = ENG2CNDataset(config.data_path, config.max_output_len, 'validation')
    val_loader = data.DataLoader(val_dataset, batch_size=2)

    # build the model
    model, optimizer = build_model(config, train_dataset.en_vocab_size, train_dataset.cn_vocab_size)
    # NOTE(review): 0 is presumably the id of <PAD> -- confirm against the dictionary
    loss_function = nn.CrossEntropyLoss(ignore_index=0)

    train_losses, val_losses, bleu_scores = [], [], []
    result = []  # stays empty when config.num_steps <= 0
    total_steps = 0

    while (total_steps < config.num_steps):
        # training
        model, optimizer, loss = train(model, optimizer, train_iter, loss_function, total_steps, config.summary_steps, teacher_forcing_ratio=config.teacher_forcing_ratio)
        train_losses += loss
        # validation
        val_loss, bleu_score, result = test(model, val_loader, loss_function)
        val_losses.append(val_loss)
        bleu_scores.append(bleu_score)

        total_steps += config.summary_steps

        # '\r' returns to the start of the line so the summary overwrites the training progress line
        print('\r', 'val[{}] loss: {:.3f}, Perplexity: {:.3f}, blue score: {:.3f}       '.format(total_steps, val_loss, np.exp(val_loss),bleu_score))

        # save the model and result
        if total_steps % config.store_steps == 0 or total_steps >= config.num_steps:
            save_model(model, config.store_model_path, total_steps)

    return train_losses, val_losses, bleu_scores, result

# Configuration

In [None]:
class Configurations:
    """Hyper-parameters and paths used to build, train and test the model."""

    def __init__(self) -> None:
        self.batch_size = 60
        self.emb_dim = 256
        self.dec_hid_dim = 512
        self.enc_hid_dim = 512
        self.n_layers = 3
        self.dropout = 0.5
        self.learning_rate = 0.0005
        self.max_output_len = 50
        # total training times
        self.num_steps = 12000
        # save the model after every 'store_steps'
        self.store_steps = 300
        # test for every 'summary_steps' to see if it's over-fitting
        self.summary_steps = 300
        # determine if load the model
        self.load_model = False
        # the path for storing the model
        self.store_model_path = './ckpt'
        # the path for loading the model e.g. "./ckpt/model_{step}"
        self.load_model_path = None
        # the path for getting the data
        self.data_path = './cmn-eng'
        # use Attention or not
        self.attention = True
        self.teacher_forcing_ratio = 0.5

# Start Training

In [None]:
if __name__ == '__main__':
    # train with the default configuration and keep the curves and the last
    # validation result as module-level names
    config = Configurations()
    print('config:\n', vars(config))
    (train_losses, val_losses,
     bleu_scores, result) = train_process(config)

# Start Testing

In [None]:
def test_process(config):
    """Evaluate the model described by `config` and return (test_loss, bleu_score).

    config: object providing data_path, max_output_len, batch_size and the
        fields read by build_model; set load_model / load_model_path to
        evaluate a stored checkpoint.
    """
    # NOTE(review): assumes a 'testing.txt' split exists in config.data_path
    # next to the training and validation files -- confirm
    test_dataset = ENG2CNDataset(config.data_path, config.max_output_len, 'testing')
    test_loader = data.DataLoader(test_dataset, batch_size=config.batch_size)

    # the optimizer returned by build_model is not needed for evaluation
    model, _ = build_model(config, test_dataset.en_vocab_size, test_dataset.cn_vocab_size)
    loss_function = nn.CrossEntropyLoss(ignore_index=0)

    test_loss, bleu_score, _ = test(model, test_loader, loss_function)
    return test_loss, bleu_score


# Before testing, go to config to set up the path for loading the model
if __name__ == '__main__':
    config = Configurations()
    print ('config:\n', vars(config))
    test_loss, bleu_score = test_process(config)
    print (f'test loss: {test_loss}, bleu_score: {bleu_score}')