In [1]:
%matplotlib inline


Sequence-to-Sequence Modeling with nn.Transformer and TorchText
===============================================================

This is a tutorial on how to train a sequence-to-sequence model
that uses the
`nn.Transformer <https://pytorch.org/docs/master/nn.html?highlight=nn%20transformer#torch.nn.Transformer>`__ module.

PyTorch 1.2 release includes a standard transformer module based on the
paper `Attention is All You
Need <https://arxiv.org/pdf/1706.03762.pdf>`__. The transformer model
has been proved to be superior in quality for many sequence-to-sequence
problems while being more parallelizable. The ``nn.Transformer`` module
relies entirely on an attention mechanism (another module recently
implemented as `nn.MultiheadAttention <https://pytorch.org/docs/master/nn.html?highlight=multiheadattention#torch.nn.MultiheadAttention>`__) to draw global dependencies
between input and output. The ``nn.Transformer`` module is now highly
modularized such that a single component (like `nn.TransformerEncoder <https://pytorch.org/docs/master/nn.html?highlight=nn%20transformerencoder#torch.nn.TransformerEncoder>`__
in this tutorial) can be easily adapted/composed.

![](../_static/img/transformer_architecture.jpg)





Define the model
----------------




In this tutorial, we train ``nn.TransformerEncoder`` model on a
language modeling task. The language modeling task is to assign a
probability for the likelihood of a given word (or a sequence of words)
to follow a sequence of words. A sequence of tokens are passed to the embedding
layer first, followed by a positional encoding layer to account for the order
of the word (see the next paragraph for more details). The
``nn.TransformerEncoder`` consists of multiple layers of
`nn.TransformerEncoderLayer <https://pytorch.org/docs/master/nn.html?highlight=transformerencoderlayer#torch.nn.TransformerEncoderLayer>`__. Along with the input sequence, a square
attention mask is required because the self-attention layers in
``nn.TransformerEncoder`` are only allowed to attend the earlier positions in
the sequence. For the language modeling task, any tokens on the future
positions should be masked. To have the actual words, the output
of ``nn.TransformerEncoder`` model is sent to the final Linear
layer, which is followed by a log-Softmax function.




``PositionalEncoding`` module injects some information about the
relative or absolute position of the tokens in the sequence. The
positional encodings have the same dimension as the embeddings so that
the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
different frequencies.




In [2]:
from collections import Counter
import math
import time

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import torchtext
from torchtext.data.utils import get_tokenizer


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, dimension: int, dropout: int=0.1, max_length: int=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        positional_values = self.__generate_position_values(dimension, max_length)
        self.register_buffer("positional_values", positional_values)
        
    @staticmethod
    def __generate_position_values(dimension: int, max_length: int):
        values = torch.zeros(max_length, dimension)
        positions = torch.arange(0, max_length, dtype=torch.float)
        positions = positions.unsqueeze(1)
        
        scaling_steps = torch.arange(0, dimension, 2).float()
        scaling = torch.exp(scaling_steps * (-math.log(10000.0)/dimension))
        
        values[:, 0::2] = torch.sin(positions * scaling)
        values[:, 1::2] = torch.cos(positions * scaling)
        values = values.unsqueeze(0).transpose(0, 1)
        
        return values
    
    def forward(self, x):
        x = x + self.positional_values[:x.size(0), :]
        return self.dropout(x)

In [4]:
class TransformerScriptGenerator(nn.Module):
    def __init__(self,
                 vocabulary_size: int,
                 embedding_dim: int,
                 attention_head_count: int,
                 encoder_fc_dim: int,
                 encoder_layer_count: int,
                 dropout: float=0.5) -> None:
        
        super().__init__()
        
        self.input_mask = None
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
        self.embedding_scale = math.sqrt(embedding_dim)
        self.positional_encoder = PositionalEncoding(embedding_dim, dropout)
        encoder_layers = nn.TransformerEncoderLayer(embedding_dim, attention_head_count, encoder_fc_dim, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, encoder_layer_count)
        self.decoder = nn.Linear(embedding_dim, vocabulary_size)
        
        self.__init_weights()
        
    def __init_weights(self) -> None:
        value_range = 0.1
        self.embedding.weight.data.uniform_(-value_range, value_range)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-value_range, value_range)
        
        
    @staticmethod
    def __generate_input_mask(mask_size: int) -> torch.Tensor():
        mask = torch.ones(mask_size, mask_size, dtype=bool)
        mask = torch.triu(mask).t().float()
        mask = mask.masked_fill(mask == 0, float('-inf'))
        mask = mask.masked_fill(mask == 1, 0.0)
        return mask
    
    @staticmethod
    def __get_output_for_last_word(full_output: torch.Tensor) -> torch.Tensor:
        return full_output[:,-1,:]
    
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        mask_size = input.shape[0]
        if self.input_mask is None or self.input_mask.size(0) != mask_size:
            self.input_mask = self.__generate_input_mask(mask_size).to(device)
            
        input = self.embedding(input) * self.embedding_scale
        input = self.positional_encoder(input)
        output = self.transformer_encoder(input, self.input_mask)
        #output = self.__get_output_for_last_word(output)
        output = self.decoder(output)
        return output

Load and batch data
-------------------




The training process uses Wikitext-2 dataset from ``torchtext``. The
vocab object is built based on the train dataset and is used to numericalize
tokens into tensors. Starting from sequential data, the ``batchify()``
function arranges the dataset into columns, trimming off any tokens remaining
after the data has been divided into batches of size ``batch_size``.
For instance, with the alphabet as the sequence (total length of 26)
and a batch size of 4, we would divide the alphabet into 4 sequences of
length 6:

\begin{align}\begin{bmatrix}
  \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
  \end{bmatrix}
  \Rightarrow
  \begin{bmatrix}
  \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
  \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
  \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
  \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
  \end{bmatrix}\end{align}

These columns are treated as independent by the model, which means that
the dependence of ``G`` and ``F`` can not be learned, but allows more
efficient batch processing.




Functions to generate input and target sequence
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




``get_batch()`` function generates the input and target sequence for
the transformer model. It subdivides the source data into chunks of
length ``bptt``. For the language modeling task, the model needs the
following words as ``Target``. For example, with a ``bptt`` value of 2,
we’d get the following two Variables for ``i`` = 0:

![](../_static/img/transformer_input_target.png)


It should be noted that the chunks are along dimension 0, consistent
with the ``S`` dimension in the Transformer model. The batch dimension
``N`` is along dimension 1.




In [5]:
def text_to_tensor(text_field: torchtext.data.Field, input_text) -> torch.Tensor:
    return text_field.numericalize([input_text.examples[0].text])


def load_and_split_data(): 
    '''
    Returns: training_text, validation_text, test_text, vocabulary_object
    '''
    text_field = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                                      init_token='<sos>',
                                      eos_token='<eos>',
                                      lower=True)
    train, validation, test = torchtext.datasets.WikiText2.splits(text_field)
    text_field.build_vocab(train)
    train = text_to_tensor(text_field, train)
    validation = text_to_tensor(text_field, validation)
    test = text_to_tensor(text_field, test)
    return train, validation, test, text_field.vocab
                           

def divide_into_parallel_data_streams(data: torch.Tensor, stream_count: int) -> torch.Tensor:
    stream_length = data.size(0) // stream_count
    data = data.narrow(0, 0, stream_length * stream_count)
    data = data.view(stream_count, -1).t().contiguous()
    return data.to(device)


def batch_loader(source, max_sequence_length: int) -> (torch.Tensor, torch.Tensor):
    total_row_count = source.size(0)
    # -1 to account for the target sequence shift
    full_batch_count = (total_row_count - 1) // max_sequence_length
    for batch_index in range(full_batch_count):
        first_row_index = batch_index * max_sequence_length
        last_row_index = first_row_index + max_sequence_length
        inputs = source[first_row_index: last_row_index]
        targets = source[first_row_index+1: last_row_index+1].view(-1)
        yield inputs, targets
        
    first_row_index = full_batch_count * max_sequence_length
    inputs = source[first_row_index:-1]
    targets = source[first_row_index+1:].view(-1)
    yield inputs, targets

In [6]:
train_data, val_data, test_data, vocab = load_and_split_data()

batch_size = 20
eval_batch_size = 10

train_data = divide_into_parallel_data_streams(train_data, batch_size)
val_data = divide_into_parallel_data_streams(val_data, eval_batch_size)
test_data = divide_into_parallel_data_streams(test_data, eval_batch_size)

Initiate an instance
--------------------




The model is set up with the hyperparameter below. The vocab size is
equal to the length of the vocab object.




In [7]:
ntokens = len(vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 3 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerScriptGenerator(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 3 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerScriptGenerator(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

| end of epoch  10 | time: 211.53s | valid loss  5.36 | valid ppl   212.82

Run the model
-------------




`CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
is applied to track the loss and
`SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
implements stochastic gradient descent method as the optimizer. The initial
learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
applied to adjust the learn rate through epochs. During the
training, we use
`nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
function to scale all the gradient together to prevent exploding.




In [12]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

input_sequence_length = 35

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, input_sequence_length):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)


def train2():
    
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(vocab.stoi)
    
    for batch_index, (data, targets)  in enumerate(batch_loader(train_data, input_sequence_length)):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch_index % log_interval == 0 and batch_index > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch_index, len(train_data) // input_sequence_length, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

Loop over epochs. Save the model if the validation loss is the best
we've seen so far. Adjust the learning rate after each epoch.



In [13]:
best_val_loss = float("inf")
epochs = 1 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train2()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

| epoch   1 |   200/ 2981 batches | lr 5.00 | ms/batch 60.45 | loss  5.84 | ppl   343.92
| epoch   1 |   400/ 2981 batches | lr 5.00 | ms/batch 61.64 | loss  5.83 | ppl   339.91
| epoch   1 |   600/ 2981 batches | lr 5.00 | ms/batch 60.38 | loss  5.67 | ppl   289.49
| epoch   1 |   800/ 2981 batches | lr 5.00 | ms/batch 60.73 | loss  5.68 | ppl   294.18
| epoch   1 |  1000/ 2981 batches | lr 5.00 | ms/batch 60.36 | loss  5.62 | ppl   276.54
| epoch   1 |  1200/ 2981 batches | lr 5.00 | ms/batch 60.09 | loss  5.64 | ppl   282.77
| epoch   1 |  1400/ 2981 batches | lr 5.00 | ms/batch 60.50 | loss  5.65 | ppl   284.96
| epoch   1 |  1600/ 2981 batches | lr 5.00 | ms/batch 60.08 | loss  5.69 | ppl   294.70
| epoch   1 |  1800/ 2981 batches | lr 5.00 | ms/batch 60.01 | loss  5.62 | ppl   274.56
| epoch   1 |  2000/ 2981 batches | lr 5.00 | ms/batch 59.86 | loss  5.64 | ppl   281.20
| epoch   1 |  2200/ 2981 batches | lr 5.00 | ms/batch 59.77 | loss  5.53 | ppl   252.67
| epoch   1 |  2400/ 

NameError: name 'get_batch' is not defined

Evaluate the model with the test dataset
-------------------------------------

Apply the best model to check the result with the test dataset.



In [None]:
test_loss = evaluate(best_model, test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

In [None]:
best_model.eval();
vocab_size = len(TEXT.vocab.stoi)
print(vocab_size)
data, targets = get_batch(train_data, 0)
print(f"BPTT = {bptt}")
print(f"Input shape = {data.shape}")
print(f"Target shape = {targets.shape}")
output = best_model(data)
print(f"Output shape = {output.shape}")
output_flat = output.view(-1, ntokens)
print(f"Output flat shape = {output_flat.shape}")
output_flat[0]

In [None]:
def tensor_to_list(tensor: torch.Tensor) -> list:
    return tensor.cpu().detach().numpy().tolist()


def scores_to_top_tokens(scores: torch.Tensor) -> list:
    _, top_index_tensor = torch.topk(scores, k=1)
    top_index_tensor.squeeze_().t_()
    top_index_list = tensor_to_list(top_index_tensor)
    return top_index_list


def tokens_to_words(batches: list) -> list:
    decoded_batches = []
    for batch in batches:
        decoded_batch = [TEXT.vocab.itos[token] for token in batch]
        decoded_batches.append(decoded_batch)
    return decoded_batches


def decode_transformer_output(scores: torch.Tensor) -> list:
    top_tokens = scores_to_top_tokens(scores)
    predicted_words = tokens_to_words(top_tokens)
    return predicted_words


def decode_targets(targets: torch.Tensor, batch_size: int) -> list:
    unsquashed_targets = targets.view(-1, batch_size).t()
    target_tokens = tensor_to_list(unsquashed_targets)
    return tokens_to_words(target_tokens)


def decode_inputs(inputs: torch.Tensor) -> list:
    input_tokens = tensor_to_list(inputs.t())
    return tokens_to_words(input_tokens)


def join_sequences(decoded_sequences: list) -> list:
    return [" ".join(sequence) for sequence in decoded_sequences]


def present_result(model, inputs: torch.Tensor, targets: torch.Tensor, index: int) -> None:
    model.eval()
    output = model(inputs)
    input_text = join_sequences(decode_inputs(inputs))[index]
    target_text = join_sequences(decode_targets(targets, 20))[index]
    output_text = join_sequences(decode_transformer_output(output))[index]
    print( "INPUT\n-----\n" 
          f"{input_text}\n\n"
           "TARGET\n------\n"
          f"{target_text}\n\n"
           "OUTPUT\n------\n"
          f"{output_text}\n")

In [None]:
present_result(best_model, data, targets, 12)

In [None]:
def predict_next_word(text: str) -> str:
    encoded_text = TEXT.numericalize([text.split()]).to(device)
    custom_out = best_model(encoded_text)
    custom_out.shape
    _, custom_top_tokens = torch.topk(custom_out, k=1)
    out_tokens = tensor_to_list(custom_top_tokens.squeeze())
    return [TEXT.vocab.itos[token] for token in out_tokens][-1]

In [None]:
sentence = "that the primary schools , and the kakapo "
for i in range(30):
    input = " ".join(sentence.split()[-20:])
    print(input)
    sentence += predict_next_word(input) + " "
print(sentence)

In [None]:
def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, input_sequence_length):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)