In [2]:
import torch
import torch.nn as nn

from torch.optim.lr_scheduler import LambdaLR

from transformer.training import Batch, run_epoch
from transformer.transformer import make_model, greedy_decode

In [3]:
# Global switch read by execute_example: when falsy, example functions are not run.
RUN_EXAMPLES = True

In [4]:
class DummyOptimizer(torch.optim.Optimizer):
    """No-op optimizer stand-in.

    Exposes ``param_groups``, ``step`` and ``zero_grad`` but never updates
    anything.
    """

    def __init__(self):
        # Optimizer.__init__ is deliberately not called (there are no
        # parameters); only one placeholder group with lr 0 is exposed.
        self.param_groups = [{"lr": 0}]

    def step(self):
        """Do nothing and return ``None``."""
        return None

    def zero_grad(self, set_to_none=False):
        """Do nothing and return ``None``; ``set_to_none`` is ignored."""
        return None


class DummyScheduler:
    """No-op scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """Do nothing and return ``None``."""
        return None
        
def execute_example(fn, args=[]):
    """Call ``fn(*args)`` only when run as ``__main__`` and ``RUN_EXAMPLES`` is truthy.

    The return value of ``fn`` is discarded. The default ``args`` list is
    never mutated here, so sharing it between calls is harmless.
    """
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return
    fn(*args)

In [5]:
# Generates random batches of data in which src and tgt
# are the same sequence of random integers

def data_gen(V, batch_size, nbatches, device):
    """Yield ``nbatches`` random batches for a src-tgt copy task.

    Every batch starts from one ``(batch_size, 10)`` tensor of integers drawn
    from ``[1, V)`` whose first column is overwritten with 1. ``src`` and
    ``tgt`` are two separate copies of that tensor moved to ``device``, and
    the third ``Batch`` argument is always 0.
    """
    seq_len = 10
    for _ in range(nbatches):
        tokens = torch.randint(1, V, size=(batch_size, seq_len))
        tokens[:, 0] = 1
        tokens.requires_grad_(False)
        # Two independent copies, so src and tgt never share storage.
        src, tgt = (tokens.clone().detach().to(device) for _copy in range(2))
        yield Batch(src, tgt, 0)

In [6]:
class SimpleLossCompute:
    """Run the generator and criterion on model output and return two loss values."""

    def __init__(self, generator, criterion):
        self.criterion = criterion
        self.generator = generator

    def __call__(self, x, y, norm):
        """Return ``(normalized_loss.data * norm, normalized_loss)``.

        ``x`` is passed through the generator and flattened to
        ``(-1, last_dim)``; ``y`` is flattened to 1-D. The criterion's result
        is divided by ``norm``.
        """
        scores = self.generator(x)
        flat_scores = scores.contiguous().view(-1, scores.size(-1))
        flat_targets = y.contiguous().view(-1)
        normalized = self.criterion(flat_scores, flat_targets) / norm
        # Why divide by norm and then multiply by it again: the two returned
        # values differ. The second is the normalized loss, still attached to
        # the autograd graph (intended for backprop). The first scales that
        # value back up by norm via `.data`, giving a graph-free total
        # (intended for reporting).
        return normalized.data * norm, normalized
    
class LabelSmoothing(nn.Module):
    """Label-smoothed KL-divergence loss.

    Builds a smoothed target distribution for every row of ``x`` and scores
    ``x`` against it with ``nn.KLDivLoss(reduction="sum")``. ``nn.KLDivLoss``
    expects its input to be log-probabilities.

    Args:
        size: number of classes; ``x`` must have exactly this many columns.
        padding_idx: class index that never receives probability mass; rows
            whose target equals it contribute nothing to the loss.
        smoothing: probability mass spread evenly over the ``size - 2``
            classes that are neither the target nor ``padding_idx``.
    """

    def __init__(self, size, padding_idx, smoothing=0.0):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size

    def forward(self, x, target):
        """Return the summed KL divergence between ``x`` and the smoothed targets.

        Args:
            x: 2-D tensor with ``self.size`` columns.
            target: 1-D integer tensor of class indices, one per row of ``x``.
        """
        assert x.size(1) == self.size
        # Start from the uniform smoothing mass; the tensor is created without
        # autograd history, so no detach is needed before using it as a target.
        true_dist = torch.full_like(
            x, self.smoothing / (self.size - 2), requires_grad=False
        )
        # Put the confidence mass on each row's target class.
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        # The padding class never receives probability mass.
        true_dist[:, self.padding_idx] = 0
        # Zero every row whose target is padding. A boolean row mask behaves
        # the same for zero, one, or many padding rows, unlike an index tensor
        # built with nonzero() + squeeze(), whose shape depends on the count.
        pad_rows = target == self.padding_idx
        true_dist.masked_fill_(pad_rows.unsqueeze(1), 0.0)
        return self.criterion(x, true_dist)

In [7]:
# `rate` is meant to be passed to LambdaLR as its `lr_lambda`:
# https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.LambdaLR.html

# It returns the learning-rate multiplier for a given step
def rate(step, model_size, factor, warmup):
    """
    Return ``factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)``.

    A step of 0 is replaced by 1 (needed for the LambdaLR function) so that
    zero is never raised to a negative power.
    """
    effective_step = 1 if step == 0 else step
    decay_term = effective_step ** (-0.5)
    warmup_term = effective_step * warmup ** (-1.5)
    return factor * (model_size ** (-0.5) * min(decay_term, warmup_term))

In [8]:
def example_simple_model(device="cuda"):
    """Train a model on the random copy task, then print one greedy decode.

    Runs 20 epochs of 20 batches (80 sequences each) on ``device`` with Adam
    and the ``rate`` schedule, then decodes the fixed sequence 0..9 and
    prints the result.
    """
    vocab_size = 11  # source and target vocab size
    batch_size = 80
    num_epochs = 20
    batches_per_epoch = 20

    # With smoothing=0.0 the criterion puts all target mass on the label
    # before applying KLDivLoss.
    criterion = LabelSmoothing(
        size=vocab_size, padding_idx=0, smoothing=0.0
    ).to(device)
    # initialize Transformer model
    model = make_model(vocab_size, vocab_size, N=2).to(device)

    def lr_factor(step):
        # Learning-rate multiplier for LambdaLR, driven by the model width.
        return rate(
            step, model_size=model.src_embed[0].d_model, factor=1.0, warmup=400
        )

    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(optimizer=optimizer, lr_lambda=lr_factor)

    for _ in range(num_epochs):
        model.train()
        run_epoch(
            data_gen(vocab_size, batch_size, batches_per_epoch, device),
            model,
            SimpleLossCompute(model.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train",
        )
        model.eval()

    model.eval()
    # NOTE(review): data_gen never emits token 0 and 0 is the criterion's
    # padding_idx, yet the sequence below starts with 0 and start_symbol is 0
    # -- confirm this is intended.
    src = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]).to(device)
    max_len = src.shape[1]
    src_mask = torch.ones(1, 1, max_len).to(device)
    # greedy_decode is one strategy for unrolling the transformer's predictions
    print(greedy_decode(model, src, src_mask, max_len=max_len, start_symbol=0))

In [9]:
# Train on the copy task: the model should learn to reproduce its input sequence
execute_example(example_simple_model)

# Measured throughput: about 1.35e+04 tokens/sec with CUDA on a 2080Ti vs about 700 on CPU,
# i.e. roughly 19 times faster on the GPU than on the CPU

Epoch Step:      1 | Accumulation Step:   2 | Loss:   3.02 | Tokens / Sec:   624.4 | Learning Rate: 5.5e-06
Epoch Step:      1 | Accumulation Step:   2 | Loss:   2.01 | Tokens / Sec: 16720.5 | Learning Rate: 6.1e-05
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.75 | Tokens / Sec: 15633.4 | Learning Rate: 1.2e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.43 | Tokens / Sec: 12822.0 | Learning Rate: 1.7e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   1.04 | Tokens / Sec: 13724.0 | Learning Rate: 2.3e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.59 | Tokens / Sec: 16807.5 | Learning Rate: 2.8e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.36 | Tokens / Sec: 16314.8 | Learning Rate: 3.4e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.21 | Tokens / Sec: 16987.2 | Learning Rate: 3.9e-04
Epoch Step:      1 | Accumulation Step:   2 | Loss:   0.20 | Tokens / Sec: 17218.1 | Learning Rate: 4.5e-04
Epoch Step:      1 | Accumul

NameError: name 'greedy_decode' is not defined