In [1]:
import os
import torch
import wandb
import numpy as np
from tqdm import tqdm
from secret import WANDB_API_KEY
from src.model import Transformer
from src.utils import subsequent_mask
from src.training import (
    Batch, NoamOptimizer,
    LabelSmoothing, train_step
)
from matplotlib import pyplot as plt
from src.utils import subsequent_mask
from src.blocks import PositionalEncoding
from src.training import NoamOptimizer, LabelSmoothing

In [2]:
# Expose the API key and the notebook name to wandb through environment
# variables, then start the run for this notebook.
os.environ.update({
    'WANDB_API_KEY': WANDB_API_KEY,
    'WANDB_NOTEBOOK_NAME': 'Noob_Test',
})
wandb.init(project="transformer-pytorch", name="noob-test")

W&B Run: https://app.wandb.ai/19soumik-rakshit96/transformer-pytorch/runs/19n8lu61

## Run Sanity Tests

In [3]:
# Sanity check: draw the first slice of subsequent_mask(20) as an image and
# log it to W&B.
plt.figure(figsize=(5, 5))
plt.imshow(subsequent_mask(20)[0])
plt.title('Test for Subsequent Mask')
# Log a rasterized wandb.Image, as the other sanity-check cells do, instead of
# handing wandb the pyplot module and relying on its implicit conversion.
wandb.log({'Test for Subsequent Mask': wandb.Image(plt)})
plt.close()

In [4]:
# Sanity check: feed an all-zero (1, 100, 20) input through PositionalEncoding
# and plot channels 4-7 of the result across the 100 positions.
plt.figure(figsize=(15, 5))
# NOTE(review): arguments are presumably (model dimension, dropout) - confirm
# against src.blocks.PositionalEncoding.
pe = PositionalEncoding(20, 0)
# Tensors carry autograd state themselves since PyTorch 0.4, so the deprecated
# torch.autograd.Variable wrapper is not needed.
y = pe.forward(torch.zeros(1, 100, 20))
# detach() is the supported replacement for .data before converting to numpy.
plt.plot(np.arange(100), y[0, :, 4:8].detach().numpy())
plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])
plt.title('Test for Positional Encoding')
wandb.log({'Test for Positional Encoding': wandb.Image(plt)})
plt.close()

In [5]:
# Sanity check: compare the rate reported by NoamOptimizer.rate() for three
# configurations over steps 1..19999. No wrapped optimizer is needed (None).
# NOTE(review): the first argument is the model dimension (see the real
# optimizer built later); the third is presumably the warm-up length - confirm.
noam_settings = [(512, 4000), (512, 8000), (256, 4000)]
optimizers = [
    NoamOptimizer(d_model, 1, warmup, None)
    for d_model, warmup in noam_settings
]

step_axis = np.arange(1, 20000)
rate_rows = []
# Iterate with plain Python ints so rate() receives the same argument type as
# before.
for step in range(1, 20000):
    rate_rows.append([opt.rate(step) for opt in optimizers])

plt.plot(step_axis, rate_rows)
plt.legend(["512:4000", "512:8000", "256:4000"])
plt.title('Test for Noam Learning Rate Policy')
wandb.log({'Test for Noam Learning Rate Policy': wandb.Image(plt)})
plt.close()

In [6]:
# Sanity check: run LabelSmoothing on three identical predictions with targets
# 2, 1 and 0, then display the target distribution it stores in `true_dist`.
criterion = LabelSmoothing(5, 0, 0.4)
predict = torch.tensor(
    [
        [0, 0.2, 0.7, 0.1, 0],
        [0, 0.2, 0.7, 0.1, 0],
        [0, 0.2, 0.7, 0.1, 0]
    ],
    dtype=torch.float32
)
# The return value is not used below; the call is made so that
# criterion.true_dist is available for plotting. Plain tensors replace the
# deprecated torch.autograd.Variable wrappers.
v = criterion(
    predict.log(),
    torch.tensor([2, 1, 0], dtype=torch.long)
)
plt.imshow(criterion.true_dist)
plt.title('Test for Label Smoothing (Target Distribution)')
# Log a rasterized wandb.Image, as the other sanity-check cells do.
wandb.log({'Test for Label Smoothing (Target Distribution)': wandb.Image(plt)})
plt.close()



In [7]:
# Criterion used by loss() below.
# NOTE(review): arguments are presumably (size, padding index, smoothing),
# matching the keyword call made later in this notebook - confirm the order.
criterion = LabelSmoothing(5, 0, 0.1)


def loss(x):
    """Return the criterion value for one sample whose target is class 1.

    The prediction puts ``x / (x + 3)`` on class 1, ``1 / (x + 3)`` on each of
    classes 2-4 and 0 on class 0, so a larger ``x`` means a more confident
    prediction of the target class.

    Args:
        x: Positive number controlling the mass placed on the target class.

    Returns:
        The scalar loss as a Python float.
    """
    d = x + 3 * 1
    prediction = torch.tensor(
        [[0, x / d, 1 / d, 1 / d, 1 / d]], dtype=torch.float32
    )
    # Plain tensors and .item() replace the deprecated Variable / .data access.
    target = torch.tensor([1], dtype=torch.long)
    return criterion(prediction.log(), target).item()

# Plot loss(x) for x = 1..99 and log the figure to W&B.
confidence_axis = np.arange(1, 100)
loss_values = [loss(x) for x in range(1, 100)]
plt.plot(confidence_axis, loss_values)
plt.title('Test for Label Smoothing (Regularization)')
wandb.log({'Test for Label Smoothing (Regularization)': wandb.Image(plt)})
plt.close()

[34m[1mwandb[0m: [32m[41mERROR[0m Error uploading "config.yaml": FileNotFoundError, [Errno 2] No such file or directory: '/tmp/tmpnr0jv7uqwandb/2bv8aijp-config.yaml'


## Noob Test

In [8]:
def data_gen(V, batch, nbatches, seq_len=10):
    """Yield ``nbatches`` random batches whose source and target are identical.

    Each batch is built from a ``(batch, seq_len)`` int64 tensor of values
    drawn from ``1 .. V - 1`` with the first column forced to 1.

    Args:
        V: Exclusive upper bound for the sampled token values.
        batch: Number of sequences per batch.
        nbatches: Number of batches to yield.
        seq_len: Length of every sequence (defaults to the original 10).

    Yields:
        ``Batch(src, tgt, 0)`` objects.
        NOTE(review): the trailing 0 is presumably the padding index -
        confirm against ``Batch``.
    """
    for _ in range(nbatches):
        # Request int64 explicitly: numpy's default integer width is
        # platform-dependent.
        data = torch.from_numpy(
            np.random.randint(
                1, V, size=(batch, seq_len), dtype=np.int64
            )
        )
        data[:, 0] = 1
        # detach() gives two separate tensor objects over the same storage,
        # which is what the deprecated Variable(data, requires_grad=False) did.
        src = data.detach()
        tgt = data.detach()
        yield Batch(src, tgt, 0)

In [9]:
class SimpleLossCompute:
    """Apply the generator to model output, score it, and optionally train.

    When an optimizer wrapper is supplied, each call back-propagates the
    normalized loss, steps the optimizer and clears the gradients. When ``opt``
    is ``None`` (evaluation), the loss is only computed: no backward pass is
    run, so evaluation batches cannot leave gradients on the parameters that
    would be folded into a later training step.
    """

    def __init__(self, generator, criterion, opt=None):
        """Store the collaborators.

        Args:
            generator: Callable applied to the model output before scoring.
            criterion: Callable ``(predictions, targets) -> loss tensor``.
            opt: Optional object exposing ``step()`` and an ``optimizer``
                attribute with ``zero_grad()``; ``None`` disables training.
        """
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        """Return the un-normalized loss for one batch.

        Args:
            x: Model output; flattened to ``(-1, x.size(-1))`` after the
                generator is applied.
            y: Targets; flattened to one dimension.
            norm: Divisor applied to the loss before back-propagation.

        Returns:
            The loss value multiplied back by ``norm``.
        """
        x = self.generator(x)
        loss = self.criterion(
            x.contiguous().view(-1, x.size(-1)),
            y.contiguous().view(-1)
        ) / norm
        if self.opt is not None:
            # Only back-propagate when there is an optimizer to consume (and
            # then clear) the gradients.
            loss.backward()
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.item() * norm

In [10]:
# Seed numpy (used by data_gen) and torch (model initialization) so that a
# fresh Restart & Run All reproduces the same run.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Exclusive upper bound of the token values produced by data_gen; also passed
# as both leading arguments of Transformer.
V = 11
model = Transformer(V, V, n=2)
criterion = LabelSmoothing(
    size=V, padding_index=0, smoothing=0.0
)
# Adam starts at lr=0; NoamOptimizer wraps it.
# NOTE(review): the wrapper presumably sets the rate on each step - confirm in
# src.training.
adam = torch.optim.Adam(
    model.parameters(), lr=0,
    betas=(0.9, 0.98), eps=1e-9
)
model_opt = NoamOptimizer(
    model.source_embedding[0].d_model, 1, 400, adam
)

wandb.watch(model)

  torch.nn.init.xavier_uniform(p)


[<wandb.wandb_torch.TorchGraph at 0x7f56f9ed5940>]

In [None]:
N_EPOCHS = 10

# The loss helpers only hold references to the generator, criterion and
# optimizer, so they can be built once and reused for every epoch.
train_loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)
eval_loss_compute = SimpleLossCompute(model.generator, criterion, None)

for epoch in range(N_EPOCHS):
    print('Epoch:', (epoch + 1))
    # Training pass: 20 batches of 30 sequences, with optimizer updates.
    model.train()
    train_step(
        data_gen(V, 30, 20), model,
        train_loss_compute, log_on_wandb=True
    )
    # Evaluation pass: 5 batches of 30 sequences, no optimizer.
    model.eval()
    train_step(
        data_gen(V, 30, 5), model,
        eval_loss_compute, log_on_wandb=True
    )
    # Clear any gradients the evaluation pass may have accumulated so they
    # cannot be added to the first training update of the next epoch.
    model_opt.optimizer.zero_grad()

0it [00:00, ?it/s]

Epoch: 1


20it [00:07,  2.70it/s]
5it [00:01,  4.12it/s]
0it [00:00, ?it/s]

Epoch: 2


20it [00:06,  3.12it/s]
5it [00:01,  4.58it/s]
0it [00:00, ?it/s]

Epoch: 3


20it [00:06,  3.05it/s]
5it [00:01,  3.98it/s]
0it [00:00, ?it/s]

Epoch: 4


20it [00:06,  3.13it/s]
5it [00:01,  2.93it/s]
0it [00:00, ?it/s]

Epoch: 5


20it [00:06,  2.90it/s]
5it [00:01,  3.14it/s]
0it [00:00, ?it/s]

Epoch: 6


20it [00:06,  2.93it/s]
5it [00:01,  3.84it/s]
0it [00:00, ?it/s]

Epoch: 7


20it [00:06,  3.02it/s]
5it [00:01,  4.18it/s]
0it [00:00, ?it/s]

Epoch: 8


20it [00:06,  2.96it/s]
5it [00:01,  2.84it/s]
0it [00:00, ?it/s]

Epoch: 9


11it [00:03,  3.07it/s]

In [None]:
def greedy_decode(model, source, source_mask, max_length, start_symbol):
    """Greedily decode one sequence from ``model``.

    Starting from ``start_symbol``, repeatedly decodes against the encoded
    source, takes the highest-scoring index from ``model.generator`` for the
    last position and appends it. Only the first row of the batch is used, so
    this handles a single source sequence.

    Args:
        model: Object providing ``encode``, ``decode`` and ``generator``.
        source: Source token tensor; its dtype/device is used for the output.
        source_mask: Mask passed through to ``encode`` and ``decode``.
        max_length: Total length of the returned sequence, including the
            start symbol.
        start_symbol: Token placed at position 0 of the output.

    Returns:
        A ``(1, max_length)`` tensor of token ids (``(1, 1)`` when
        ``max_length`` is 1 or less).
    """
    # Decoding needs no gradients; no_grad avoids building an autograd graph
    # for every decoding step.
    with torch.no_grad():
        memory = model.encode(source, source_mask)
        ys = torch.ones(1, 1).fill_(start_symbol).type_as(source)
        for _ in range(max_length - 1):
            out = model.decode(
                memory, source_mask, ys,
                subsequent_mask(ys.size(1)).type_as(source)
            )
            scores = model.generator(out[:, -1])
            _, best = torch.max(scores, dim=1)
            next_word = best[0].item()
            ys = torch.cat(
                [ys, torch.ones(1, 1).type_as(source).fill_(next_word)], dim=1
            )
    return ys

In [None]:
# Greedily decode the sequence 1..10 with the trained model and print the
# result. data_gen trains on identical source/target pairs, so the output is
# expected to resemble the input.
model.eval()
# Plain tensors replace the deprecated torch.autograd.Variable wrappers.
source = torch.tensor(
    [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], dtype=torch.long
)
# All-ones mask with one entry per source position.
source_mask = torch.ones(1, 1, source.size(-1))
print(
    greedy_decode(
        model, source, source_mask,
        max_length=10, start_symbol=1
    )
)