In [117]:
import numpy as np
import torch 
import torch.nn as nn 
from transformers import Transformer
from transformer_train import Batch, NoamOptimizer

In [118]:
transformer = Transformer(input_vocab=10, output_vocab=10)

Try using the transformer with None mask

In [119]:
input = torch.randint(10, (60,5))
output = torch.randint(10, (60,10))
transformer(input, output, None, None)

tensor([[[-3.2956, -5.5360, -3.9026,  ..., -2.2101, -5.1254, -2.0607],
         [-3.1119, -4.2495, -3.3671,  ..., -1.4571, -4.2381, -1.4444],
         [-2.7520, -3.7801, -2.3611,  ..., -1.6817, -3.7349, -3.0017],
         ...,
         [-2.6767, -3.8117, -2.3582,  ..., -1.7804, -3.6750, -2.8720],
         [-3.1573, -5.9229, -3.5439,  ..., -2.5666, -5.2111, -1.9874],
         [-1.8012, -4.4187, -3.2388,  ..., -2.3578, -3.4859, -1.8860]],

        [[-2.7531, -5.1500, -5.0741,  ..., -0.4916, -4.0239, -3.6269],
         [-3.2632, -2.8007, -2.6924,  ..., -0.7778, -3.2491, -5.0723],
         [-3.8677, -3.5181, -4.3830,  ..., -0.3828, -3.6132, -3.7635],
         ...,
         [-2.7543, -5.2842, -5.1500,  ..., -0.4658, -4.0015, -3.6083],
         [-1.6570, -3.0861, -4.0031,  ..., -1.3562, -2.7431, -3.8556],
         [-3.4830, -3.4247, -4.2928,  ..., -0.4624, -3.7082, -3.6337]],

        [[-3.1270, -3.5810, -1.9032,  ..., -2.7086, -3.3358, -2.8965],
         [-3.3003, -4.0092, -2.2468,  ..., -2

Try using the transformer with real mask

In [120]:
input = torch.randint(10, (60,5))
output = torch.randint(10, (60,10))
input_mask = torch.randint(2, (60, 1, 5))
output_mask = torch.randint(2, (60, 10, 10))
transformer(input, output, input_mask, output_mask)

tensor([[[-3.0528, -5.5559, -2.0790,  ..., -1.0352, -3.4822, -2.6008],
         [-3.6055, -3.3336, -2.2586,  ..., -1.6918, -2.0672, -2.6547],
         [-4.8799, -5.0606, -2.4573,  ..., -1.5500, -2.7496, -1.9709],
         ...,
         [-3.9195, -5.5816, -3.4783,  ..., -2.5231, -3.1341, -2.3260],
         [-3.4349, -3.6073, -2.1571,  ..., -1.9795, -1.9738, -2.4010],
         [-2.6095, -4.9476, -2.3122,  ..., -1.3448, -2.9295, -2.5103]],

        [[-3.4173, -3.0684, -3.9996,  ..., -5.5370, -4.3073, -5.7417],
         [-2.2009, -1.6852, -1.7232,  ..., -3.5195, -3.5730, -3.3904],
         [-1.7419, -1.3112, -2.9169,  ..., -4.5100, -2.8470, -4.4936],
         ...,
         [-2.5086, -4.1357, -2.4378,  ..., -3.4512, -4.6265, -3.9577],
         [-2.5960, -1.7321, -2.2847,  ..., -3.4647, -3.4665, -3.7475],
         [-1.4079, -2.1173, -2.5516,  ..., -4.4490, -3.2443, -4.1437]],

        [[-3.4947, -2.8191, -3.0925,  ..., -0.6011, -2.7757, -4.1271],
         [-4.6926, -5.4174, -4.1692,  ..., -0

## Creating and training transformer for copy task

In [121]:
num_vocab = 11
seq_len = 10
model_dim = 512

In [122]:
def copy_data_generator(num_batches=30, batch_size=20):
    for _ in range(num_batches):
        data = np.random.randint(num_vocab, size=(batch_size, seq_len))
        src = torch.from_numpy(data)
        tgt = torch.from_numpy(data.copy())
        yield Batch(src, tgt, 0)

In [123]:
transformer = Transformer(input_vocab=num_vocab, output_vocab=num_vocab, model_dim=model_dim, num_coder=2)

In [124]:
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
noam_optimizer = NoamOptimizer(adam_optimizer, model_dim, 1, 400)

In [125]:
loss_fn = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)

for epoch in range(10):
    transformer.train()
    data_iter = copy_data_generator()

    losses = 0.
    cnt = 0

    for i, batch in enumerate(data_iter):
        out = transformer(batch.src, batch.trg, batch.src_mask, batch.trg_mask)

        out = out.reshape(-1, out.shape[-1])
        labels = batch.trg_y.reshape(-1)

        loss = loss_fn(out, labels)
        
        loss.backward()
        noam_optimizer.step()
        noam_optimizer.optimizer.zero_grad()

        losses += loss.item()
        cnt += 1

    print(epoch, losses / cnt)

0 2.302064045270284
1 1.8219502607981364
2 1.7427995483080545
3 1.2776670634746552
4 0.9659743865331014
5 0.8486938496430715
6 0.7245328962802887
7 0.7513565500577291
8 0.7873012860616048
9 0.8102462132771809


## Testing transformer for copy task

In [126]:
input = np.random.randint(num_vocab, size=(10, seq_len))

expected_output = torch.from_numpy(input.copy()) 
output_init = torch.from_numpy(input[:,0].reshape(-1,1))
input = torch.from_numpy(input)
input_mask = torch.ones(10, 1, seq_len)

In [127]:
output = transformer.greedy_decode(input, output_init, seq_len, input_mask)

In [128]:
expected_output

tensor([[ 3,  2,  9,  0,  4, 10,  5,  9,  5,  6],
        [ 4,  0,  1,  8,  2,  3, 10,  4, 10, 10],
        [ 9,  6,  7,  8,  9,  5, 10, 10,  3,  9],
        [ 1,  4,  8,  4,  9,  3,  9, 10,  5,  8],
        [ 5,  5,  6,  3,  8,  8,  3,  8,  9,  6],
        [ 2,  9,  8,  1,  9,  1,  4,  0, 10,  4],
        [ 1,  4,  5,  4,  8,  4,  0,  9,  6,  2],
        [ 1,  6,  9,  9,  3,  8,  6,  2,  7,  5],
        [ 8,  4,  0,  2, 10,  8,  2,  7,  8,  3],
        [ 3,  7,  4,  9,  7,  3,  0,  9,  8,  4]])

In [129]:
output

tensor([[ 3,  2,  9,  3,  4, 10,  5,  9,  5,  9],
        [ 4,  1,  1,  8,  3,  3, 10, 10,  4, 10],
        [ 9,  7,  7,  8,  9,  5, 10, 10, 10,  9],
        [ 1,  4,  8,  4,  9,  3,  9, 10,  9,  8],
        [ 5, 10,  6,  3,  8,  3,  3,  8,  9,  6],
        [ 2,  9,  8,  1,  1,  1,  4, 10, 10, 10],
        [ 1,  4,  5,  4,  8,  4,  3,  9,  6,  2],
        [ 1,  6,  9,  9,  3,  8,  6,  2,  7,  5],
        [ 8,  4,  3,  2, 10,  8,  2,  7,  8,  8],
        [ 3,  7,  4,  9,  7,  3,  4,  9,  8,  8]])

In [130]:
# get accuracy 
acc = (output == expected_output).sum() / (10 * seq_len)
acc.item()

0.8100000023841858

## Creating and training transformer for sequence reversal

In [131]:
num_vocab = 11
seq_len = 10
model_dim = 512

In [132]:
def reverse_data_generator(num_batches=30, batch_size=20):
    for _ in range(num_batches):
        data = np.random.randint(num_vocab, size=(batch_size, seq_len))
        src = torch.from_numpy(data)
        tgt = torch.from_numpy(data[:, ::-1].copy())
        yield Batch(src, tgt, 0)

In [133]:
transformer = Transformer(input_vocab=num_vocab, output_vocab=num_vocab, model_dim=model_dim, num_coder=2)

In [134]:
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
noam_optimizer = NoamOptimizer(adam_optimizer, model_dim, 1, 400)

In [135]:
loss_fn = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)

for epoch in range(10):
    transformer.train()
    data_iter = reverse_data_generator()

    losses = 0.
    cnt = 0

    for i, batch in enumerate(data_iter):
        out = transformer(batch.src, batch.trg, batch.src_mask, batch.trg_mask)

        out = out.reshape(-1, out.shape[-1])
        labels = batch.trg_y.reshape(-1)

        loss = loss_fn(out, labels)
        
        loss.backward()
        noam_optimizer.step()
        noam_optimizer.optimizer.zero_grad()

        losses += loss.item()
        cnt += 1

    print(epoch, losses / cnt)

0 2.27563191652298
1 1.8106411576271058
2 1.6703855593999226
3 1.313740042845408
4 0.948525055249532
5 0.8531302829583486
6 0.8086230039596558
7 0.6997813681761423
8 0.7196457803249359
9 0.7307352046171824


## Testing transformer for sequence reversal

In [136]:
input = np.random.randint(num_vocab, size=(10, seq_len))

expected_output = torch.from_numpy(input[:, ::-1].copy()) 
output_init = torch.from_numpy(input[:,-1].reshape(-1,1))
input = torch.from_numpy(input)
input_mask = torch.ones(10, 1, seq_len)

In [137]:
output = transformer.greedy_decode(input, output_init, seq_len, input_mask)

In [138]:
expected_output

tensor([[ 7,  7,  7,  5,  8,  6,  4,  2,  1,  5],
        [ 4,  3,  5,  7,  8,  1,  8,  8,  9, 10],
        [ 8,  9,  9,  9,  8,  1,  9,  2,  6,  0],
        [10,  2,  9,  2,  8,  5,  5,  3,  5,  0],
        [ 6,  4,  4,  6,  5,  9,  9,  8,  6,  3],
        [ 0,  7,  2,  7,  9,  9,  7,  0,  9,  0],
        [ 4,  6,  7,  7,  8,  4,  7,  1,  3,  5],
        [ 6,  2,  5, 10,  4,  9,  5, 10,  4,  0],
        [ 8,  9, 10,  7,  9,  7,  4,  0,  9, 10],
        [ 9,  2,  0,  0,  9,  7,  0,  3,  2,  6]])

In [139]:
output

tensor([[ 7,  7,  5,  7,  8,  6,  4,  2,  1,  5],
        [ 4,  3,  5,  7,  8,  1,  8,  9, 10, 10],
        [ 8,  9,  9,  9,  8,  1,  9,  2,  6,  1],
        [10,  2,  9,  2,  8,  5,  5,  3,  5,  9],
        [ 6,  4,  6,  5,  6,  9,  9,  8,  6,  3],
        [ 0,  9,  7,  2,  9,  9,  7,  9,  9,  9],
        [ 4,  6,  7,  7,  8,  4,  7,  1,  3,  5],
        [ 6,  2,  5, 10,  4,  9,  5, 10,  4,  1],
        [ 8,  9, 10,  7,  9,  7,  4,  9, 10,  9],
        [ 9,  2,  9,  9,  7,  9,  7,  3,  2,  6]])

In [140]:
# get accuracy 
acc = (output == expected_output).sum() / (10 * seq_len)
acc.item()

0.7699999809265137

## Creating and training transformer for sequence sorting

In [141]:
num_vocab = 11
seq_len = 5
model_dim = 512

In [142]:
def sort_data_generator(num_batches=30, batch_size=20):
    for _ in range(num_batches):
        data = np.random.randint(num_vocab, size=(batch_size, seq_len))
        src = torch.from_numpy(data)
        tgt = torch.from_numpy(np.sort(data))
        yield Batch(src, tgt, 0)

In [143]:
transformer = Transformer(input_vocab=num_vocab, output_vocab=num_vocab, model_dim=model_dim, num_coder=2)

In [144]:
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
noam_optimizer = NoamOptimizer(adam_optimizer, model_dim, 1, 400)

In [145]:
loss_fn = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)

for epoch in range(10):
    transformer.train()
    data_iter = sort_data_generator()

    losses = 0.
    cnt = 0

    for i, batch in enumerate(data_iter):
        out = transformer(batch.src, batch.trg, batch.src_mask, batch.trg_mask)

        out = out.reshape(-1, out.shape[-1])
        labels = batch.trg_y.reshape(-1)

        loss = loss_fn(out, labels)
        
        loss.backward()
        noam_optimizer.step()
        noam_optimizer.optimizer.zero_grad()

        losses += loss.item()
        cnt += 1

    print(epoch, losses / cnt)

0 1.6308037598927816
1 0.9159865935643514
2 0.8626769820849101
3 0.8391406059265136
4 0.8934044241905212
5 0.8539911985397339
6 0.9117301285266877
7 0.919150443871816
8 0.9773162444432576
9 0.9681901276111603


## Testing transformer for sequence sorting

In [146]:
input = np.random.randint(num_vocab, size=(10, seq_len))

expected_output = torch.from_numpy(np.sort(input))
output_init = torch.from_numpy(np.sort(input)[:,0].reshape(-1,1))
input = torch.from_numpy(input)
input_mask = torch.ones(10, 1, seq_len)

In [147]:
output = transformer.greedy_decode(input, output_init, seq_len, input_mask)

In [148]:
expected_output

tensor([[ 0,  2,  6,  6,  8],
        [ 3,  3,  6,  6,  7],
        [ 3,  4,  5,  7, 10],
        [ 3,  4,  4, 10, 10],
        [ 0,  0,  4,  6,  8],
        [ 0,  0,  5, 10, 10],
        [ 3,  4,  6,  9, 10],
        [ 1,  2,  3,  6,  8],
        [ 1,  6,  7,  9, 10],
        [ 0,  2,  2,  5,  5]])

In [149]:
output

tensor([[ 0,  2,  4,  6,  8],
        [ 3,  3,  3,  6,  7],
        [ 3,  4,  5,  7, 10],
        [ 3,  4,  4,  4, 10],
        [ 0,  1,  4,  6,  8],
        [ 0,  5,  5,  5, 10],
        [ 3,  4,  4,  4,  9],
        [ 1,  2,  3,  6,  8],
        [ 1,  7,  9, 10, 10],
        [ 0,  2,  5,  5,  5]])

In [150]:
# get accuracy 
acc = (output == expected_output).sum() / (10 * seq_len)
acc.item()

0.7400000095367432