SEQUENCE-TO-SEQUENCE MODELING WITH NN.TRANSFORMER AND TORCHTEXT  
https://pytorch.org/tutorials/beginner/transformer_tutorial.html

Language-modeling task: assign a probability that a given word follows a sequence of words.

nn.Transformer module based on an attention mechanism to draw global dependencies between input and output.  
The module is now highly modularized into  
nn.TransformerEncoder, nn.TransformerEncoderLayer, nn.TransformerDecoder, nn.TransformerDecoderLayer

We can simply use nn.Transformer the complete product,  
or compose modularized components to customize the network

nn.TransformerEncoder consists of multiple layers of nn.TransformerEncoderLayer.  
https://pytorch.org/tutorials/_images/transformer_architecture.jpg  
Nx nn.TransformerEncoderLayer in the picture form a TransformerEncoder

nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)  
with d_model=512, nhead=8, (10, 32, 512) -> (10, 32, 512) in the example  
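d_model: the number of features of each input element (the last dimension of the input)  
nhead: the number of attention heads (the arrows into Multi-Head Attention in the picture)  
dim_feedforward: the hidden dimension of the feed-forward sublayer  
d_model determines the input and output shape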

nn.TransformerEncoder(encoder_layer, num_layers)  
with encoder_layer=the layer above, num_layers=6, (10, 32, 512) -> (10, 32, 512) in the example

In ordinary (encoder-decoder) attention, the decoder attends to the entire input sequence.  
In (masked) self-attention, each position of the encoder refers to the whole preceding sequence, not just the immediately preceding token.  

Along with the input sequence, a mask is required because the self-attention layers in nn.TransformerEncoder are only allowed to attend the earlier positions in the sequence.

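THINKING about the meaning of masked self-attention: each position attends to the entire sequence up to and including itself, not just the immediately preceding token.  
E.g. at position t=10 in a sequence of length 20, positions [0:11] (i.e. 0..10) may be attended to, not just position 9, while positions [11:20] must not be accessed.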

`tensor([[0., -inf, -inf],
         [0., 0., -inf],
         [0., 0., 0.]])`

In [None]:
NOTE that Batch dimension is 1

In [None]:
batchify(data, bsz) arranges the data into bsz parallel columns (the batch dimension is 1)
get_batch() further slices those columns into chunks of sequence length bptt
since the full sequence length of train_data (102499) is far too long to train on at once, shortening the sequence length is the remedy

Note that slicing does not break the sequence relations
we still have the correspondence between the target word and each input word

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [2]:
class TransformerModel(nn.Module):
    """Transformer-encoder language model.

    Pipeline: token embedding -> positional encoding -> stacked
    ``TransformerEncoderLayer``s -> linear projection onto the vocabulary.

    Args:
        ntoken: size of the vocabulary (rows of the embedding, width of the output).
        ninp: embedding dimension, used as ``d_model`` of the encoder layers.
        nhead: number of attention heads per encoder layer.
        nhid: dimension of the feed-forward sublayer in each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoding and encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.ninp = ninp

        self.encoder = nn.Embedding(ntoken, ninp)
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.decoder = nn.Linear(ninp, ntoken)

        # Previously init_weights() was defined but never invoked, so the
        # custom initialisation silently had no effect.
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return a ``(sz, sz)`` float mask for causal self-attention.

        Entries on and below the diagonal are ``0.0`` (attention allowed);
        entries above the diagonal are ``-inf`` (future positions blocked).
        """
        # triu(..., diagonal=1) keeps -inf strictly above the diagonal and
        # zeroes everything else.
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Uniformly initialise embedding/decoder weights and zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        """Compute next-token logits.

        Args:
            src: tensor of token indices fed to the embedding layer.
            src_mask: additive attention mask, e.g. from
                ``generate_square_subsequent_mask``.

        Returns:
            Tensor whose last dimension has size ``ntoken`` (one logit per
            vocabulary entry).
        """
        # Scale embeddings by sqrt(d_model) before adding positional encodings.
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

the length of div_term is d_model / 2  
every even column of pe is filled by torch.sin  
every odd column of pe is filled by torch.cos  

In [3]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence, then apply dropout.

    Args:
        d_model: feature dimension of the input (even or odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the angles to match; for even d_model this is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model): broadcasts over dim 1 of the input
        # Buffer: moves with the module across devices but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``; dim 0 of ``x`` indexes positions."""
        x = x + self.pe[:x.size(0), :]  # take the positional encoding up to len of x
        return self.dropout(x)

In [7]:
d_model = 100

In [9]:
max_len = 1000

In [10]:
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

In [11]:
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

In [13]:
position.shape

torch.Size([1000, 1])

In [14]:
div_term.shape

torch.Size([50])

In [15]:
(position* div_term).shape

torch.Size([1000, 50])

In [5]:
import io
import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

# Build the vocabulary from token frequencies over the WikiText2 training split.
tokenizer = get_tokenizer('basic_english')
train_iter = WikiText2(split='train')
# Count every token of every line in a single pass.
counter = Counter(token for line in train_iter for token in tokenizer(line))
vocab = Vocab(counter)

https://pytorch.org/text/stable/vocab.html  
vocab.stoi[token] == vocab[token]  
vocab[token] is the integer index of the token  
vocab.stoi is the entire token -> index dict  
len(vocab.stoi) is the size of the vocabulary

In [6]:
def data_process(raw_text_iter):
    """Tokenize each item, map tokens to vocabulary indices, and concatenate.

    Items that produce no tokens are dropped. Returns a single 1-D
    ``torch.long`` tensor holding the indices of all remaining items in order.
    """
    encoded = []
    for item in raw_text_iter:
        ids = [vocab[token] for token in tokenizer(item)]
        if ids:  # skip items that tokenize to nothing
            encoded.append(torch.tensor(ids, dtype=torch.long))
    return torch.cat(encoded)

# Load the three WikiText2 splits and turn each into one flat tensor of token ids.
train_iter, val_iter, test_iter = WikiText2()
train_data, val_data, test_data = map(data_process, (train_iter, val_iter, test_iter))

In [7]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  return torch._C._cuda_getDeviceCount() > 0


train_data is one long 1-D tensor of token ids: the concatenation of every line in train_iter,  
which is then split into columns by batchify

In [8]:
def batchify(data, bsz):
    """Reshape a flat sequence into ``bsz`` parallel columns on ``device``.

    Trailing elements that do not fill a complete row of ``bsz`` columns are
    dropped. The result has shape ``(len(data) // bsz, bsz)``: each column is
    a contiguous stretch of the original sequence, so the batch dimension is 1.
    """
    usable = (data.size(0) // bsz) * bsz
    trimmed = data.narrow(0, 0, usable)  # keep the first `usable` elements along dim 0
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

In [9]:
# Training uses 20 parallel columns; validation and test use 10.
batch_size, eval_batch_size = 20, 10
train_data = batchify(train_data, batch_size)
val_data, test_data = (
    batchify(split, eval_batch_size) for split in (val_data, test_data)
)

In [10]:
bptt = 35  # maximum sequence length of one chunk


def get_batch(source, i):
    """Slice one (input, target) pair starting at row ``i`` of ``source``.

    The input is ``source[i : i + seq_len]`` and the target is the same window
    shifted one row forward, flattened to 1-D. ``seq_len`` is ``bptt`` unless
    fewer rows remain, in which case the chunk is shorter.

    Example: for a ``[50, 20]`` source the chunk is ``[35, 20]`` where possible,
    then ``[34, 20]``, ``[33, 20]`` ... ``[1, 20]``, ``[0, 20]`` as ``i``
    approaches the end. If ``len(source) == 102499``, ``i = 102498`` returns
    an empty chunk.
    """
    remaining = len(source) - 1 - i  # rows left that still have a successor row
    seq_len = bptt if bptt < remaining else remaining
    stop = i + seq_len
    inputs = source[i:stop]
    labels = source[i + 1:stop + 1].reshape(-1)
    return inputs, labels

In [15]:
ntokens = len(vocab.stoi)  # the size of vocabulary
emsize = 200  # embedding dimension
nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

# Loss, optimizer and LR schedule. train(), evaluate() and the epoch loop read
# these module-level names, but no visible cell defined them, which would
# raise NameError on the first training step.
criterion = nn.CrossEntropyLoss()
lr = 5.0  # initial learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 after every epoch (scheduler.step() is
# called once per epoch in the training loop).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [None]:
import time
def train():
    """Run one training epoch over the module-level ``train_data``.

    Reads the module-level ``model``, ``optimizer``, ``criterion``,
    ``scheduler``, ``ntokens``, ``bptt``, ``device`` and ``epoch``. Every
    ``log_interval`` batches it prints the running mean loss, perplexity,
    learning rate and time per batch. Returns ``None``.
    """
    model.train()
    log_interval = 200  # loop-invariant: batches between progress reports
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):  # i = 0, 35, 70, ...
        data, targets = get_batch(train_data, i)  # shape of [35, 20], [700]
        optimizer.zero_grad()
        # The final chunk can be shorter than bptt; rebuild the mask so it
        # matches the actual sequence length.
        if data.size(0) != src_mask.size(0):
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        # output.view(-1, ntokens) -> (700, 28783), 700 guesses for the next tokens
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # Clip the global gradient norm to 0.5 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # get_last_lr() reports the rate actually in use; get_lr() is
            # meant for the scheduler's internal use only.
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0.
            start_time = time.time()

            
def evaluate(eval_model, data_source):
    """Return the mean per-row loss of ``eval_model`` over ``data_source``.

    Args:
        eval_model: model to evaluate; it is switched to evaluation mode.
        data_source: batchified tensor, consumed in chunks of up to ``bptt`` rows.

    Returns:
        Sum of (chunk length * chunk loss) divided by ``len(data_source) - 1``.
    """
    eval_model.eval()  # Turn on the evaluation mode
    total_loss = 0.
    # Build the mask from the model being evaluated rather than the global
    # `model`, so the function works for any model passed in.
    src_mask = eval_model.generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            # The final chunk can be shorter than bptt; resize the mask to match.
            if data.size(0) != src_mask.size(0):
                src_mask = eval_model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            # Weight each chunk's mean loss by its length before averaging.
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [57]:
len(train_data) // bptt

2928

In [None]:
import copy

best_val_loss = float("inf")
epochs = 3  # The number of epochs
best_model = None

# Train for `epochs` epochs, validating after each one and keeping a snapshot
# of the model with the lowest validation loss.
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot the weights: `best_model = model` would only alias the live
        # model, which keeps training and would end up as the last epoch's
        # weights rather than the best ones.
        best_model = copy.deepcopy(model)

    scheduler.step()  # advance the learning-rate schedule once per epoch

In [51]:
src_mask = model.generate_square_subsequent_mask(bptt)

In [52]:
output = model(data, src_mask)

In [53]:
output.shape

torch.Size([35, 20, 28783])

In [55]:
output.view(-1, ntokens).shape

torch.Size([700, 28783])

In [35]:
list(range(0, train_data.size(0) - 1, bptt))

[0,
 35,
 70,
 105,
 140,
 175,
 210,
 245,
 280,
 315,
 350,
 385,
 420,
 455,
 490,
 525,
 560,
 595,
 630,
 665,
 700,
 735,
 770,
 805,
 840,
 875,
 910,
 945,
 980,
 1015,
 1050,
 1085,
 1120,
 1155,
 1190,
 1225,
 1260,
 1295,
 1330,
 1365,
 1400,
 1435,
 1470,
 1505,
 1540,
 1575,
 1610,
 1645,
 1680,
 1715,
 1750,
 1785,
 1820,
 1855,
 1890,
 1925,
 1960,
 1995,
 2030,
 2065,
 2100,
 2135,
 2170,
 2205,
 2240,
 2275,
 2310,
 2345,
 2380,
 2415,
 2450,
 2485,
 2520,
 2555,
 2590,
 2625,
 2660,
 2695,
 2730,
 2765,
 2800,
 2835,
 2870,
 2905,
 2940,
 2975,
 3010,
 3045,
 3080,
 3115,
 3150,
 3185,
 3220,
 3255,
 3290,
 3325,
 3360,
 3395,
 3430,
 3465,
 3500,
 3535,
 3570,
 3605,
 3640,
 3675,
 3710,
 3745,
 3780,
 3815,
 3850,
 3885,
 3920,
 3955,
 3990,
 4025,
 4060,
 4095,
 4130,
 4165,
 4200,
 4235,
 4270,
 4305,
 4340,
 4375,
 4410,
 4445,
 4480,
 4515,
 4550,
 4585,
 4620,
 4655,
 4690,
 4725,
 4760,
 4795,
 4830,
 4865,
 4900,
 4935,
 4970,
 5005,
 5040,
 5075,
 5110,
 514

In [14]:
train_data.shape

torch.Size([102499, 20])

In [81]:
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fe6a2e89fa0>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             ',': 3,
             '.': 4,
             'of': 5,
             'and': 6,
             'in': 7,
             'to': 8,
             'a': 9,
             '=': 10,
             'was': 11,
             "'": 12,
             '@-@': 13,
             'on': 14,
             'as': 15,
             's': 16,
             'that': 17,
             'for': 18,
             'with': 19,
             'by': 20,
             ')': 21,
             '(': 22,
             '@': 23,
             'is': 24,
             'it': 25,
             'from': 26,
             'at': 27,
             'his': 28,
             'he': 29,
             'were': 30,
             'an': 31,
             'had': 32,
             'which': 33,
             'be': 34,
             'are': 35,
             'this': 36,
             'their': 37,
          

In [None]:
get_batch

-1

In [73]:
batchify(torch.tensor(torch.arange(100)), 5)

  batchify(torch.tensor(torch.arange(100)), 5)


tensor([[ 0, 20, 40, 60, 80],
        [ 1, 21, 41, 61, 81],
        [ 2, 22, 42, 62, 82],
        [ 3, 23, 43, 63, 83],
        [ 4, 24, 44, 64, 84],
        [ 5, 25, 45, 65, 85],
        [ 6, 26, 46, 66, 86],
        [ 7, 27, 47, 67, 87],
        [ 8, 28, 48, 68, 88],
        [ 9, 29, 49, 69, 89],
        [10, 30, 50, 70, 90],
        [11, 31, 51, 71, 91],
        [12, 32, 52, 72, 92],
        [13, 33, 53, 73, 93],
        [14, 34, 54, 74, 94],
        [15, 35, 55, 75, 95],
        [16, 36, 56, 76, 96],
        [17, 37, 57, 77, 97],
        [18, 38, 58, 78, 98],
        [19, 39, 59, 79, 99]])

In [71]:
t.contiguous

tensor(1)

In [61]:
train_data.shape

torch.Size([2049990])

In [62]:
vocab['=']

10

In [63]:
vocab.stoi['=']

10

In [55]:
list((filter(lambda t: t > 0, [-1, 0, 1,2,3])))

[1, 2, 3]

In [49]:
t = torch.tensor(1)

In [51]:
t.numel()

1

In [47]:
tuple(filter(lambda t: t.numel() > 0, data))

NameError: name 'data' is not defined

In [42]:
vocab['=']

2

In [43]:
vocab.freqs

Counter({'=': 4, 'gameplay': 1})

In [44]:
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fe6a2e89160>>,
            {'<unk>': 0, '<pad>': 1, '=': 2, 'gameplay': 3})

In [45]:
vocab.itos

['<unk>', '<pad>', '=', 'gameplay']

In [33]:
counter.update(tokenizer(next(train_iter)))

In [39]:
counter.most_common()

[('=', 4), ('gameplay', 1)]

In [7]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=4096)
src = torch.rand(10, 32, 512)
out = encoder_layer(src)

In [12]:
out.shape

torch.Size([10, 32, 512])

In [9]:
>>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
>>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
>>> src = torch.rand(10, 32, 512)
>>> out = transformer_encoder(src)

In [8]:
out.shape

torch.Size([10, 32, 512])

In [14]:
sz = 3

In [15]:
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)

tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])

In [17]:
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])

In [None]:
mask.masked_fill_