Having gone through all the key components, we can now construct a GPT-style autoregressive Transformer model by simply stacking blocks.

In [None]:
  class Block(nn.Module):
    """A single post-LayerNorm Transformer block with causal self-attention.

    The block applies single-head causal self-attention followed by a small
    two-layer MLP. Each sub-layer is wrapped in a residual connection and
    followed by a LayerNorm:

        h   = ln1(x + self_attention(x))
        out = ln2(h + mlp(h))

    Args:
        emb_dim: size of the last (feature) dimension of the input.

    Note:
        The attention scores are left unscaled, as in the original example.
        Standard implementations divide them by sqrt(emb_dim) before the
        softmax to keep the logits in a well-behaved range.
    """

    def __init__(self, emb_dim):
        # nn.Module must be initialised before any submodule is assigned.
        super().__init__()
        self.emb_dim = emb_dim
        # The projections live here (not inside self_attention) so that they
        # are registered as parameters and reused/trained across calls.
        self.key = nn.Linear(emb_dim, emb_dim, bias=False)
        self.query = nn.Linear(emb_dim, emb_dim, bias=False)
        self.value = nn.Linear(emb_dim, emb_dim, bias=False)
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ln2 = nn.LayerNorm(emb_dim)
        # Can be any MLP; this is one simple example.
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, 2 * emb_dim),
            nn.ReLU(),
            nn.Linear(2 * emb_dim, emb_dim),
        )

    def self_attention(self, x, emb_dim=None):
        """Apply causal (masked) single-head self-attention to ``x``.

        Args:
            x: tensor of shape ``(..., seq_len, emb_dim)``; any leading
                dimensions (e.g. a batch dimension) are broadcast over.
            emb_dim: kept for backward compatibility. If given, it must equal
                the ``emb_dim`` the block was constructed with.

        Returns:
            Tensor with the same shape as ``x``. Position ``i`` of the output
            only depends on positions ``0..i`` of the input.

        Raises:
            ValueError: if ``emb_dim`` disagrees with ``self.emb_dim``.
        """
        if emb_dim is not None and emb_dim != self.emb_dim:
            raise ValueError(
                f"emb_dim={emb_dim} does not match the block's emb_dim={self.emb_dim}"
            )

        K, Q, V = self.key(x), self.query(x), self.value(x)
        scores = Q @ K.transpose(-1, -2)

        # == masking begins ==
        # Derive the sequence length from the input instead of a global, and
        # build a boolean mask that is True strictly above the diagonal, i.e.
        # wherever a query would look at a *future* key.
        seq_len = x.shape[-2]
        future = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        # masked_fill broadcasts the (seq_len, seq_len) mask over any leading
        # batch dimensions; -inf becomes a weight of exactly 0 after softmax.
        scores = scores.masked_fill(future, float("-inf"))
        # == masking ends ==

        weights = F.softmax(scores, dim=-1)
        return weights @ V

    def forward(self, x):
        """Run the block on ``x`` of shape ``(..., seq_len, emb_dim)``.

        Returns a tensor of the same shape.
        """
        # Attention sub-layer: residual connection, then LayerNorm.
        x = self.ln1(x + self.self_attention(x))
        # MLP sub-layer: residual connection, then LayerNorm.
        x = self.ln2(x + self.mlp(x))
        return x


class Transformer(nn.Module):
    """GPT-style autoregressive Transformer built by stacking ``Block`` modules.

    Token ids are embedded, summed with sinusoidal positional encodings,
    passed through ``num_blocks`` blocks, and projected to a distribution
    over the vocabulary at every position.

    Args:
        emb_dim: embedding (model) dimension.
        vocab_size: number of distinct token ids; also the output size.
        num_blocks: how many ``Block`` modules to stack.
        num_classes: unused; accepted only for backward compatibility.
    """

    def __init__(self, emb_dim, vocab_size, num_blocks, num_classes=None):
        # nn.Module must be initialised before any submodule is assigned.
        super().__init__()
        # forward() needs the embedding size to build the positional encoding.
        self.emb_dim = emb_dim
        self.blocks = nn.Sequential(*[Block(emb_dim) for _ in range(num_blocks)])
        self.word_embedding = nn.Embedding(vocab_size, emb_dim)
        self.head = nn.Linear(emb_dim, vocab_size)

    def positional_encoding(self, max_len, d_model):
        """Compute sinusoidal positional embedding vectors deterministically.

        Even feature indices hold ``sin(pos * w_k)`` and odd feature indices
        hold ``cos(pos * w_k)``, with ``w_k = 10000 ** (-2k / d_model)``.

        Args:
            max_len: number of positions, i.e. the input sequence length.
            d_model: embedding dimension. May be odd; the final sine column
                then simply has no matching cosine column.

        Returns:
            Float tensor of shape ``(max_len, d_model)``.
        """
        import math  # local import: the module is not imported at file level

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        return pe

    def forward(self, x):
        """Map token ids to next-token probabilities.

        Args:
            x: integer tensor of token ids whose last dimension is the
                sequence; typically ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(..., seq_len, vocab_size)`` holding
            probabilities (softmax over the last dimension), not logits.
        """
        seq_len = x.shape[-1]
        tokens = self.word_embedding(x)
        # (seq_len, emb_dim) broadcasts over any leading batch dimensions.
        pe = self.positional_encoding(seq_len, self.emb_dim).to(tokens.device)
        x = tokens + pe
        x = self.blocks(x)
        x = self.head(x)
        x = F.softmax(x, dim=-1)
        return x

In [1]:
import torch
import torch.nn as nn

emb_dim = 4  # width of each token embedding vector
seq_len = 5  # number of tokens in the toy sequence

# Embedding table: 20 vocabulary entries, each mapped to an emb_dim-sized vector.
embedding = nn.Embedding(num_embeddings=20, embedding_dim=emb_dim)

# Look up the vectors for the input token ids 0..4.
x = embedding(torch.tensor([0, 1, 2, 3, 4]))

In [2]:
# Display the embedded tokens: one row per input token (5), one column per embedding dimension (4).
x

tensor([[-1.5961,  1.6146, -0.5593,  1.3978],
        [ 0.0600, -1.4765,  0.3478, -0.6228],
        [-0.2879,  1.2764,  0.3876, -0.1122],
        [-1.6582,  0.9503, -1.5498, -1.7656],
        [ 0.5006,  0.4357, -1.1175, -1.0587]], grad_fn=<EmbeddingBackward>)

In [3]:
# Average over dim 0, i.e. across the 5 token rows: one mean per embedding dimension (4 values).
x.mean(dim=0)

tensor([-0.5963,  0.5601, -0.4982, -0.4323], grad_fn=<MeanBackward1>)

In [4]:
# Average over dim 1, i.e. across the 4 embedding dimensions: one mean per token (5 values).
x.mean(dim=1)

tensor([ 0.2142, -0.4229,  0.3160, -1.0058, -0.3100], grad_fn=<MeanBackward1>)

In [7]:
import torch.nn.functional as F

# Log-softmax along dim 1 (the embedding dimension): each row is normalised independently.
F.log_softmax(x, dim=1)

tensor([[-3.8832, -0.6725, -2.8464, -0.8893],
        [-1.1164, -2.6529, -0.8286, -1.7992],
        [-2.1902, -0.6258, -1.5147, -2.0144],
        [-2.8089, -0.2004, -2.7004, -2.9163],
        [-0.8526, -0.9175, -2.4707, -2.4119]], grad_fn=<LogSoftmaxBackward>)

In [12]:
# Softmax over the last dimension: every row of y is a probability distribution that sums to 1.
y = F.softmax(x, dim=-1)
y

tensor([[0.0206, 0.5104, 0.0581, 0.4109],
        [0.3274, 0.0704, 0.4367, 0.1654],
        [0.1119, 0.5348, 0.2199, 0.1334],
        [0.0603, 0.8184, 0.0672, 0.0541],
        [0.4263, 0.3995, 0.0845, 0.0896]], grad_fn=<SoftmaxBackward>)

In [13]:
# Summing over dim 0 (down the columns) does not give 1s: the softmax above
# normalised each row (dim=-1), not each column.
torch.sum(y, dim=0)

tensor([0.9465, 2.3336, 0.8663, 0.8536], grad_fn=<SumBackward1>)