# Transformer from Scratch

<img src="transformer_original.png"/>

# Terminologies

## Encoder
- Maps discrete numeric tokens to continuous dense representations that encode semantic information.

## Decoder
- Produces a sequence given the input/dense representation generating one token at a time.

## Attention 
- Dynamically adjusts each token's representation by adding up the weighted vectors of the tokens present in its vicinity.

## Multi-Head Attention
- Works similarly to having multiple convolution kernels each learning different semantics at the feature-level.

In [64]:
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import copy
import math
import pandas as pd
import altair as alt

<img src="transformer_all.png"/>

In [22]:
class Transformer(nn.Module):
    """Encoder-decoder wrapper that chains embedding, encoding and decoding.

    Args:
        src_embed: Callable applied to ``src`` before it is passed to the encoder.
        tgt_embed: Callable applied to ``tgt`` before it is passed to the decoder.
        encoder: Callable invoked as ``encoder(embedded_src, src_mask)``.
        decoder: Callable invoked as
            ``decoder(embedded_tgt, memory, src_mask, tgt_mask)``.
    """

    def __init__(self, src_embed, tgt_embed, encoder, decoder):
        # nn.Module has to be initialised first; otherwise assigning a
        # submodule below raises "cannot assign module before
        # Module.__init__() call".
        super().__init__()
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_mask, tgt, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the encoder output.

        Returns:
            Whatever ``self.decoder`` returns for the embedded target, the
            encoder output, and the two masks.
        """
        h = self.encoder(self.src_embed(src), src_mask)
        y = self.decoder(self.tgt_embed(tgt), h, src_mask, tgt_mask)
        return y
   

<img src="transformer_original_Nx.png"/>

In [23]:
def clones(module, N):
    """Return an ``nn.ModuleList`` of ``N`` independent deep copies of ``module``."""
    copies = nn.ModuleList()
    for _ in range(N):
        # deepcopy so that every copy owns its own parameters.
        copies.append(copy.deepcopy(module))
    return copies

<img src="transformer_norm.png"/>

In [42]:
class LayerNorm(nn.Module):
    """Normalise the last dimension, then apply a learnable scale and shift.

    Args:
        num_features: Number of entries in ``gamma`` and ``beta``; they are
            broadcast against the last dimension of the input.
        epsilon: Small constant added to the standard deviation so the
            division stays finite when a row has no spread.
    """

    def __init__(self, num_features, epsilon=1e-9):
        # Parameters can only be registered after nn.Module is initialised.
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(num_features))
        self.beta = nn.Parameter(torch.zeros(num_features))
        self.epsilon = epsilon

    def forward(self, x):
        """Return ``gamma * (x - mean) / (std + epsilon) + beta``.

        ``mean`` and ``std`` are taken over the last dimension with
        ``keepdim=True`` so they broadcast back against ``x``.
        """
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        # Divide by the standard deviation itself: ``math.sqrt`` cannot take a
        # multi-element tensor, and the square root of a std is not the
        # normalising quantity anyway.
        return self.gamma * (x - mean) / (std + self.epsilon) + self.beta

This can act as an encoder layer or a decoder layer.

🗒 Some implementations, including the paper, seem to differ in where layer normalization is done. Here we apply layer normalization before the attention and feed-forward networks, and then add the original residual vectors. The alternative is to apply layer normalization after adding the residuals, but we found this to be less stable when training. We found a detailed discussion about this in the paper [On Layer Normalization in the Transformer Architecture](https://papers.labml.ai/paper/2002.04745).

In [49]:
class SublayerConnection(nn.Module):
    """Residual connection around a sub-layer, with normalisation applied first.

    Args:
        num_features: Passed to ``LayerNorm`` as its feature count.
        dropout: Drop probability passed to ``nn.Dropout``.
    """

    def __init__(self, num_features, dropout):
        # Required before any submodule can be attached to this module.
        super().__init__()
        self.norm = LayerNorm(num_features)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, layer):
        """Return ``x + dropout(layer(norm(x)))``.

        ``layer`` is any callable taking the normalised input; the sum
        requires its output to be broadcast-compatible with ``x``.
        """
        return x + self.dropout(layer(self.norm(x)))

<img src="transformer_encoder.png"/>

In [51]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward sub-layer.

    Each sub-layer is wrapped in its own ``SublayerConnection``.

    Args:
        attention: Callable invoked as ``attention(q, k, v, mask)``.
        ff_layer: Callable applied to the normalised attention output.
        d_model: Feature size handed to the ``SublayerConnection`` wrappers;
            also stored as ``self.size``.
        dropout: Drop probability handed to the ``SublayerConnection`` wrappers.
    """

    def __init__(self, attention, ff_layer, d_model, dropout):
        # Required before any submodule can be attached to this module.
        super().__init__()
        self.layer_connection = clones(SublayerConnection(num_features=d_model, dropout=dropout), 2)
        self.ff_layer = ff_layer
        self.attention = attention
        # Encoder reads ``layer.size`` to size its final normalisation.
        self.size = d_model

    def forward(self, x, mask):
        """Run self-attention (query = key = value) and then the feed-forward layer."""
        x = self.layer_connection[0](x, lambda normed: self.attention(normed, normed, normed, mask))
        return self.layer_connection[1](x, self.ff_layer)

In [52]:
class Encoder(nn.Module):
    """Stack of ``num_layers`` copies of ``layer`` followed by a final ``LayerNorm``.

    Args:
        layer: Module to replicate; it must expose a ``size`` attribute, which
            is used as the feature count of the final normalisation.
        num_layers: Number of copies to stack.
    """

    def __init__(self, layer, num_layers):
        # Required before any submodule can be attached to this module.
        super().__init__()
        self.encoder_layers = clones(layer, num_layers)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass ``x`` (with ``mask``) through every layer in order, then normalise."""
        for layer in self.encoder_layers:
            x = layer(x, mask)
        return self.norm(x)

<img src="transformer_decoder.png"/>

In [53]:
class Decoder(nn.Module):
    """Stack of ``num_layers`` copies of ``layer`` followed by a final ``LayerNorm``.

    Args:
        layer: Module to replicate; it must expose a ``size`` attribute, which
            is used as the feature count of the final normalisation.
        num_layers: Number of copies to stack.
    """

    def __init__(self, layer, num_layers):
        # Required before any submodule can be attached to this module.
        super().__init__()
        self.decoder_layers = clones(layer, num_layers)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through every layer in order, then normalise.

        ``memory``, ``src_mask`` and ``tgt_mask`` are forwarded unchanged to
        each layer.
        """
        for layer in self.decoder_layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [54]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, cross-attention, then feed-forward.

    Each of the three sub-layers is wrapped in its own ``SublayerConnection``.

    Args:
        self_attention: Callable invoked as ``self_attention(q, k, v, tgt_mask)``
            with the decoder input as query, key and value.
        cross_attention: Callable invoked as
            ``cross_attention(q, memory, memory, src_mask)``.
        ff_layer: Callable applied to the normalised cross-attention output.
        d_model: Feature size handed to the ``SublayerConnection`` wrappers;
            also stored as ``self.size``.
        dropout: Drop probability handed to the ``SublayerConnection`` wrappers.
    """

    def __init__(self, self_attention, cross_attention, ff_layer, d_model, dropout):
        # Required before any submodule can be attached to this module.
        super().__init__()
        self.layer_connection = clones(SublayerConnection(num_features=d_model, dropout=dropout), 3)
        self.ff_layer = ff_layer
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        # Decoder reads ``layer.size`` to size its final normalisation.
        self.size = d_model

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sub-layers in order and return the result."""
        # The attribute is ``self_attention``; ``self.attention`` does not exist.
        x = self.layer_connection[0](
            x, lambda normed: self.self_attention(normed, normed, normed, tgt_mask)
        )
        x = self.layer_connection[1](
            x, lambda normed: self.cross_attention(normed, memory, memory, src_mask)
        )
        return self.layer_connection[2](x, self.ff_layer)

In [29]:
import torch

# Lower-triangular 5x5 matrix of ones: row i keeps ones in columns 0..i and zeros after.
torch.tril(torch.ones(5,5))

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [116]:
def subsequent_mask(size):
    """Build a boolean ``(1, size, size)`` mask that hides subsequent positions.

    Entry ``[0, i, j]`` is ``True`` when ``j <= i`` and ``False`` for the
    strictly-upper triangle.
    """
    ones = torch.ones(1, size, size)
    # Ones strictly above the diagonal mark the positions to hide.
    hidden = torch.triu(ones, diagonal=1).type(torch.uint8)
    return torch.eq(hidden, 0)

In [68]:
# Global switch: set to False to skip every example wrapped in show_example.
RUN_EXAMPLES = True


def show_example(fn, args=[]):
    """Call ``fn(*args)`` only when run as ``__main__`` with examples enabled.

    Returns the result of ``fn`` in that case and ``None`` otherwise.
    """
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return None
    return fn(*args)
    
def example_mask():
    """Return an interactive Altair chart of the 20x20 subsequent-position mask.

    One single-row DataFrame is built per (x, y) cell and concatenated; the
    chart plots "Window" (y) against "Masking" (x) and colours each cell by
    the mask value at ``[x, y]``.
    """
    # The mask does not depend on the loop variables, so build it once
    # instead of once per cell.
    mask = subsequent_mask(20)[0]
    LS_data = pd.concat(
        [
            pd.DataFrame(
                {
                    "Subsequent Mask": mask[x, y].flatten(),
                    "Window": y,
                    "Masking": x,
                }
            )
            for y in range(20)
            for x in range(20)
        ]
    )

    return (
        alt.Chart(LS_data)
        .mark_rect()
        .properties(height=250, width=250)
        .encode(
            alt.X("Window:O"),
            alt.Y("Masking:O"),
            alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis")),
        )
        .interactive()
    )


# Build the mask chart; show_example only calls it under __main__ with RUN_EXAMPLES set.
show_example(example_mask)#(subsequent_mask(5))

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot-product attention.

    Args:
        query, key, value: Tensors whose last two dimensions are multiplied
            as ``query @ key^T`` and ``weights @ value``.
        mask: Optional tensor; score positions where ``mask == 0`` are
            replaced by ``-1e9`` before the softmax.
        dropout: Optional callable applied to the attention weights.

    Returns:
        Tuple ``(weights @ value, weights)``.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        # A large negative score makes masked positions vanish after softmax.
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    attended = torch.matmul(weights, value)
    return attended, weights

In [None]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from four ``d_model -> d_model`` linear maps.

    The first three linears project query, key and value; the fourth is
    applied to the re-merged heads. The weights from the most recent forward
    pass are kept in ``self.attn``.
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Take in the number of heads ``h`` and the model size ``d_model``."""
        super().__init__()
        assert d_model % h == 0
        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, merge heads, and project again."""
        batch = query.size(0)
        # One extra axis lets the same mask broadcast over all h heads.
        head_mask = None if mask is None else mask.unsqueeze(1)

        def split_heads(projection, tensor):
            # d_model is viewed as h x d_k, then the head axis moves ahead of
            # the sequence axis: (batch, h, seq, d_k).
            projected = projection(tensor)
            return projected.view(batch, -1, self.h, self.d_k).transpose(1, 2)

        q_heads = split_heads(self.linears[0], query)
        k_heads = split_heads(self.linears[1], key)
        v_heads = split_heads(self.linears[2], value)

        context, self.attn = attention(
            q_heads, k_heads, v_heads, mask=head_mask, dropout=self.dropout
        )

        # Undo the head split: back to (batch, seq, h * d_k).
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch, -1, self.h * self.d_k)
        return self.linears[-1](merged)

# Positional Embedding Characteristics:

### Source: [Positional Encoding in Transformer Neural Networks Explained](https://www.youtube.com/watch?v=ZMxVe-HK174&list=PLTl9hO2Oobd97qfWC40gOSU8C0iu0m2l4&index=3&ab_channel=CodeEmporium)

1. Periodicity: A token can periodically attend to neighboring tokens, i.e. the $i$-th token would be able to attend more to positions $i \cdot k, i \cdot 2k, \ldots, i \cdot nk$. The wavelengths form a geometric progression from $2\pi$ to $10000 \cdot 2\pi$. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset $k$, $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$.

2. Constrained values: The values are constrained (assume monotonically decrease as we move forward in the sequence).
3. Easy to extrapolate to long sequences that were not even seen in the training set.



In [152]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional-encoding table.

    Args:
        d_model: Number of columns in the returned table.
        max_sequence_length: Number of rows (positions) in the returned table.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Return a ``(max_sequence_length, d_model)`` tensor of encodings.

        Even columns hold ``sin(pos / 10000^(i / d_model))`` and odd columns
        the matching ``cos``, where ``i`` is the even column index.
        """
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i / self.d_model)
        position = torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1)
        angles = position / denominator
        # Stacking on a new last axis and flattening interleaves the columns
        # as sin, cos, sin, cos, ...
        stacked = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # An odd d_model produces one surplus cos column; trim it so the
        # width always equals d_model. For even d_model this is a no-op.
        return PE[:, : self.d_model]

In [None]:
# Saarland uni is
# is Saarland uni

#5, 512

# Saarland -> [0.1, 0.33, .., indx=511]