<a href="https://colab.research.google.com/github/shiwangi27/googlecolab/blob/main/transformer_pytorch_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### PyTorch for building and training Transformer NLP models from scratch. 

This is an implementation of Transformers from the [Attention is All You Need](https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf) paper by Vaswani et al. 

For this implementation, I referred to Andrej Karpathy's [minGPT](https://github.com/karpathy/minGPT/blob/master/mingpt).

I have tried to explain each step in the comments to make the code easier to follow.

In [1]:
import math
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F 

In [9]:
# Random stand-in for a batch of embeddings, drawn uniformly from [0, 1) with
# shape (32, 128, 512) as float32 on the CPU.
X_train_embeddings = torch.rand(
  32, 128, 512,
  dtype=torch.float32,
  device="cpu",
)

In [2]:
class Config:
  """Empty placeholder class; it defines no attributes or methods of its own."""

In [34]:
class MultiHeadSelfAttention(nn.Module):
  """
  Multi-head scaled dot-product self-attention.

  The input is linearly projected to queries, keys and values, split into
  `num_heads` subspaces of size `embed_dim // num_heads`, attended to
  independently per head, merged back and passed through a final linear
  projection:

      attention(q, k, v) = softmax(q . k^T / sqrt(d_head)) . v

  Args:
    embed_dim: size of the last dimension of the input; must be a positive
      multiple of `num_heads`.
    num_heads: number of attention heads.
    activation: callable stored as `self.activation`; `forward` does not use it.
    bias: whether the q/k/v and output projections learn an additive bias.
    attention_dropout: dropout probability applied to the attention weights.
    residual_dropout: dropout probability applied to the projected output.
    mask: if True, apply a causal (lower-triangular) mask so that position i
      only attends to positions <= i.

  Raises:
    ValueError: if `num_heads` is not positive or does not divide `embed_dim`.
  """
  def __init__(self, embed_dim: int,
               num_heads: int,
               activation=F.relu,
               bias: bool = True,
               attention_dropout: float = 0.1,
               residual_dropout: float = 0.1,
               mask: bool = False) -> None:
    super().__init__()

    # Fail early with a clear message instead of an opaque reshape error
    # on the first forward pass.
    if num_heads <= 0 or embed_dim % num_heads != 0:
      raise ValueError(
          f"embed_dim ({embed_dim}) must be a positive multiple of "
          f"num_heads ({num_heads})")

    self.num_heads = num_heads
    self.embed_dim = embed_dim
    self.activation = activation
    self.bias = bias
    self.mask = mask

    # Define Query, Key and Value as learnable linear layers.
    # `bias` is forwarded so that bias=False really removes the bias terms.
    self.linear_q = nn.Linear(embed_dim, embed_dim, bias=bias)
    self.linear_k = nn.Linear(embed_dim, embed_dim, bias=bias)
    self.linear_v = nn.Linear(embed_dim, embed_dim, bias=bias)

    # Define Dropout to add regularization to attention or to residual
    self.att_dropout = nn.Dropout(p=attention_dropout)
    self.res_dropout = nn.Dropout(p=residual_dropout)

    # Define a Linear Projection layer that mixes the outputs of all heads.
    self.out_projection = nn.Linear(embed_dim, embed_dim, bias=bias)

  def _reshape_to_subspace(self, x: Tensor) -> Tensor:
    """Split (batch, seq_len, embed_dim) into (batch * num_heads, seq_len, head_dim)."""
    # If the shape of the input was (batch_size, seq_len, embed_dim) = (32, 128, 512).
    # For 8 heads, we first reshape it to (32, 128, 8, 64) and then (32*8, 128, 64).

    batch_size, seq_len, embed_dim = x.size()
    subspace_dim = embed_dim // self.num_heads
    new_batch_size = batch_size * self.num_heads

    return x.reshape(batch_size, seq_len, self.num_heads, subspace_dim)\
            .permute((0, 2, 1, 3))\
            .reshape(new_batch_size, seq_len, subspace_dim)

  def _reshape_from_subspace(self, x: Tensor) -> Tensor:
    """Merge (batch * num_heads, seq_len, head_dim) back into (batch, seq_len, embed_dim)."""
    # Here we do the reverse of the above to get back the full embed dim after subspace learning.

    new_batch_size, seq_len, subspace_dim = x.size()
    embed_dim = subspace_dim * self.num_heads
    batch_size = new_batch_size // self.num_heads

    return x.reshape(batch_size, self.num_heads, seq_len, subspace_dim)\
            .permute(0, 2, 1, 3)\
            .reshape(batch_size, seq_len, embed_dim)

  def forward(self, X: Tensor) -> Tensor:
    """
    Apply multi-head self-attention to `X`.

    Args:
      X: tensor of shape (batch_size, seq_len, embed_dim).

    Returns:
      Tensor with the same shape as `X`.
    """
    # Query, Key, Value vectors
    q = self.linear_q(X)
    k = self.linear_k(X)
    v = self.linear_v(X)

    # Reshape the embed dim to subspace representation, splitting the sequence into multiple heads.
    q = self._reshape_to_subspace(q)
    k = self._reshape_to_subspace(k)
    v = self._reshape_to_subspace(v)

    # Scale by the inverse square root of the per-head dimension so the
    # logits do not grow with the head size.
    d = q.size(-1)

    # Scaled dot product attentions (optionally masked self attentions for architectures like GPT-2.)
    # Shape: (batch_size * num_heads, seq_len, seq_len).
    scaled_dot_product = q.matmul(k.transpose(-2, -1)) / math.sqrt(d)
    if self.mask:
      seq_len = scaled_dot_product.size(-1)
      # Lower-triangular boolean mask: True where a query may look at a key.
      # Built on the scores' device; broadcasts over the leading dimension.
      causal = torch.ones(seq_len, seq_len, dtype=torch.bool,
                          device=scaled_dot_product.device).tril()
      # -inf logits get zero weight after the softmax; the diagonal is always
      # kept, so no row is fully masked.
      scaled_dot_product = scaled_dot_product.masked_fill(~causal, float('-inf'))

    attention = F.softmax(scaled_dot_product, dim=-1)

    # Attention dropout is applied to the output of the softmax.
    attention = self.att_dropout(attention)

    y = attention.matmul(v)

    # Reshape the subspace learned representation back to the full embed dimension.
    y = self._reshape_from_subspace(y)

    # Linear projection of the output from each of the multi heads.
    y = self.out_projection(y)

    # residual dropout - it's the layer dropout before passing the output to the next layer.
    y = self.res_dropout(y)

    return y


In [35]:
# Smoke test: build an 8-head attention layer for 512-dim embeddings and run
# one forward pass over the sample batch.
multi_head_self_attention = MultiHeadSelfAttention(
  embed_dim=512,
  num_heads=8,
)
attentions = multi_head_self_attention(X_train_embeddings)

In [36]:
attentions

tensor([[[ 0.0205,  0.1748,  0.0574,  ...,  0.0205, -0.3576,  0.0832],
         [ 0.0145,  0.1700,  0.0703,  ...,  0.0160, -0.3584,  0.0752],
         [ 0.0190,  0.1771,  0.0671,  ...,  0.0135, -0.3471,  0.0743],
         ...,
         [ 0.0248,  0.1696,  0.0598,  ...,  0.0155, -0.3576,  0.0000],
         [ 0.0222,  0.1853,  0.0575,  ...,  0.0000, -0.3604,  0.0775],
         [ 0.0231,  0.1779,  0.0668,  ...,  0.0000, -0.3434,  0.0757]],

        [[ 0.0390,  0.0000,  0.0657,  ...,  0.0471, -0.3786,  0.0786],
         [ 0.0316,  0.2001,  0.0631,  ...,  0.0403, -0.3645,  0.0845],
         [ 0.0000,  0.2026,  0.0628,  ...,  0.0330, -0.0000,  0.0879],
         ...,
         [ 0.0273,  0.1791,  0.0660,  ...,  0.0372, -0.3662,  0.0860],
         [ 0.0232,  0.1913,  0.0711,  ...,  0.0373, -0.3744,  0.0686],
         [ 0.0224,  0.1874,  0.0664,  ...,  0.0442, -0.3709,  0.0843]],

        [[ 0.0351,  0.2084,  0.0697,  ...,  0.0041, -0.3639,  0.0711],
         [ 0.0210,  0.1926,  0.0841,  ...,  0

In [4]:
class TransformerBlock(nn.Module):
  """Placeholder block: an nn.Module with no layers whose forward returns None."""

  def __init__(self):
    super().__init__()

  def forward(self):
    # Not implemented yet: takes no inputs and produces no output.
    return None

In [3]:
class TransformerModel(nn.Module):
  """Placeholder model: an nn.Module with no layers whose forward returns None."""

  def __init__(self):
    # Zero-argument super() avoids naming the class explicitly; the previous
    # call referenced an undefined `BertModel` and raised NameError.
    super().__init__()

  def forward(self):
    # Not implemented yet: takes no inputs and produces no output.
    pass