<a href="https://colab.research.google.com/github/s-ravi18/LLMs-From-Scratch/blob/main/Attention_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Input Embeddings
#    ↓
# Multi-Head Self Attention
#    ↓ (+ residual)
# LayerNorm
#    ↓
# Feed Forward Network
#    ↓ (+ residual)
# LayerNorm

In [24]:
import torch
import torch.nn as nn

In [25]:
# initialisation of parameters


In [36]:
# Width of each token embedding vector.
embed_size = 784
# Number of rows in the embedding table, i.e. how many distinct token ids it accepts.
vocab_size = 50000
# Lookup table: token id in [0, vocab_size) --> embed_size-dimensional vector.
embed_layer = nn.Embedding(vocab_size, embed_size)
## Embedding layer --> each token id maps to a vector of size 784 (embed_size)

In [27]:
import re
import string

# Patterns are compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile(r"<.*?>")
# `http\S+` already matches `https...`, so a separate `https\S+` branch is unnecessary.
_URL_RE = re.compile(r"http\S+|www\S+")
_WHITESPACE_RE = re.compile(r"\s+")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text: str) -> str:
    """
    Basic text preprocessing, applied in this order:
    - Lowercase
    - Remove HTML tags
    - Remove URLs
    - Remove punctuation
    - Remove extra whitespaces

    Returns an empty string when ``text`` is not a ``str``.
    """

    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()

    # Remove HTML tags. This must run before URL removal: a URL inside an
    # attribute (e.g. <a href="http://x.com">link</a>) has no whitespace after
    # it, so stripping the URL first would also swallow the closing `">`,
    # the link text and the end tag, leaving an unmatched `<a href="` behind.
    text = _HTML_TAG_RE.sub("", text)

    # Remove URLs
    text = _URL_RE.sub("", text)

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove extra whitespaces
    return _WHITESPACE_RE.sub(" ", text).strip()


In [28]:
sample = ["The girl is sitting in the room."]


In [29]:
# NOTE(review): this rebinds the name `clean_text` from the function to its
# string result. After this cell runs, the function is no longer callable under
# that name, and re-running this cell fails until the defining cell is executed
# again. The later cells rely on `clean_text` being the cleaned string.
clean_text = clean_text(sample[0])

In [30]:
clean_text

'the girl is sitting in the room'

In [31]:
# Toy vocabulary built from the space-separated tokens of the cleaned sentence.
# token -> position: dict keys are unique, so a repeated token keeps the position
# of its LAST occurrence (e.g. "the" ends up with id 5) and the ids are not
# guaranteed to be contiguous.
token_to_id = {i:ind for ind, i in enumerate(clean_text.split(' '))}
# position -> token: one entry per position, so a repeated token is listed under
# several ids even though token_to_id only ever produces one of them.
id_to_token = {ind:i for ind, i in enumerate(clean_text.split(' '))}

In [32]:
clean_text_tokenised = torch.tensor([token_to_id[i] for i in clean_text.split(' ')])

In [33]:
clean_text_tokenised

tensor([5, 1, 2, 3, 4, 5, 6])

In [37]:
res = embed_layer(clean_text_tokenised)

In [38]:
res

tensor([[ 1.4535e+00,  1.4793e-03, -6.2106e-01,  ..., -2.3350e-01,
         -1.1471e+00,  4.8996e-01],
        [-1.5351e+00, -2.1321e-01,  6.8851e-02,  ..., -2.0872e-01,
          6.7512e-01, -3.2629e-01],
        [-1.0183e+00, -5.0319e-01,  1.7632e-01,  ...,  2.0079e-02,
         -5.8421e-01, -2.9888e-01],
        ...,
        [ 1.9812e+00, -1.0320e+00,  8.9632e-01,  ...,  1.0867e+00,
         -4.7835e-01,  1.1864e+00],
        [ 1.4535e+00,  1.4793e-03, -6.2106e-01,  ..., -2.3350e-01,
         -1.1471e+00,  4.8996e-01],
        [ 4.0768e-01, -1.0015e+00,  3.6183e-02,  ...,  1.5646e+00,
         -1.5207e-01,  1.1371e+00]], grad_fn=<EmbeddingBackward0>)

In [None]:
# 1, batch_size = 5, max_seq_length, embedding_size

In [93]:
## Simple attention layer without positional encoding;
import torch.nn.init as init
import torch.nn.functional as F
import math

class attention_layer(nn.Module):
  """Multi-head scaled dot-product self-attention (no mask, no positional encoding).

  The input is projected to queries, keys and values, split into ``n_heads``
  heads of width ``embed_size // n_heads``, attended per head, re-joined and
  passed through a final output projection.

  Args:
    n_heads: Number of attention heads; must be positive and divide ``embed_size``.
    embed_size: Size of the last dimension of both the input and the output.

  Raises:
    ValueError: If ``n_heads`` is not positive or does not divide ``embed_size``.
  """

  def __init__(self, n_heads, embed_size):

    super().__init__()

    # Fail fast: with an indivisible embed_size the head-splitting reshape in
    # forward() would only fail later with an opaque shape error.
    if n_heads <= 0 or embed_size % n_heads != 0:
      raise ValueError(
          f"embed_size ({embed_size}) must be divisible by a positive n_heads ({n_heads})"
      )

    self.n_heads = n_heads
    self.attn_size = embed_size // n_heads  ## width of a single head
    self.scale = self.attn_size ** 0.5      ## sqrt(head width), divides the raw scores
    self.embed_size = embed_size

    # Linear layers to project input → Q, K, V.
    # No custom initialisation is applied; nn.Linear's defaults are kept.
    self.W_Q = nn.Linear(self.embed_size, self.embed_size)
    self.W_K = nn.Linear(self.embed_size, self.embed_size)
    self.W_V = nn.Linear(self.embed_size, self.embed_size)

    # Output linear projection applied after the heads are concatenated
    self.W_O = nn.Linear(self.embed_size, self.embed_size)

  def _split_heads(self, T, batch_size, seq_len):
    """Reshape (batch_size, seq_len, embed_size) → (batch_size, n_heads, seq_len, attn_size)."""
    return T.reshape(batch_size, seq_len, self.n_heads, self.attn_size).transpose(-3, -2)

  def forward(self, X):
    """Apply self-attention to ``X``.

    Args:
      X: Tensor of shape (batch_size, seq_len, embed_size).

    Returns:
      Tensor of shape (batch_size, seq_len, embed_size).
    """

    batch_size = X.shape[0]
    seq_len = X.shape[1]

    ## Project, then split into heads: batch_size, n_heads, seq_len, attn_size
    K = self._split_heads(self.W_K(X), batch_size, seq_len)
    Q = self._split_heads(self.W_Q(X), batch_size, seq_len)
    V = self._split_heads(self.W_V(X), batch_size, seq_len)

    ## Scaled dot-product scores: batch_size, n_heads, seq_len, seq_len
    att_scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

    ## Softmax over the key axis, so each query's weights sum to 1
    weights = F.softmax(att_scores, dim=-1)

    ## Weighted sum of values: batch_size, n_heads, seq_len, attn_size
    context = torch.matmul(weights, V)

    ## Concatenate the heads back: batch_size, seq_len, embed_size
    context = context.transpose(1, 2).reshape(batch_size, seq_len, -1)

    ## Final output projection
    return self.W_O(context)



In [94]:
class Encoder(nn.Module):
  """Token embedding followed by self-attention layer(s) and one feed-forward block.

  Each sub-block is applied as ``LayerNorm(sublayer(X) + X)`` (residual, then norm).

  Args:
    vocab_size: Number of rows in the embedding table.
    embed_size: Embedding width; also the width of every layer's input and output.
    n_heads: Number of heads in each attention layer.
    hidden_dim: Width of the hidden layer of the feed-forward network.
    n_layers: Number of stacked attention layers. Defaults to 1, the value
      that used to be hard-coded.

  Raises:
    ValueError: If ``n_layers`` is smaller than 1.
  """

  def __init__(self, vocab_size, embed_size, n_heads, hidden_dim, n_layers=1):

    super().__init__()

    if n_layers < 1:
      raise ValueError(f"n_layers must be >= 1, got {n_layers}")

    ## Embedding Generation;
    self.e1 = nn.Embedding(vocab_size, embed_size)

    ## Attention layers - Encoder;
    self.n_layers = n_layers
    ## Legacy name kept for backward compatibility: this is the number of
    ## stacked attention layers, not the number of heads (that is n_heads).
    self.multi_head_count = n_layers
    self.attn_layer = nn.ModuleList([attention_layer(n_heads, embed_size) for _ in range(n_layers)])

    ## Position-wise feed forward network: embed_size → hidden_dim → embed_size
    self.net = nn.Sequential(
        nn.Linear(embed_size, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, embed_size),
    )

    ## ln1 follows the attention sub-block(s), ln2 follows the feed-forward one.
    ## NOTE: the single ln1 instance is applied after every attention layer,
    ## so its parameters are shared when n_layers > 1.
    self.ln1 = nn.LayerNorm(embed_size)
    self.ln2 = nn.LayerNorm(embed_size)


  def forward(self, X):
    """Encode a batch of token ids.

    Args:
      X: Integer tensor of token ids; the attention layers index its embedded
        form as (batch_size, seq_len, embed_size), so X is expected to be
        (batch_size, seq_len).

    Returns:
      Tensor of shape (batch_size, seq_len, embed_size).
    """

    ## Convert into its embedding;
    X = self.e1(X)

    ## Self-attention with residual connection, then LayerNorm
    for layer in self.attn_layer:
      X = layer(X) + X
      X = self.ln1(X)

    ## Feed forward NN with residual connection, then LayerNorm
    X = self.net(X) + X
    X = self.ln2(X)

    return X



In [95]:
# Attention heads per layer; 784 // 8 gives 98 dimensions per head.
n_heads = 8
# Width of the hidden layer inside the encoder's feed-forward network.
hidden_dim = 2048
# Embedding / model width (same value as in the embedding cell above).
embed_size = 784

# vocab_size is reused from the earlier embedding cell.
encoder = Encoder(vocab_size, embed_size, n_heads, hidden_dim)

In [96]:
X = clean_text_tokenised

In [97]:
X = X.view(1, -1)

In [98]:
X.shape

torch.Size([1, 7])

In [99]:
encoder(X)

tensor([[[ 1.0093,  0.2547,  0.4176,  ...,  0.3115,  0.2546, -0.0553],
         [ 0.6095, -1.1749,  0.9938,  ...,  1.7051, -1.7248, -0.4859],
         [ 0.0431,  0.5581, -2.1931,  ...,  0.8225, -0.4722, -0.2956],
         ...,
         [ 0.8295, -0.3770,  0.3906,  ..., -0.3801, -0.9384,  0.1321],
         [ 1.0093,  0.2547,  0.4176,  ...,  0.3115,  0.2546, -0.0553],
         [ 0.9119,  0.0678, -0.9759,  ..., -2.1345, -0.6113, -0.0054]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
## WIP; Need to implement the Masked Self Attention for the decoder.

class Decoder(nn.Module):
  """WIP decoder block: self-attention, then a feed-forward network.

  Both sub-blocks are applied as ``LayerNorm(sublayer(X) + X)``, matching the
  layout used by ``Encoder``.

  NOTE: the attention here is still unmasked; masked (causal) self-attention
  is not implemented yet.

  Args:
    vocab_size: Accepted but currently unused (this class has no embedding layer).
    embed_size: Width of the last dimension of the input and the output.
    n_heads: Number of heads in each attention layer.
    hidden_dim: Width of the hidden layer of the feed-forward network.
  """

  def __init__(self, vocab_size, embed_size, n_heads, hidden_dim):

    super().__init__()

    ## Attention layers - Decoder;
    self.multi_head_count = 1   ## 12 in the case of GPT
    self.attn_layer = nn.ModuleList([attention_layer(n_heads, embed_size) for i in range(self.multi_head_count)])

    ## Position-wise feed forward network: embed_size → hidden_dim → embed_size
    self.net = nn.Sequential(
        nn.Linear(embed_size, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, embed_size),
    )

    ## Separate norms for the two sub-blocks, so the attention output and the
    ## feed-forward output do not share one set of scale/shift parameters.
    self.ln1 = nn.LayerNorm(embed_size)
    self.ln2 = nn.LayerNorm(embed_size)


  def forward(self, X):
    """Run the decoder block.

    Args:
      X: Input passed straight to the attention layers, so it is expected to
        be already embedded: (batch_size, seq_len, embed_size).

    Returns:
      Tensor with the same shape as ``X``.
    """

    ## Self-attention with residual connection, then LayerNorm
    for layer in self.attn_layer:
      X = layer(X) + X
      X = self.ln1(X)

    ## Feed forward NN with residual connection, then its own LayerNorm
    X = self.net(X) + X
    X = self.ln2(X)

    return X



In [15]:
t = torch.tensor([[[1,2,3],
                  [1,2,3]],
                 [[1,2,3],
                  [1,2,3]]], dtype = float)

In [16]:
import torch.nn.functional as F

In [20]:
# dim=-2 normalises over the second-to-last axis, i.e. across the two rows of
# each 2x3 matrix. Both rows are identical, so every entry comes out as 0.5.
F.softmax(t, dim=-2)

tensor([[[0.5000, 0.5000, 0.5000],
         [0.5000, 0.5000, 0.5000]],

        [[0.5000, 0.5000, 0.5000],
         [0.5000, 0.5000, 0.5000]]], dtype=torch.float64)

In [11]:
0.0900 + 0.2447 + 0.6652

0.9999