<a href="https://colab.research.google.com/github/s-ravi18/LLMs-From-Scratch/blob/main/Attention_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Input Embeddings
#    ↓
# Multi-Head Self Attention
#    ↓ (+ residual)
# LayerNorm
#    ↓
# Feed Forward Network
#    ↓ (+ residual)
# LayerNorm

In [1]:
import torch
import torch.nn as nn

In [None]:
# initialisation of parameters


In [2]:
# Width of every token embedding vector.
embed_size = 784
# Number of rows in the embedding table, i.e. how many distinct token ids it can hold.
vocab_size = 50000
# Learnable lookup table: token id -> embed_size-dimensional vector.
e = nn.Embedding(vocab_size, embed_size)
## Embedding layer --> each token id maps to a 784-dimensional vector

In [15]:
import re
import string

def clean_text(text: str) -> str:
    """
    Normalise raw text for simple whitespace tokenisation.

    Steps, applied in this order:
    1. lowercase
    2. strip URLs (anything starting with http / www)
    3. strip HTML tags
    4. drop ASCII punctuation
    5. collapse runs of whitespace and trim both ends

    Returns an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()

    # URLs first, then HTML tags: both are simply deleted.
    for pattern in (r"http\S+|www\S+|https\S+", r"<.*?>"):
        lowered = re.sub(pattern, "", lowered)

    # One C-level pass that deletes every character in string.punctuation.
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = lowered.translate(punctuation_table)

    return re.sub(r"\s+", " ", without_punctuation).strip()


In [16]:
sample = ["The girl is sitting in the room."]


In [17]:
# NOTE(review): this rebinds the name `clean_text` from the function to the
# string it returns, so the function is no longer callable afterwards and
# re-running this cell raises TypeError. Later cells read this string.
clean_text = clean_text(sample[0])

In [18]:
clean_text

'the girl is sitting in the room'

In [21]:
# Position -> word for every word of the cleaned sentence.
id_to_token = dict(enumerate(clean_text.split(' ')))
# Word -> position. A repeated word keeps the index of its LAST occurrence,
# so the resulting ids are not contiguous when the sentence has duplicates.
token_to_id = {token: position for position, token in id_to_token.items()}

In [26]:
# Encode the cleaned sentence as a 1-D tensor holding one vocabulary id per word.
clean_text_tokenised = torch.tensor(
    [token_to_id[word] for word in clean_text.split(' ')]
)

In [27]:
clean_text_tokenised

tensor([5, 1, 2, 3, 4, 5, 6])

In [29]:
# Look up one embedding row per token id in the encoded sentence.
res = e(clean_text_tokenised)

In [30]:
res

tensor([[ 3.4831e-01, -1.7372e-01,  1.6457e+00,  ..., -1.2697e+00,
         -3.5481e-04, -2.0473e+00],
        [ 6.4851e-01,  7.5638e-01, -5.0419e-01,  ..., -7.6762e-01,
         -1.0876e-01,  3.9190e-01],
        [ 1.3516e+00, -1.0577e+00, -2.3979e+00,  ...,  4.4797e-01,
         -5.1802e-01, -7.5275e-01],
        ...,
        [ 5.1763e-01,  5.5260e-01,  1.1672e+00,  ...,  4.0926e-01,
          3.6867e+00,  2.0453e-01],
        [ 3.4831e-01, -1.7372e-01,  1.6457e+00,  ..., -1.2697e+00,
         -3.5481e-04, -2.0473e+00],
        [-1.1794e-01,  1.4702e+00, -3.1845e-01,  ..., -2.2812e-03,
          1.9396e+00,  1.6642e-01]], grad_fn=<EmbeddingBackward0>)

In [None]:
# 1, batch_size = 5, max_seq_length, embedding_size

In [21]:
## Simple attention layer without positional encoding;
import torch.nn.init as init
import torch.nn.functional as F
import math

class attention_layer(nn.Module):
  """Multi-head scaled dot-product self-attention (no mask, no positional encoding).

  The input is projected to queries, keys and values, split into ``n_heads``
  heads of width ``embed_size // n_heads``, attended per head, concatenated
  back to ``embed_size`` and passed through a final linear projection.

  Args:
    n_heads: number of attention heads; must be positive and divide
      ``embed_size`` evenly.
    embed_size: width of the input and output feature vectors.

  Raises:
    ValueError: if ``embed_size`` is not divisible by ``n_heads``.
  """

  def __init__(self, n_heads, embed_size):
    # nn.Module has to be initialised before any sub-module can be assigned.
    super().__init__()

    if n_heads <= 0 or embed_size % n_heads != 0:
      raise ValueError(
          f"embed_size ({embed_size}) must be divisible by n_heads ({n_heads})"
      )

    self.n_heads = n_heads
    self.embed_size = embed_size
    self.attn_size = embed_size // n_heads  # per-head feature width
    self.scale = self.attn_size ** 0.5      # sqrt(d_k) used to temper the logits

    # Linear layers to project input -> Q, K, V.
    # They keep nn.Linear's default initialisation.
    self.W_Q = nn.Linear(self.embed_size, self.embed_size)
    self.W_K = nn.Linear(self.embed_size, self.embed_size)
    self.W_V = nn.Linear(self.embed_size, self.embed_size)

    # Output linear projection applied after the heads are concatenated.
    self.W_O = nn.Linear(self.embed_size, self.embed_size)

  def _split_heads(self, T, batch_size, seq_len):
    """Reshape (batch, seq, embed) -> (batch, n_heads, seq, attn_size)."""
    return T.reshape(batch_size, seq_len, self.n_heads, self.attn_size).transpose(1, 2)

  def forward(self, X):  ### X --> (batch_size, seq_len, embed_size)
    """Apply self-attention to ``X`` and return a tensor of the same shape."""
    batch_size = X.shape[0]
    seq_len = X.shape[1]

    # Project, then split the feature axis into heads.
    Q = self._split_heads(self.W_Q(X), batch_size, seq_len)
    K = self._split_heads(self.W_K(X), batch_size, seq_len)
    V = self._split_heads(self.W_V(X), batch_size, seq_len)

    # (batch, n_heads, seq, seq). True division is required here: floor
    # division would quantise the logits and has no useful gradient.
    att_scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

    # Normalise over the key axis so each query's weights sum to 1.
    weights = F.softmax(att_scores, dim=-1)

    # Weighted sum of values per head: (batch, n_heads, seq, attn_size).
    context = torch.matmul(weights, V)

    # Concatenate the heads back into the feature axis: (batch, seq, embed).
    context = context.transpose(1, 2).reshape(batch_size, seq_len, self.embed_size)

    return self.W_O(context)



In [None]:
class model_architecture(nn.Module):
  """Post-norm Transformer encoder block built on ``attention_layer``.

  Data flow: embeddings -> self-attention (+ residual) -> LayerNorm ->
  feed-forward network (+ residual) -> LayerNorm.

  Args:
    vocab_size: number of rows in the token embedding table.
    embed_size: width of the token embeddings and of every sublayer output.
    n_heads: number of attention heads passed to ``attention_layer``.
    hidden_dim: inner width of the feed-forward network.
  """

  def __init__(self, vocab_size, embed_size, n_heads, hidden_dim):

    super().__init__()

    ## Embedding Generation;
    self.e1 = nn.Embedding(vocab_size, embed_size)

    ## Attention layers - Encoder;
    # Number of stacked attention layers (12 in the case of GPT).
    self.multi_head_count = 1
    self.attn_layer = nn.ModuleList(
        [attention_layer(n_heads, embed_size) for _ in range(self.multi_head_count)]
    )

    # Position-wise feed-forward network: expand to hidden_dim, then project back.
    self.net = nn.Sequential(
        nn.Linear(embed_size, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, embed_size),
    )

    # Each sublayer gets its own normalisation so their statistics are not tied.
    self.ln1 = nn.LayerNorm(embed_size)  # after attention
    self.ln2 = nn.LayerNorm(embed_size)  # after feed-forward

  def forward(self, X):
    """Run the encoder block.

    ``X`` may be either already-embedded floating-point features or an
    integer tensor of token ids, which is embedded with ``self.e1`` first.
    Returns a tensor whose last dimension is ``embed_size``.
    """
    # Token ids are integer tensors; floating-point input is used as-is.
    if not X.is_floating_point():
      X = self.e1(X)

    for layer in self.attn_layer:
      X = self.ln1(layer(X) + X)  # attention + residual, then norm

    # Feed-forward sublayer with its own residual connection and norm.
    X = self.ln2(self.net(X) + X)

    return X



In [15]:
# 2 x 2 x 3 float64 tensor in which every length-3 row is [1, 2, 3].
t = torch.tensor(
    [
        [[1, 2, 3], [1, 2, 3]],
        [[1, 2, 3], [1, 2, 3]],
    ],
    dtype=float,
)

In [16]:
import torch.nn.functional as F

In [20]:
# Softmax over dim=-2 normalises across the two rows of each 2 x 3 slice; the
# rows are identical, so every entry comes out as 0.5.
F.softmax(t, dim=-2)

tensor([[[0.5000, 0.5000, 0.5000],
         [0.5000, 0.5000, 0.5000]],

        [[0.5000, 0.5000, 0.5000],
         [0.5000, 0.5000, 0.5000]]], dtype=torch.float64)

In [11]:
# Sanity check that three rounded probabilities sum to ~1
# (presumably softmax([1, 2, 3]) rounded to 4 decimals -- TODO confirm).
0.0900 + 0.2447 + 0.6652

0.9999