In [4]:
import math

#we use pytorch when parameters require complex cmputations. Helps in neural networks
import torch
import torch.nn as nn
import torch.nn.functional as F

In [5]:
#Give an input data
sentence = "My name is Sajag mathur. He likes classical music."     

# Tokenize words and provide user defined values. We will use a user defined function for the same

In [6]:
def tokenize(text: str, vocab_size: int) -> torch.Tensor:
    """Dummy text tokenizer."""
    #break into chunks
    words = text.split(" ")
    #give integer value to each chunk - this is for testing. In case you want consistant values, you replace by huggingface transformer and integer values
    return torch.randint(0, vocab_size, [len(words)])

#give a hyperparameter called vocab_size --> this means that total number of unique tokens that model can recognize and process is 20000.
VOCAB_SIZE = 20000

tokenized_sentence = tokenize(sentence, VOCAB_SIZE)
n_tokens = len(tokenized_sentence)
tokenized_sentence

tensor([14891, 10332,   193, 13001,  3627,  3726,  7516,  7895, 19014])

# And embed each token into a vector space using PyTorch's [torch.nn.Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding) module.


This image shows how the embedding layer in a neural network (like a transformer) works:

1. Input tokens (such as words or subwords) are represented as symbols (like “V”, “B”, etc.) on the right.

2. Each token is mapped to a unique vector (a row of numbers), seen as colored circles and columns in the middle.

3. The embedding layer transforms each token into its corresponding numerical vector (embedding), stacking these vectors together to form the input for the rest of the model.

In short: The embedding layer turns text tokens into high-dimensional numeric representations that the model can understand and process further.


In [7]:
#Generate Embeddings and convert input tokens into vectors --> This is run through neural network.embedding
# Vocabulary Size and Dimension of Embeddings --> you tokenize the sentence
EMBEDDING_DIM = 32

embedding_layer = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
embedded_tokens = embedding_layer(tokenized_sentence)
embedded_tokens.shape

#Torch.size(size of input data, embedding dimension)

torch.Size([9, 32])

In [8]:

embedded_tokens

tensor([[-1.0857e+00,  1.0020e+00, -2.3250e-01,  1.4590e+00,  1.2314e+00,
         -5.9862e-01, -1.2298e-01,  4.9957e-01, -4.1830e-01, -2.0499e+00,
         -9.5629e-01,  5.8214e-01,  5.1003e-01,  7.7742e-01,  2.9274e-01,
          1.2250e-01,  4.7286e-01,  3.8568e-01,  6.8314e-01, -9.0941e-02,
         -1.5446e-01,  6.5623e-01,  1.2459e+00,  5.5247e-01,  8.9871e-01,
          3.5293e-01,  1.5774e+00, -7.0908e-01,  2.2781e+00,  1.7482e-01,
         -7.8116e-01, -3.4698e-01],
        [-1.1850e+00, -3.2007e-01, -2.4116e+00, -2.3570e-02, -4.8931e-01,
         -1.2256e+00,  2.0615e+00,  1.2602e+00, -9.6038e-01, -3.2274e-02,
         -6.4393e-01,  5.9620e-01,  1.7711e-01,  1.1517e+00,  7.6437e-01,
          4.8272e-01,  1.0268e+00,  7.4599e-01, -1.0638e-01,  8.0193e-01,
         -8.3958e-01,  7.3929e-01,  5.1022e-01,  9.3816e-01, -2.2969e-01,
          6.1165e-01,  1.8311e-01, -1.1351e-01,  3.2022e-01, -2.0328e+00,
         -2.8329e-02, -1.2480e+00],
        [ 3.2033e-01,  1.7559e+00, -6.84

In [9]:
embedding_layer

Embedding(20000, 32)

In [10]:
#Vocabulary size - 20000 Embedding Size - 32
#Every time you run this, the tokens are changing -->Everytime you do this, the weights get optimized. At first iteration, it randomly initializes the vectors
#During training, the vectors change weights. Everytime model sees new sentense or similar sentence, the weight will change.
#You can also do pre trained embedding

 A higher embedding dimension allows the model to capture more complex and nuanced features of the data, but increases memory and computation requirements.

- A lower embedding dimension is more efficient, but might not capture all the important information, especially for complex tasks or large vocabularies.

These embeddings will need to be learnt when training any model that uses an embedding layer. We can easily compute the number of parameters that need to be learnt.

In [11]:
# How many total number of embeddings in our model? 20000 (Vocab Size)*Embedding Dimension (32)

n_embedding_params = sum(len(p.flatten()) for p in embedding_layer.parameters())
print(f"number of embedding parameters = {n_embedding_params:,}")

number of embedding parameters = 640,000


## Basic Self-Attention

An approach to computing attention is to express the new context-aware embeddings as a weighted linear combination or the input embeddings - e.g., $\vec{x_{i}} \to \vec{z_{i}} = \sum_{j=1}^{N}{a_{ij} \times \vec{x_{j}}}$. 

One sensible approach to computing the weights is to use the vector [dot product](https://en.wikipedia.org/wiki/Dot_product) between the embedding vectors - e.g., $a_{ij} = x_{i}^{T} \cdot x_{i}$. This will lead to weights that are higher for embedding vectors that are geometrically nearer to one another in the embedding space (i.e., are semantically closer), and vice versa.

In [12]:
# Compute the weight by using dot product between embedding vectors - random values: allocates uninitialized memory to tensor
torch.empty(n_tokens,n_tokens)

tensor([[2.3578e+26, 1.9912e-42, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+0

Self-attention in transformers computes **how much one token should consider every other token’s information to build context-aware representations.** 

Below code computes the core **similarity scores (unnormalized attention weights) using dot products between embeddings.**

For each token 

i = (acting as a query), we calculate the dot product with every other token j's

j = embedding vector (acting as keys).

In [13]:
#for all tokens --> attention weight is dot product of each embedding token against any other embedding token

attn_weights = torch.empty(n_tokens, n_tokens)


for i in range(n_tokens):
    for j in range(n_tokens):
        attn_weights[i, j] = torch.dot(embedded_tokens[i], embedded_tokens[j])

attn_weights.shape

torch.Size([9, 9])

In [14]:
attn_weights

tensor([[26.1343,  8.3032, -1.3191, -4.6427,  6.6822,  5.4403, -1.2678,  1.9402,
         -0.4022],
        [ 8.3032, 29.6948, -7.4185, -0.3402, -3.0784,  3.4283,  4.1866,  3.2277,
          6.2666],
        [-1.3191, -7.4185, 22.4683, -0.2169, -2.1722, -0.0726, -8.7021, -5.5650,
         -9.5592],
        [-4.6427, -0.3402, -0.2169, 25.5506,  3.4426,  4.8543,  7.5582, -0.0986,
          5.0680],
        [ 6.6822, -3.0784, -2.1722,  3.4426, 29.5835,  1.6087,  4.6315,  9.4329,
          0.6769],
        [ 5.4403,  3.4283, -0.0726,  4.8543,  1.6087, 34.1028,  4.1000, -0.4884,
         -7.0386],
        [-1.2678,  4.1866, -8.7021,  7.5582,  4.6315,  4.1000, 24.8313,  3.0486,
          1.7539],
        [ 1.9402,  3.2277, -5.5650, -0.0986,  9.4329, -0.4884,  3.0486, 27.9390,
          4.2726],
        [-0.4022,  6.2666, -9.5592,  5.0680,  0.6769, -7.0386,  1.7539,  4.2726,
         31.5849]], grad_fn=<CopySlices>)

In [15]:
attn_weights

tensor([[26.1343,  8.3032, -1.3191, -4.6427,  6.6822,  5.4403, -1.2678,  1.9402,
         -0.4022],
        [ 8.3032, 29.6948, -7.4185, -0.3402, -3.0784,  3.4283,  4.1866,  3.2277,
          6.2666],
        [-1.3191, -7.4185, 22.4683, -0.2169, -2.1722, -0.0726, -8.7021, -5.5650,
         -9.5592],
        [-4.6427, -0.3402, -0.2169, 25.5506,  3.4426,  4.8543,  7.5582, -0.0986,
          5.0680],
        [ 6.6822, -3.0784, -2.1722,  3.4426, 29.5835,  1.6087,  4.6315,  9.4329,
          0.6769],
        [ 5.4403,  3.4283, -0.0726,  4.8543,  1.6087, 34.1028,  4.1000, -0.4884,
         -7.0386],
        [-1.2678,  4.1866, -8.7021,  7.5582,  4.6315,  4.1000, 24.8313,  3.0486,
          1.7539],
        [ 1.9402,  3.2277, -5.5650, -0.0986,  9.4329, -0.4884,  3.0486, 27.9390,
          4.2726],
        [-0.4022,  6.2666, -9.5592,  5.0680,  0.6769, -7.0386,  1.7539,  4.2726,
         31.5849]], grad_fn=<CopySlices>)

This calculation can also be computed more efficiently using matrix multiplication.


#### OR

In [16]:
attn_weights_matmul = torch.matmul(embedded_tokens, embedded_tokens.T)

In [17]:
#dot scale product between embedded tokens and their transpose

In [18]:
attn_weights_matmul

tensor([[26.1343,  8.3032, -1.3191, -4.6427,  6.6822,  5.4403, -1.2678,  1.9402,
         -0.4022],
        [ 8.3032, 29.6948, -7.4185, -0.3402, -3.0784,  3.4283,  4.1866,  3.2277,
          6.2666],
        [-1.3191, -7.4185, 22.4683, -0.2169, -2.1722, -0.0726, -8.7021, -5.5650,
         -9.5592],
        [-4.6427, -0.3402, -0.2169, 25.5506,  3.4426,  4.8543,  7.5582, -0.0986,
          5.0680],
        [ 6.6822, -3.0784, -2.1722,  3.4426, 29.5835,  1.6087,  4.6315,  9.4329,
          0.6769],
        [ 5.4403,  3.4283, -0.0726,  4.8543,  1.6087, 34.1028,  4.1000, -0.4884,
         -7.0386],
        [-1.2678,  4.1866, -8.7021,  7.5582,  4.6315,  4.1000, 24.8313,  3.0486,
          1.7539],
        [ 1.9402,  3.2277, -5.5650, -0.0986,  9.4329, -0.4884,  3.0486, 27.9390,
          4.2726],
        [-0.4022,  6.2666, -9.5592,  5.0680,  0.6769, -7.0386,  1.7539,  4.2726,
         31.5849]], grad_fn=<MmBackward0>)

And we can verify that the two approaches are equivalent.

#### VALIDATING

In [19]:
#check if both approaches are same or not
torch.allclose(attn_weights_matmul, attn_weights, atol=1e-6)

True

# Let's break down the key points:

#### "the weights are scaled by the embedding dimension":

1. In the context of self-attention (a core component of transformers), the attention mechanism calculates "attention scores" (or "logits") between a query and all keys. These scores determine how much focus each part of the input sequence should get.


2. Before applying the softmax function, these attention scores are typically divided by the square root of the embedding dimension 


3. Why? This scaling is crucial to prevent the dot products (which are used to calculate the attention scores) from becoming very large as the embedding dimension increases. When dot products become very large, they can push the softmax function into regions where its gradients are extremely small (saturate), which can lead to vanishing gradients and hinder the training process. Dividing by square root of d, helps to keep the variance of the dot products consistent, regardless of the embedding dimension size. This concept is detailed in the original "Attention Is All You Need" paper by Vaswani et al.



#### "and subsequently renormalised to sum to one across rows using the softmax function.":

1. After scaling, the attention scores are passed through a softmax function.


2. Why? The softmax function converts the raw attention scores into a probability distribution. This means that for each query, the attention weights assigned to all keys will be positive and sum up to 1. These normalized weights then determine how much each key contributes to the weighted sum that forms the attention output. This effectively creates a convex combination of the value vectors, where the weights indicate the "importance" of each value.


#### "Steps like these make models easier to train by normalising the magnitude of gradients used within algorithms like stochastic gradient descent.":

1. Normalization of Gradients: The scaling by square root of d and the use of softmax both contribute to a more stable training process.

2. Scaling: As mentioned, it prevents large attention scores from leading to saturated softmax outputs, which would result in near-zero gradients during backpropagation. By keeping the gradients in a reasonable range, the optimization algorithm (like Stochastic Gradient Descent - SGD) can make meaningful updates to the model's weights.

3. Softmax: By producing probabilities that sum to one, softmax ensures that the output of the attention mechanism is well-bounded. This helps in controlling the magnitude of activations and, consequently, the gradients flowing back through the network.

4. Easier Training: When gradients are well-behaved (not too large, not too small), optimizers can more effectively navigate the loss landscape, leading to faster convergence and better overall model performance. Issues like vanishing or exploding gradients can significantly impede or even halt the training process.


In summary, the described steps are fundamental to the stability and effectiveness of attention mechanisms in transformers. They address potential numerical issues that arise from large dot products, ensuring that the gradients during training remain in a healthy range, thus facilitating the training of these complex models with algorithms like SGD.

In [20]:
# Attention gives different weights to different tokens. Self attention gives focus to itself + other tokens
# Attention mechanism works on attention score:
# Attention Score = Q*(K^T)/sqrt(dk)
#Given in attention is all you need by Vaswani and other researchers
#Thus, attention score is calculated by using Query and product of transformatoin of K. This was statistically very high. 
#Model started having vanishing gradiant. To lower this, we started dividing it by square root of dimension of key matrix
# Then you normalize things using softmax functions
# Earlier we used raw similarity scores, but for a good model, we need attention scores - Let's calculate that. Here, we took raw weights and normalized
# it using softmax function and embedding dimension. All values between 0 and 1 because we have used softmax normalization

In [21]:
attn_weights_norm = F.softmax(attn_weights / math.sqrt(EMBEDDING_DIM), dim=1)
print(attn_weights_norm)
#attn_weights: This variable represents the raw, unnormalized attention scores (or "logits").
#Softmax converts the scaled attention scores into a probability distribution. The output values will be between 0 and 1,

tensor([[8.7434e-01, 3.7388e-02, 6.8233e-03, 3.7917e-03, 2.8072e-02, 2.2539e-02,
         6.8856e-03, 1.2140e-02, 8.0240e-03],
        [2.1139e-02, 9.2763e-01, 1.3124e-03, 4.5867e-03, 2.8267e-03, 8.9293e-03,
         1.0210e-02, 8.6182e-03, 1.4748e-02],
        [1.3762e-02, 4.6819e-03, 9.2241e-01, 1.6723e-02, 1.1836e-02, 1.7155e-02,
         3.7314e-03, 6.4970e-03, 3.2068e-03],
        [4.1790e-03, 8.9412e-03, 9.1381e-03, 8.6918e-01, 1.7450e-02, 2.2397e-02,
         3.6122e-02, 9.3312e-03, 2.3259e-02],
        [1.6043e-02, 2.8571e-03, 3.3535e-03, 9.0482e-03, 9.1935e-01, 6.5429e-03,
         1.1164e-02, 2.6089e-02, 5.5492e-03],
        [6.1195e-03, 4.2879e-03, 2.3093e-03, 5.5173e-03, 3.1085e-03, 9.7101e-01,
         4.8285e-03, 2.1456e-03, 6.7401e-04],
        [8.4190e-03, 2.2081e-02, 2.2621e-03, 4.0073e-02, 2.3887e-02, 2.1745e-02,
         8.4911e-01, 1.8057e-02, 1.4363e-02],
        [9.1374e-03, 1.1473e-02, 2.4245e-03, 6.3722e-03, 3.4360e-02, 5.9479e-03,
         1.1115e-02, 9.0537e-0

Verify that rows sum to one. The softmax function is not changing because the moment you apply dot product, it will remain constant throughout

In [22]:
attn_weights_norm.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)

In [23]:
#Total values sum to 1

### Now we need context aware embeddings: What are context aware embeddings?
### Embedding is able to read context behind each token
Context-aware embeddings (like those from BERT, GPT, RoBERTa) generate vectors for words or sentences depending on their surrounding context. So, “bank” in “river bank” and “bank” in “credit bank” have different embeddings.

Value vector helps us calculate the context aware embeddings

In [24]:
context_weighted_embeddings = torch.matmul(attn_weights_norm, embedded_tokens)
context_weighted_embeddings.shape

#creating a weighted sum of the input embeddings (which act as "values") based on the calculated attention weights. 
#This weighted sum forms the "context vector" or the output of the attention head for each query.

torch.Size([9, 32])

In [25]:
context_weighted_embeddings

tensor([[-1.0204e+00,  8.7015e-01, -2.4918e-01,  1.2211e+00,  1.0704e+00,
         -5.2672e-01,  7.7677e-03,  5.0431e-01, -3.7671e-01, -1.7926e+00,
         -8.8512e-01,  5.6510e-01,  5.3660e-01,  6.7693e-01,  3.1917e-01,
          1.2565e-01,  4.3635e-01,  4.1690e-01,  5.4701e-01, -8.3406e-02,
         -1.3436e-01,  6.1509e-01,  1.1197e+00,  5.8140e-01,  7.0009e-01,
          3.9966e-01,  1.4699e+00, -6.2345e-01,  2.0834e+00,  1.0477e-01,
         -6.8040e-01, -3.4457e-01],
        [-1.1314e+00, -3.1196e-01, -2.2292e+00, -7.5803e-04, -4.2823e-01,
         -1.1242e+00,  1.9543e+00,  1.2069e+00, -8.8895e-01, -5.9235e-02,
         -6.2384e-01,  5.9320e-01,  1.8282e-01,  1.0736e+00,  7.2972e-01,
          4.7152e-01,  9.4720e-01,  7.1531e-01, -1.1649e-01,  7.4002e-01,
         -7.3636e-01,  6.9609e-01,  5.0881e-01,  9.1078e-01, -2.2827e-01,
          6.1326e-01,  2.0561e-01, -1.0442e-01,  4.0297e-01, -1.8972e+00,
         -4.7123e-02, -1.1640e+00],
        [ 2.8390e-01,  1.6043e+00, -6.03

In [26]:
# embedded_tokens: are the value vectors (V).
#These are the actual embeddings of the input tokens (or features) from which information is being extracted. 
#Their shape is usually (batch_size, num_tokens, embedding_dim) (where num_tokens corresponds to num_keys).
#Each row represents the embedding vector of a particular token in the sequence.
#More the embedding size, it extracts more data features

In [27]:
context_weighted_embeddings_3 = (
    attn_weights_norm[3, 0] * embedded_tokens[0]
    + attn_weights_norm[3, 1] * embedded_tokens[1]
    + attn_weights_norm[3, 2] * embedded_tokens[2]
    + attn_weights_norm[3, 3] * embedded_tokens[3]
    + attn_weights_norm[3, 4] * embedded_tokens[4]
    + attn_weights_norm[3, 5] * embedded_tokens[5]
    + attn_weights_norm[3, 6] * embedded_tokens[6]
    + attn_weights_norm[3, 7] * embedded_tokens[7]
    + attn_weights_norm[3, 8] * embedded_tokens[8]
    )

context_weighted_embeddings_3

tensor([ 1.1398, -1.2633,  0.1315,  0.6854, -0.9446,  1.2194,  0.9699,  1.1646,
         0.4561,  0.8399,  0.3343,  0.8578,  0.5291, -1.8469,  0.3109, -0.6554,
         0.7770,  0.2768,  0.1007,  0.3230,  0.0934, -0.8105,  0.7124, -0.1673,
        -0.6124,  0.9902,  0.4982,  0.6943,  0.8789, -0.4606,  0.2754,  1.0717],
       grad_fn=<AddBackward0>)

In [28]:
#Third tokens weight printed here , similarly you can check for other

And verifying the output against the matrix multiplication computed above.

In [29]:
torch.allclose(context_weighted_embeddings_3, context_weighted_embeddings[3], atol=1e-6)

True

# Another Example

#### 1. Setup and Imports

First, let's import the necessary libraries.

In [30]:
import torch
import torch.nn.functional as F
import math

# For better display in notebooks
from IPython.display import display, Markdown

#### 2. Define Hyperparameters and Input Data

We'll use small, illustrative dimensions and a simple "sequence" of embeddings.

In [31]:
# Define embedding dimension (dk)
EMBEDDING_DIM = 4

# Define sequence length (number of tokens/words)
SEQUENCE_LENGTH = 5

# --- Simulate Input Embeddings (Our "Value" vectors initially) ---
# In a real model, these would come from an embedding layer
# or previous transformer block.
# Let's represent 5 tokens, each with an embedding of 4 dimensions.
# Shape: (sequence_length, embedding_dim)
embedded_tokens = torch.tensor([
    [0.1, 0.2, 0.3, 0.4], # Token 0: "The"
    [0.5, 0.6, 0.7, 0.8], # Token 1: "cat"
    [0.9, 0.0, 0.1, 0.2], # Token 2: "sat"
    [0.3, 0.4, 0.5, 0.6], # Token 3: "on"
    [0.7, 0.8, 0.9, 0.0]  # Token 4: "mat"
], dtype=torch.float32)

display(Markdown(f"**`embedded_tokens` (simulated input/value vectors) Shape:** `{embedded_tokens.shape}`"))
display(embedded_tokens)
print("\n---\n")

# --- Simulate Query and Key Matrices ---
# In self-attention, Q, K, V typically come from linear transformations
# of the same input embeddings. For simplicity, let's just make Q, K same as V for now.
# In a real scenario, you'd have:
# Q = embedded_tokens @ W_q
# K = embedded_tokens @ W_k
# V = embedded_tokens @ W_v
# where W_q, W_k, W_v are weight matrices.

**`embedded_tokens` (simulated input/value vectors) Shape:** `torch.Size([5, 4])`

tensor([[0.1000, 0.2000, 0.3000, 0.4000],
        [0.5000, 0.6000, 0.7000, 0.8000],
        [0.9000, 0.0000, 0.1000, 0.2000],
        [0.3000, 0.4000, 0.5000, 0.6000],
        [0.7000, 0.8000, 0.9000, 0.0000]])


---



In [32]:
# Continue in day 2 exercise , read theory in day 1 theory part

# CAUSAL MASKING

In a standard Transformer encoder's self-attention, each word can attend to all other words in the input sequence, both before and after it. This is great for understanding context in a static sentence.


However, imagine you're training a model to write a story, one word at a time. If, when predicting the third word, the model could "see" the actual fourth, fifth, or sixth words (which haven't been generated yet), it would be "cheating." It wouldn't be learning to predict genuinely based only on what has come before. This is a problem known as **data leakage or look-ahead bias.**


This isn't a problem if all we're doing is creating embeddings (or sequences) based on whole passages of text. It does pose a problem, however, if we're trying to develop a model that can generate new sequences given an initial sequence (or prompt).

- For tasks like text classification or summarization where you have the entire input available upfront, looking at future words is fine and even beneficial.

- But for tasks like predicting the next word in a sequence or machine translation decoding (generating the target sentence word by word), the model must only rely on information up to the current point. It must operate in an **autoregressive manner.**

### The Solution: Causal Masking

"This problem is solved by using causal masking."
Causal masking (also known as **look-ahead masking or masked self-attention)** is a technique applied specifically in the self-attention mechanism of the Transformer's decoder (and decoder-only models like GPT).

###### Causal masking matrices can be constructed to flag which attention weights should be set to zero so that causal relationships between embeddings aren't broken.


- During the calculation of attention scores, a causal mask is applied. This mask is typically an upper triangular matrix (including the diagonal) filled with very large negative numbers (like negative infinity) in the positions corresponding to "future" tokens.

- When softmax is applied to these masked scores, the terms become effectively zero. This forces the attention weights for future tokens to be zero, meaning the current token cannot attend to any subsequent tokens in the sequence.

**For example, when calculating the representation for the first word, it can only attend to itself. When calculating for the second word, it can attend to the first word and itself, but not the third, fourth, or fifth words, and so on.**

In [33]:
#Mask created in below format and then we will apply to the attention score

# CAUSAL MASKING

In [34]:
#This line of code specifically constructs the mask that implements the "causal" or "look-ahead" behavior
causal_mask = torch.triu(torch.full((n_tokens, n_tokens), True), diagonal=1)
causal_mask

#True means "apply the mask/block this information," 
#False means "do not mask/allow this information through."

tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False,  True,  True,  True,  True,  True,  True],
        [False, False, False, False,  True,  True,  True,  True,  True],
        [False, False, False, False, False,  True,  True,  True,  True],
        [False, False, False, False, False, False,  True,  True,  True],
        [False, False, False, False, False, False, False,  True,  True],
        [False, False, False, False, False, False, False, False,  True],
        [False, False, False, False, False, False, False, False, False]])

Apply Mask to the attention score: This causal_mask is precisely the mechanism to prevent the "cheating" we talked about in sequence generation.

In the matrix above, True at (row, col) means that the token at row (as a query) is forbidden from attending to the token at col (as a key).

Since True appears only for respective col and row (i.e., tokens that come after the current query token), this mask ensures that:

When the attention mechanism calculates the context for token 0, it can only use information from token 0 itself (not 1, 2, 3).
When calculating for token 1, it can use information from token 0 and 1, but not 2 or 3.
And so on.

During the attention calculation, this boolean mask is typically used to set the attention scores for the "forbidden" (future) connections to a very large negative number. After softmax, these large negative numbers effectively become zero, meaning the model assigns no attention to future tokens. This enforces the autoregressive property, allowing the model to learn to predict sequences step-by-step based only on past information.



In [35]:
#This step directly implements the "causal" or "look-ahead" constraint. 
# For any token being processed, it prevents its attention mechanism from looking at or gaining information from 
#tokens that appear later in the sequence.

causal_attn_weights = attn_weights.masked_fill(causal_mask, -1e10)
causal_attn_weights[2]


#-1e10 is a very large negative number (e.g., -10,000,000,000). 
#It's used so that when this value goes through the softmax function, 
#it results in an output extremely close to zero, effectively "masking out" or nullifying the attention to that position. 
#This helps out to nullify the vanishing gradiant issue
#Learning rate improves the learning of model, causal masking nullifies the attention to future tokens 
#If you want the model to look everywhere, you make -1e10 very high

tensor([-1.3191e+00, -7.4185e+00,  2.2468e+01, -1.0000e+10, -1.0000e+10,
        -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
       grad_fn=<SelectBackward0>)

In [36]:
causal_attn_weights

tensor([[ 2.6134e+01, -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10,
         -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
        [ 8.3032e+00,  2.9695e+01, -1.0000e+10, -1.0000e+10, -1.0000e+10,
         -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
        [-1.3191e+00, -7.4185e+00,  2.2468e+01, -1.0000e+10, -1.0000e+10,
         -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
        [-4.6427e+00, -3.4016e-01, -2.1691e-01,  2.5551e+01, -1.0000e+10,
         -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
        [ 6.6822e+00, -3.0784e+00, -2.1722e+00,  3.4426e+00,  2.9584e+01,
         -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
        [ 5.4403e+00,  3.4283e+00, -7.2561e-02,  4.8543e+00,  1.6087e+00,
          3.4103e+01, -1.0000e+10, -1.0000e+10, -1.0000e+10],
        [-1.2678e+00,  4.1866e+00, -8.7021e+00,  7.5582e+00,  4.6315e+00,
          4.1000e+00,  2.4831e+01, -1.0000e+10, -1.0000e+10],
        [ 1.9402e+00,  3.2277e+00, -5.5650e+00, 

You have a big grid of numbers called attn_weights. These numbers tell the model how much to pay attention to each word when trying to understand the sentence.

You want to make sure the model only looks at words that came before or at the current word, not words that come after (because the future is unknown). So, you create a "mask" that marks the future words.

Then you use .masked_fill(causal_mask, -1e10) to replace those future words' attention numbers with a really, really tiny number (like negative one billion). This basically tells the model: "Ignore those future words."

The result is causal_attn_weights — the attention numbers but with future words blocked out.

When you write causal_attn_weights[2], you are saying: "Show me the data for the third item in this group." What this “third item” is depends on how your data is organized. It could be:

The third sentence in a batch of sentences,

The third attention head (a smaller attention machine inside the bigger one),

Or something else, depending on your setup.

In [37]:
#And apply scaling and normalisation as before.
causal_attn_weights_norm = F.softmax(
    causal_attn_weights / math.sqrt(EMBEDDING_DIM), dim=1
)
causal_attn_weights_norm.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)

In [38]:
#checked if sums to one

From this we prepare the context aware embeddings

In [45]:
causal_context_weighted_embeddings = torch.matmul(
    causal_attn_weights_norm, embedded_tokens
)
causal_context_weighted_embeddings.shape

torch.Size([9, 32])

In [40]:
#Generate Embeddings again just to align and convert input tokens into vectors --> This is run through neural network.embedding
# Vocabulary Size and Dimension of Embeddings --> you tokenize the sentence
EMBEDDING_DIM = 32

embedding_layer = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
embedded_tokens = embedding_layer(tokenized_sentence)
embedded_tokens.shape

#Torch.size(size of input data, embedding dimension)

torch.Size([9, 32])

In [46]:
#Check embeddings
causal_context_weighted_embeddings[2]

tensor([ 2.3650,  0.3154,  0.2994, -1.1914,  0.1285, -0.3733, -0.5181,  1.3512,
        -1.5681, -1.9776, -0.2771, -0.0908, -0.7488,  0.1672, -0.5036, -1.9478,
         0.6731, -0.4910,  1.1733, -0.4775,  1.0050,  1.2333, -2.0507, -0.4148,
         1.0885,  0.0528, -0.1399, -0.4349, -0.2876,  0.9835,  0.1909,  0.0569],
       grad_fn=<SelectBackward0>)

In [47]:
#Integrity of embeddings
causal_context_weighted_embeddings_3 = (
    causal_attn_weights_norm[3, 0] * embedded_tokens[0]
    + causal_attn_weights_norm[3, 1] * embedded_tokens[1]
    + causal_attn_weights_norm[3, 2] * embedded_tokens[2]
    + causal_attn_weights_norm[3, 3] * embedded_tokens[3]
)

torch.allclose(
    causal_context_weighted_embeddings_3, causal_context_weighted_embeddings[3]
)

True

In [48]:
causal_context_weighted_embeddings_3

tensor([ 0.9243, -0.9740, -0.3974,  0.6696,  1.0455, -1.7275,  0.6280,  0.9299,
         1.2461, -0.2323,  0.0031, -0.5088, -0.8515,  0.0597, -1.6898, -0.7558,
        -0.6331, -0.3503, -1.7318, -1.1988,  0.2001,  2.2014,  0.4385,  1.0417,
         1.3765, -0.1261, -0.2348, -1.1892,  0.3214,  0.8378, -1.0172,  0.0480],
       grad_fn=<AddBackward0>)

# Day 3 - Parameterized Self Attention

## Parametrised Self-Attention

Parametrised Self-Attention refers to a variation of the self-attention mechanism where additional learnable parameters are introduced into the attention computation. This allows the model to adaptively control how it distributes attention across input tokens.

![%7B81956077-0F8C-497A-BDC1-72200A189D94%7D.png](attachment:%7B81956077-0F8C-497A-BDC1-72200A189D94%7D.png)


##### What Makes Parametrised Self-Attention Different?

While standard self-attention already includes parameters (via the Q, K, V projections), the term **"parametrised self-attention" typically refers to further enhancements where:**

- Additional parameters are introduced into the attention score calculation or the softmax operation.

- Learnable biases, temperature scaling factors, or gating mechanisms are included.

- Multi-head attention components may be further parameterised with learned transformations or scaling terms.



### Queries, Keys and Values

In this setup, the values contain the information that we wish to access via a query that is made on a set of keys (that map to the values), such that the context-aware embeddings can now be computed as,

$$
\vec{z_{i}} = \sum_{j=1}^{N}{a_{ij} \times \vec{v_{j}}}
$$

Where, $a_{ij} = q_{i}^{T} \cdot k_{i}$ - i.e., the attention weights now represent the distance between the query and keys.

Very often we only have a single sequence to work with, so the model will have to learn how to infer the queries, keys and values from this. We can enable this level of plasticity by defining three  $N \times N$ weight matrices, $\textbf{U}_{q}$, $\textbf{U}_{k}$ and $\textbf{U}_{v}$.

In [49]:
#Take random Q,K,V tensor values
u_q = torch.rand(n_tokens, n_tokens)
u_k = torch.rand(n_tokens, n_tokens)
u_v = torch.rand(n_tokens, n_tokens)

In [50]:
u_q

tensor([[0.6565, 0.0888, 0.7548, 0.5358, 0.1745, 0.1289, 0.2809, 0.7323, 0.6916],
        [0.9928, 0.2934, 0.5413, 0.2614, 0.4556, 0.2404, 0.7967, 0.8775, 0.9719],
        [0.8782, 0.6002, 0.6184, 0.2610, 0.2850, 0.2901, 0.9675, 0.1707, 0.0308],
        [0.4414, 0.0191, 0.2553, 0.3147, 0.4735, 0.7354, 0.1285, 0.3538, 0.0082],
        [0.1946, 0.4983, 0.9141, 0.3868, 0.6972, 0.8244, 0.1683, 0.8110, 0.0437],
        [0.2777, 0.8364, 0.1628, 0.4824, 0.4077, 0.6239, 0.8953, 0.6745, 0.6595],
        [0.2188, 0.3821, 0.1226, 0.2618, 0.7807, 0.2419, 0.0597, 0.2273, 0.3878],
        [0.8476, 0.0118, 0.6823, 0.7973, 0.4422, 0.6797, 0.6160, 0.1221, 0.8538],
        [0.5712, 0.2830, 0.2578, 0.5347, 0.8326, 0.1532, 0.5764, 0.3492, 0.4376]])

From which we can define the query, keys and values as functions of x

In [53]:
# Create Metrices
q = torch.matmul(u_q, embedded_tokens)
k = torch.matmul(u_k, embedded_tokens)
v = torch.matmul(u_v, embedded_tokens)

q.shape == k.shape == v.shape == embedded_tokens.shape

True

We then recompute our parameterised attention weights using the same steps we used before.

In [55]:
#Compare with previoous one and check what different you have done
attn_weights_param = torch.empty(n_tokens, n_tokens)

for i in range(n_tokens):
    for j in range(n_tokens):
        attn_weights_param[i, j] = torch.dot(q[i], k[j])

attn_weights_param_norm = F.softmax(
    attn_weights_param / math.sqrt(EMBEDDING_DIM), dim=1
)
context_weighted_embeddings_param = torch.matmul(attn_weights_param_norm, v)

context_weighted_embeddings_param.shape

torch.Size([9, 32])

In [56]:
context_weighted_embeddings_param

tensor([[-2.9563e-01, -2.0936e+00, -1.7155e+00, -1.3937e+00, -3.0681e-01,
         -1.8992e+00,  2.0013e-01, -1.6290e+00,  9.8174e-01, -1.9686e+00,
         -1.4531e+00, -6.4580e-01, -2.5751e+00, -4.6332e-01, -1.8140e+00,
         -5.5802e-01, -3.3429e-01, -3.1449e+00, -1.0671e+00,  2.3382e+00,
          5.5324e+00,  1.4283e+00,  1.4317e+00,  1.5982e+00,  2.4086e+00,
         -3.6630e-01,  1.7945e+00, -5.6358e-01, -1.7490e+00,  2.1675e+00,
          1.0573e+00, -1.6814e-02],
        [-5.7416e-01, -2.0261e+00, -1.7455e+00, -1.3427e+00, -8.4654e-02,
         -1.8744e+00,  8.5526e-02, -1.9644e+00,  1.1028e+00, -1.5476e+00,
         -1.4959e+00, -3.4397e-01, -2.7230e+00, -6.2538e-01, -1.9278e+00,
         -5.1260e-01, -3.1722e-01, -2.8826e+00, -1.3861e+00,  2.3536e+00,
          5.7833e+00,  1.5104e+00,  1.5935e+00,  1.6391e+00,  2.2114e+00,
         -4.1484e-01,  1.8567e+00, -2.7050e-01, -1.6071e+00,  1.8552e+00,
          9.6431e-01,  2.3026e-02],
        [-4.0054e-01, -2.1304e+00, -1.76

In [59]:
#Verify if this makes sense
context_weighted_embeddings_param_3 = (
    attn_weights_param_norm[3, 0] * v[0]
    + attn_weights_param_norm[3, 1] * v[1]
    + attn_weights_param_norm[3, 2] * v[2]
    + attn_weights_param_norm[3, 3] * v[3]
    + attn_weights_param_norm[3, 4] * v[4]
    + attn_weights_param_norm[3, 5] * v[5]
    + attn_weights_param_norm[3, 6] * v[6]
    + attn_weights_param_norm[3, 7] * v[7]
    + attn_weights_param_norm[3, 8] * v[8]
    #+ attn_weights_param_norm[3, 9] * v[9]
)

torch.allclose(
    context_weighted_embeddings_param_3, context_weighted_embeddings_param[3]
)

True

### Multi-Head Attention


In what follows we demonstrate how use the parametrised attention mechanism sketched out above to develop the multi-head attention block that forms the foundation of all transformer architectures. Our aim here is purely didactic - the functions defined below won't yield anything you can train (refer to the fullcodebase in the `modelling` directory for this), but they do demonstrate how these algorithm are composed.

We start by encapsulating the parametrised attention mechanism within a single function.

In [62]:
def attention(
    query: torch.Tensor,
    keys: torch.Tensor,
    values: torch.Tensor,
    causal_masking: bool = False,
) -> torch.Tensor:
    """Compute single attention head."""
    n_tokens, embedding_dim = query.shape
    attn_weights = torch.matmul(query, keys.T) / math.sqrt(EMBEDDING_DIM)
    if causal_masking:
        mask = torch.triu(torch.full((n_tokens, n_tokens), True), diagonal=1)
        attn_weights = attn_weights.masked_fill(mask, -1e10)
    attn_weights_norm = attn_weights.softmax(dim=1)
    context_weighted_embeddings = torch.matmul(attn_weights_norm, values)
    return context_weighted_embeddings


attn_head_out = attention(q, k, v)
attn_head_out.shape

torch.Size([9, 32])

In [63]:
#First you defined a single head

In [73]:
def multi_head_attention(
    x_q: torch.Tensor,
    x_k: torch.Tensor,
    x_v: torch.Tensor,
    n_heads: int,
    causal_masking: bool = False,
) -> torch.Tensor:
    """Computing attention with multiple heads."""
    n_tokens, embedding_dim = embedded_tokens.shape
    
    u_q = torch.rand(n_heads, n_tokens, n_tokens)
    u_k = torch.rand(n_heads, n_tokens, n_tokens)
    u_v = torch.rand(n_heads, n_tokens, n_tokens)

    attn_head_outputs = torch.concat(
        [attention(u_q[h] @ x_q, u_k[h] @ x_k, u_v[h] @ x_v) for h in range(n_heads)],
        dim=1,
    )
    print("Number of heads:",n_heads)
    w_out = torch.rand(n_heads * embedding_dim, embedding_dim, requires_grad=True)
    return torch.matmul(attn_head_outputs, w_out)


multi_head_attn_out = multi_head_attention(
    embedded_tokens, embedded_tokens, embedded_tokens, n_heads=3
)
multi_head_attn_out.shape

Number of heads: 3


torch.Size([9, 32])

# Basic Transformer Architecture

## Encoder

In [75]:
def transformer_encoder_layer(
    src_embedding: torch.Tensor, n_heads: int, causal_masking: bool = False
) -> torch.Tensor:
    """Transformer encoder layer."""
    x = multi_head_attention(src_embedding, src_embedding, src_embedding, n_heads)
    x = F.layer_norm(x + src_embedding, x.shape)

    linear_1 = nn.Linear(EMBEDDING_DIM, 2 * EMBEDDING_DIM)
    linear_2 = nn.Linear(2 * EMBEDDING_DIM, EMBEDDING_DIM)

    x = x + F.relu(linear_2(linear_1(x)))

    return x


encoder_output = transformer_encoder_layer(embedded_tokens, n_heads=2)
encoder_output.shape

Number of heads: 2


torch.Size([9, 32])

In [76]:
encoder_output

tensor([[ 8.1203e-01,  1.4891e-01,  1.5372e-01,  1.7760e+00, -1.9895e-01,
         -1.0871e+00,  3.2016e-01,  1.5911e+00,  6.1307e-01, -9.1132e-02,
          1.2693e+00,  2.0609e+00, -1.6255e-01, -9.4107e-01,  8.0400e-01,
         -2.3842e-01, -1.4956e+00, -1.8692e-01,  8.5274e-01, -2.7596e-01,
         -2.3209e-02, -2.0179e-01, -6.9814e-01,  1.0676e+00, -2.7210e-02,
         -2.0933e-01, -2.1900e+00,  1.5520e+00, -3.0699e-01, -2.1589e+00,
          2.8190e-01,  1.1620e+00],
        [ 7.8840e-01,  5.0948e-01,  1.2440e+00,  2.0739e+00,  7.9884e-01,
         -1.1265e+00,  7.4645e-01,  1.9345e+00,  1.2934e+00,  4.0042e-02,
          1.5439e+00,  2.4587e+00,  5.3878e-01, -2.5842e-01,  6.7901e-01,
          1.3712e-01, -7.1563e-01, -8.1934e-01, -6.8352e-02,  8.6518e-01,
          5.2931e-01,  4.3073e-01, -4.7354e-01,  1.2107e+00,  9.2599e-01,
         -4.2979e-01, -1.2033e+00,  1.6822e+00, -4.9745e-01, -1.7844e+00,
          1.0754e+00,  1.7102e+00],
        [ 1.0547e+00, -4.1880e-01,  1.83

In [77]:
def transformer_decoder_layer(
    src_embedding: torch.Tensor,
    target_embedding: torch.Tensor,
    n_heads: int,
    causal_masking: bool = False,
) -> torch.Tensor:
    """Transformer decoder layer."""
    x = multi_head_attention(
        target_embedding, target_embedding, target_embedding, n_heads
    )
    x = F.layer_norm(x + target_embedding, x.shape)
    x = x + multi_head_attention(src_embedding, src_embedding, x, n_heads)
    x = F.layer_norm(x, x.shape)

    linear_1 = nn.Linear(EMBEDDING_DIM, 2 * EMBEDDING_DIM)
    linear_2 = nn.Linear(2 * EMBEDDING_DIM, EMBEDDING_DIM)

    x = x + F.relu(linear_2(linear_1(x)))

    return x


decoder_output = transformer_decoder_layer(embedded_tokens, embedded_tokens, n_heads=2)
decoder_output.shape

Number of heads: 2
Number of heads: 2


torch.Size([9, 32])