Self-attention exercise, following the video: https://www.youtube.com/watch?v=wdlp4Sg01Mg&list=WL&index=1&t=120s

In [7]:
# -*- coding: utf-8 -*-
"""
Solve This Challenge to Understand Self-Attention! (Transformers Explained)

This file provides a hands-on challenge to implement a basic self-attention
mechanism step-by-step using PyTorch.

Self-attention allows a model to weigh the importance of different words (or tokens)
in an input sequence when processing a specific word. It looks at other words
in the sequence to get a better representation of the current word.

Follow the comments and fill in the missing code sections marked with # FILL THIS IN.
Run the script after filling each part to check your progress! Good luck!
"""

# Standard library first, then the PyTorch namespaces used by the cells below.
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

# Report the PyTorch version in use.
print(f"Using PyTorch version: {torch.__version__}")

Using PyTorch version: 2.0.0+cu118


In [8]:
# --- Configuration ---
# Small dimensions keep every intermediate tensor easy to print and inspect.

# Seed PyTorch's global RNG so that every later random draw in this notebook
# (the random input tensor and the default nn.Linear weight initialisation)
# is identical on each Restart Kernel -> Run All.
SEED = 42
torch.manual_seed(SEED)

batch_size = 1  # N: how many sequences we process at once (kept at 1 for simplicity)
seq_len = 4     # L: the length of the input sequence (e.g., 4 words)
embed_dim = 8   # E: the dimension of each word embedding vector

# For this basic example the Query, Key and Value vectors have the same size as
# the embedding. In full transformer models these can differ (especially with
# multi-head attention).
d_k = embed_dim  # Dimension of Key and Query vectors
d_v = embed_dim  # Dimension of Value vectors

print("\n--- Setup ---")
print(f"Batch Size (N): {batch_size}")
print(f"Sequence Length (L): {seq_len}")
print(f"Embedding Dimension (E = D_model): {embed_dim}")
print(f"Key/Query Dimension (D_k): {d_k}")
print(f"Value Dimension (D_v): {d_v}")


--- Setup ---
Batch Size (N): 1
Sequence Length (L): 4
Embedding Dimension (E = D_model): 8
Key/Query Dimension (D_k): 8
Value Dimension (D_v): 8


In [9]:
# --- Input Data ---
# Random numbers stand in for word embeddings: one vector of size embed_dim
# per token, laid out as (batch_size, seq_len, embed_dim), i.e. (N, L, E).
x = torch.randn((batch_size, seq_len, embed_dim))

print("\n--- Input ---")
print(f"Input tensor 'x' shape: {x.shape}")
print(f"Input tensor\n : {x}")




--- Input ---
Input tensor 'x' shape: torch.Size([1, 4, 8])
Input tensor
 : tensor([[[ 0.8626, -0.8381, -1.0351, -0.3404,  0.1873,  0.1551, -0.3786,
          -0.4157],
         [-0.4404, -0.9564,  1.3785,  0.6161,  0.5635,  2.6217,  1.0450,
          -2.1747],
         [-0.5136,  0.4162, -0.6192,  1.4304, -1.9323, -0.5244, -1.2391,
          -0.1437],
         [ 1.6148, -0.1665, -1.5250,  1.1092,  1.9637, -0.5266, -0.0132,
          -0.4935]]])


In [10]:
# --- Step 1: Linear Projections for Q, K, V ---
# Self-attention maps the input embeddings into three separate spaces:
#   Query (Q): what the current word is asking for.
#   Key   (K): how relevant each word is when it is being queried.
#   Value (V): the content each word contributes.
# Each projection is a learnable, bias-free linear layer (a plain weight matrix).
# The layers are built in the order W_q, W_k, W_v; Query and Key project to
# d_k features, Value projects to d_v features.
W_q, W_k, W_v = (
    nn.Linear(embed_dim, out_features, bias=False)
    for out_features in (d_k, d_k, d_v)
)

In [11]:
print("\n--- Challenge 1: Calculate Q, K, V ---")
# ---- CHALLENGE 1 START ----
# Task: push the input 'x' of shape (N, L, E) through the three projection
# layers. Q and K come out with shape (N, L, D_k); V with shape (N, L, D_v).
Q, K, V = W_q(x), W_k(x), W_v(x)
# ---- CHALLENGE 1 END ----

# Sanity checks: each projection must exist ...
assert Q is not None, "Challenge 1 incomplete: Q is not calculated."
assert K is not None, "Challenge 1 incomplete: K is not calculated."
assert V is not None, "Challenge 1 incomplete: V is not calculated."

# ... and must have the shape described above.
expected_q_k_shape = (batch_size, seq_len, d_k)
expected_v_shape = (batch_size, seq_len, d_v)
assert Q.shape == expected_q_k_shape, f"Q shape is {Q.shape}, expected {expected_q_k_shape}"
assert K.shape == expected_q_k_shape, f"K shape is {K.shape}, expected {expected_q_k_shape}"
assert V.shape == expected_v_shape, f"V shape is {V.shape}, expected {expected_v_shape}"

print(f"Q shape: {Q.shape} - Correct!")
print(f"K shape: {K.shape} - Correct!")
print(f"V shape: {V.shape} - Correct!")
print("Challenge 1 Completed Successfully!")


--- Challenge 1: Calculate Q, K, V ---
Q shape: torch.Size([1, 4, 8]) - Correct!
K shape: torch.Size([1, 4, 8]) - Correct!
V shape: torch.Size([1, 4, 8]) - Correct!
Challenge 1 Completed Successfully!


In [12]:
# --- Step 2: Calculate Raw Attention Scores ---
# The core idea: How much should each word (represented by Q) pay attention
# to every other word (represented by K)?
# We calculate this using the dot product between each Query vector and all Key vectors.
# Formula part: QK^T

print("\n--- Challenge 2: Calculate Raw Attention Scores (QK^T) ---")
# ---- CHALLENGE 2 START ----
# Task: matrix-multiply the Query (Q) with the transpose of the Key (K).
# Q shape: (N, L, D_k)
# K shape: (N, L, D_k) -> K transposed shape: (N, D_k, L)
# The result has shape (N, L, L): one score for each query position
# attending to each key position.
#
# The transpose swaps the LAST two axes using negative indices instead of the
# hard-coded pair (1, 2). For the 3-D tensors used here the result is the same,
# but the expression stays correct if extra leading dimensions (e.g. a heads
# axis) are added, because '@' batches over all leading dimensions.
attention_scores_raw = Q @ K.transpose(-2, -1)

# ---- CHALLENGE 2 END ----

assert attention_scores_raw is not None, "Challenge 2 incomplete: attention_scores_raw is not calculated."
expected_scores_shape = (batch_size, seq_len, seq_len)
assert attention_scores_raw.shape == expected_scores_shape, f"Raw Attention Scores shape is {attention_scores_raw.shape}, expected {expected_scores_shape}"
print(f"Raw Attention Scores shape: {attention_scores_raw.shape} - Correct!")
# print(f"Raw Scores (example):\n{attention_scores_raw.detach()}") # Optional: uncomment to view scores
print("Challenge 2 Completed Successfully!")


--- Challenge 2: Calculate Raw Attention Scores (QK^T) ---
Raw Attention Scores shape: torch.Size([1, 4, 4]) - Correct!
Challenge 2 Completed Successfully!


In [13]:
# --- Step 3: Scale the Scores ---
# Raw dot products are divided by sqrt(d_k) to keep gradients stable
# during training.
# Formula part: QK^T / sqrt(d_k)

print("\n--- Challenge 3: Scale the Scores ---")
# ---- CHALLENGE 3 START ----
# The scale factor is the square root of the size of K's last dimension
# (the Key/Query dimension); every raw score is divided by it.
scale_factor = K.size(-1) ** 0.5
attention_scores_scaled = attention_scores_raw.div(scale_factor)
# ---- CHALLENGE 3 END ----

# Both results must exist, and scaling must leave the score shape unchanged.
assert scale_factor is not None, "Challenge 3 incomplete: scale_factor is not calculated."
assert attention_scores_scaled is not None, "Challenge 3 incomplete: attention_scores_scaled is not calculated."
assert attention_scores_scaled.shape == expected_scores_shape, f"Scaled Attention Scores shape is {attention_scores_scaled.shape}, expected {expected_scores_shape}"

print(f"Scale factor (sqrt(d_k)): {scale_factor:.2f}")
print(f"Scaled Attention Scores shape: {attention_scores_scaled.shape} - Correct!")
# print(f"Scaled Scores (example):\n{attention_scores_scaled.detach()}") # Optional: uncomment to view scores
print("Challenge 3 Completed Successfully!")



--- Challenge 3: Scale the Scores ---
Scale factor (sqrt(d_k)): 2.83
Scaled Attention Scores shape: torch.Size([1, 4, 4]) - Correct!
Challenge 3 Completed Successfully!


In [14]:
# --- Step 4: Apply Softmax ---
# Convert the scaled scores into probability distributions (attention weights).
# For each query position (each row in the L x L matrix), the weights across
# all key positions (columns) should sum to 1.
# This tells us *how much* attention each query should pay to each key.
# Formula part: softmax(QK^T / sqrt(d_k))

print("\n--- Challenge 4: Apply Softmax ---")
# ---- CHALLENGE 4 START ----
# Task: apply softmax to `attention_scores_scaled` along the *last* dimension
# (dim=-1), so that for each query (row) the weights over the keys (columns)
# sum to 1.
attention_weights = torch.softmax(attention_scores_scaled, dim=-1)

# ---- CHALLENGE 4 END ----

assert attention_weights is not None, "Challenge 4 incomplete: attention_weights is not calculated."
assert attention_weights.shape == expected_scores_shape, f"Attention Weights shape is {attention_weights.shape}, expected {expected_scores_shape}"
print(f"Attention Weights shape: {attention_weights.shape} - Correct!")

# Validate EVERY query row, not only the first one: each row must be a
# probability distribution. torch.allclose's default tolerances absorb
# float32 rounding in the summation.
row_sums = attention_weights.sum(dim=-1)
assert torch.allclose(row_sums, torch.ones_like(row_sums)), f"Attention weights do not sum to 1 for every query; row sums: {row_sums}"

# Report the first query position as a concrete example (should be close to 1.0).
sum_check = attention_weights[0, 0, :].sum().item()
assert math.isclose(sum_check, 1.0, rel_tol=1e-6), f"Weights for first query sum to {sum_check}, expected ~1.0"
print(f"Sum of weights for first query: {sum_check:.4f} - Correct!")
# print(f"Attention Weights (example):\n{attention_weights.detach()}") # Optional: uncomment to view weights
print("Challenge 4 Completed Successfully!")


--- Challenge 4: Apply Softmax ---
Attention Weights shape: torch.Size([1, 4, 4]) - Correct!
Sum of weights for first query: 1.0000 - Correct!
Challenge 4 Completed Successfully!


In [15]:
# --- Step 5: Multiply Weights by Values ---
# The attention weights say how strongly each query focuses on each position.
# Multiplying them with the Value (V) vectors yields, for every query, a
# weighted sum of the Value vectors: positions with a higher weight
# contribute more to the result.
# Formula part: softmax(QK^T / sqrt(d_k)) * V

print("\n--- Challenge 5: Multiply Weights by Values ---")
# ---- CHALLENGE 5 START ----
# attention_weights: (N, L, L)
# V:                 (N, L, D_v)
# Their batched matrix product is the self-attention output, (N, L, D_v).
output = torch.matmul(attention_weights, V)
# ---- CHALLENGE 5 END ----

# The output must exist and share the shape of the Value matrix.
assert output is not None, "Challenge 5 incomplete: output is not calculated."
assert output.shape == expected_v_shape, f"Final Output shape is {output.shape}, expected {expected_v_shape}"

print(f"Final Output shape: {output.shape} - Correct!")
print("Challenge 5 Completed Successfully!")


--- Challenge 5: Multiply Weights by Values ---
Final Output shape: torch.Size([1, 4, 8]) - Correct!
Challenge 5 Completed Successfully!


In [16]:
# --- Final Output ---
# `output` holds the new representation of every input token. Each vector
# (e.g. output[0, i, :]) is a mix of the Value vectors of the whole sequence,
# weighted by the attention weights computed above. In a Transformer this
# tensor would be handed to the following layers.

print("\n--- Attention Output ---")
# Only index into `output` when it has actually been computed.
if output is None:
    print("Final output not calculated yet.")
else:
    # detach() drops the autograd graph; only the values are shown here.
    print(f"Output tensor (example first vector):\n{output[0, 0, :].detach()}") # Use detach() if not training

# Closing message, emitted one line per string.
print(
    "\nCongratulations! If you filled everything correctly and saw 'Completed Successfully!' messages,",
    "you have implemented a basic self-attention mechanism!",
    "This output represents the input sequence where each token's representation",
    "has been updated based on its relevance to other tokens in the sequence.",
    sep="\n",
)


--- Attention Output ---
Output tensor (example first vector):
tensor([-0.2372, -0.2045,  0.1046,  0.1264,  0.1372, -0.6009, -0.0637,  0.2800])

Congratulations! If you filled everything correctly and saw 'Completed Successfully!' messages,
you have implemented a basic self-attention mechanism!
This output represents the input sequence where each token's representation
has been updated based on its relevance to other tokens in the sequence.
