## Self-Attention Mechanism in the Context of a Transformer Model

In [3]:
import numpy as np
import torch
import torch.nn.functional as F

# Scaled dot-product self-attention on random data, printing the shape of
# every intermediate tensor.

# Seed torch's RNG so the random inputs and weights below are identical on
# every "Restart Kernel -> Run All".
SEED = 0
torch.manual_seed(SEED)

# Dimensions
d_k = 64            # size of each query/key vector
d_v = 64            # size of each value vector
seq_len = 10        # number of positions in the input sequence
embedding_dim = 64  # size of each input embedding

# Random input embeddings (demonstration data only), uniform in [0, 1)
x = torch.rand(seq_len, embedding_dim)

# Random projection matrices for queries, keys and values.
# NOTE: all-positive uniform inputs and weights produce large scores, so the
# softmax below can be sharply peaked; that is fine for a shape walkthrough.
W_q = torch.rand(embedding_dim, d_k)
W_k = torch.rand(embedding_dim, d_k)
W_v = torch.rand(embedding_dim, d_v)

# Step 1: project the inputs
queries = x @ W_q  # (seq_len, d_k)
keys = x @ W_k     # (seq_len, d_k)
values = x @ W_v   # (seq_len, d_v)

# Step 2: scaled dot-product scores.
# Entry [i, j] is the dot product of query i with key j.
scores = queries @ keys.T  # (seq_len, seq_len)
# Divide by sqrt(d_k) for stability; a plain Python float keeps NumPy scalars
# out of the torch arithmetic.
scaled_scores = scores / d_k ** 0.5

# Softmax over the last axis turns each row of scores into attention weights.
attention_weights = F.softmax(scaled_scores, dim=-1)  # (seq_len, seq_len)

# Sanity check: every row of attention weights is a probability distribution.
assert torch.allclose(
    attention_weights.sum(dim=-1), torch.ones(seq_len), atol=1e-5
), "attention weights must sum to 1 along the key axis"

# Step 3: each output row is the attention-weighted sum of the value vectors
output = attention_weights @ values  # (seq_len, d_v)

# Display the shapes of intermediate outputs for clarity
print("Queries shape:", queries.shape)
print("Keys shape:", keys.shape)
print("Values shape:", values.shape)
print("Attention scores shape:", scaled_scores.shape)
print("Attention weights shape:", attention_weights.shape)
print("Output shape:", output.shape)


Queries shape: torch.Size([10, 64])
Keys shape: torch.Size([10, 64])
Values shape: torch.Size([10, 64])
Attention scores shape: torch.Size([10, 10])
Attention weights shape: torch.Size([10, 10])
Output shape: torch.Size([10, 64])


## Self-Attention on a Simple Sentence
This code will print:

- The initial word embeddings.
- The queries, keys, and values for each word.
- The attention weights (how much each word focuses on every other word).
- The final output embeddings (contextualized representations) for each word in the sentence.

In [2]:
import numpy as np
import torch
import torch.nn.functional as F

# Self-attention walkthrough on the sentence "The cat chased the mouse".
# Prints the word embeddings, the per-word queries/keys/values, the attention
# weights, and the final contextualized output embeddings.

# Seed BOTH random number generators used below. NumPy draws the embeddings
# and torch draws the projection weights; seeding only NumPy would leave every
# result after the embeddings different on each run.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1. Sentence and Word Embeddings
# Each word gets a random vector standing in for a learned embedding.
words = ["The", "cat", "chased", "the", "mouse"]
embedding_dim = 4  # Dimension of each word embedding (usually much higher, but simplified here)

# Embeddings are numerical representations of objects (text, images, audio)
# produced by ML models; here they are just random vectors.
# Keys are case-sensitive, so "The" and "the" receive separate vectors.
embeddings = {
    word: torch.tensor(np.random.rand(embedding_dim), dtype=torch.float32)
    for word in words
}
print("Word Embeddings:")
for word, emb in embeddings.items():
    print(f"{word}: {emb}")

# Stack embeddings in sentence order to form the input sequence
x = torch.stack([embeddings[word] for word in words])  # Shape: (len(words), embedding_dim)

# 2. Project to Queries, Keys, and Values
# For simplicity, queries, keys, and values all share the embedding dimension.
d_k = d_v = embedding_dim

# Random (seeded) projection matrices
W_q = torch.rand(embedding_dim, d_k)
W_k = torch.rand(embedding_dim, d_k)
W_v = torch.rand(embedding_dim, d_v)

# Linear projections of the inputs
queries = x @ W_q
keys = x @ W_k
values = x @ W_v

print("\nQueries, Keys, and Values:")
for word, q, k, v in zip(words, queries, keys, values):
    print(f"{word} - Query: {q}, Key: {k}, Value: {v}")

# 3. Calculate Attention Scores
# Entry [i, j] is the dot product of word i's query with word j's key,
# i.e. how strongly word i "focuses" on word j before normalisation.
scores = queries @ keys.T  # Shape: (len(words), len(words))

# Scale by sqrt(d_k) for stability; a plain Python float keeps NumPy scalars
# out of the torch arithmetic.
scaled_scores = scores / d_k ** 0.5

# 4. Apply Softmax to Get Attention Weights
# Softmax over the last axis turns each row of scores into probabilities.
attention_weights = F.softmax(scaled_scores, dim=-1)  # Shape: (len(words), len(words))

# Sanity check: each word's attention weights form a probability distribution.
assert torch.allclose(
    attention_weights.sum(dim=-1), torch.ones(len(words)), atol=1e-5
), "attention weights must sum to 1 for every word"

print("\nAttention Weights (focus on each word):")
for word, weights in zip(words, attention_weights):
    print(f"{word}: {weights}")

# 5. Calculate the Output as a Weighted Sum of Values
# Each word's output embedding mixes all value vectors by its attention weights.
output = attention_weights @ values  # Shape: (len(words), d_v)

print("\nOutput Embeddings (contextualized representation of each word):")
for word, out_vec in zip(words, output):
    print(f"{word}: {out_vec}")


Word Embeddings:
The: tensor([0.5488, 0.7152, 0.6028, 0.5449])
cat: tensor([0.4237, 0.6459, 0.4376, 0.8918])
chased: tensor([0.9637, 0.3834, 0.7917, 0.5289])
the: tensor([0.5680, 0.9256, 0.0710, 0.0871])
mouse: tensor([0.0202, 0.8326, 0.7782, 0.8700])

Queries, Keys, and Values:
The - Query: tensor([1.5508, 1.0782, 0.9614, 1.2697]), Key: tensor([1.4139, 0.7521, 0.7219, 0.9764]), Value: tensor([1.2509, 0.6133, 0.6430, 1.2622])
cat - Query: tensor([1.5947, 0.9295, 1.1165, 1.4145]), Key: tensor([1.3193, 0.6156, 0.5840, 1.1054]), Value: tensor([1.0767, 0.6502, 0.6013, 1.0473])
chased - Query: tensor([1.8072, 1.1587, 1.0088, 1.0838]), Key: tensor([1.4285, 0.9095, 0.9033, 0.8424]), Value: tensor([1.1663, 0.6591, 0.8125, 1.5585])
the - Query: tensor([0.7328, 1.1613, 0.3807, 1.1279]), Key: tensor([1.0649, 0.2019, 0.4302, 0.8388]), Value: tensor([1.0087, 0.4440, 0.2790, 0.8554])
mouse - Query: tensor([1.7788, 0.8101, 1.2817, 1.4500]), Key: tensor([1.5394, 0.9606, 0.6920, 1.1284]), Value: tensor