In [1]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Toy input sentence; it is split into tokens and run through the attention steps below.
text = "The cat sat quietly"

In [3]:
# Whitespace tokenisation: one token per space-separated word.
tokens = text.split()

# Build the vocabulary over *unique* tokens, in first-seen order, so that IDs
# are contiguous (0 .. len(vocab) - 1). Enumerating the raw token list would
# assign IDs by position; a repeated word would then leave gaps and produce
# IDs >= len(vocab), which is out of range for an embedding table sized
# len(vocab).
vocab = {word: idx for idx, word in enumerate(dict.fromkeys(tokens))}
print("Vocab:", vocab)

Vocab: {'The': 0, 'cat': 1, 'sat': 2, 'quietly': 3}


In [4]:
# Translate every token of the sentence into its integer ID via the vocabulary.
token_ids = list(map(vocab.__getitem__, tokens))
print("Token IDs:", token_ids)

Token IDs: [0, 1, 2, 3]


In [5]:
# Creating an embedding layer

# Fix the RNG state first: nn.Embedding initialises its weights randomly, so
# without a seed every run of the notebook yields different numbers.
torch.manual_seed(42)

vocab_size = len(vocab)   # one embedding row per vocabulary entry
embedding_dim = 3         # length of each token vector
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Embedding lookups need integer indices; stating the dtype explicitly keeps
# this valid even if token_ids is empty (which would otherwise infer float).
input_tensor = torch.tensor(token_ids, dtype=torch.long)

# Row i of `embedded` is the vector for the token at position i of the sentence.
embedded = embedding(input_tensor)
print("Embeddings:\n", embedded)

Embeddings:
 tensor([[-0.7964, -0.6372,  0.8634],
        [ 0.0595,  0.3991,  0.5173],
        [-0.9806, -0.2280,  0.8676],
        [-0.6734,  1.8603,  0.9321]], grad_fn=<EmbeddingBackward0>)


In [9]:
# `embedded` already holds one row per sentence position (it is the result of
# looking up every token ID), so it is the attention input as-is. Indexing it
# again with token IDs would treat vocabulary IDs as sequence positions, which
# is only correct when the two happen to coincide.
input_embeddings = embedded

In [10]:
# Show the embedding matrix that feeds the Q/K/V projections.
input_embeddings

tensor([[-0.7964, -0.6372,  0.8634],
        [ 0.0595,  0.3991,  0.5173],
        [-0.9806, -0.2280,  0.8676],
        [-0.6734,  1.8603,  0.9321]], grad_fn=<StackBackward0>)

In [11]:
# we choose output_dim = 2 for Q, K, V
# The input size is read from the embeddings themselves so the projection
# matrices stay compatible if the embedding dimension is changed.
d_in = input_embeddings.size(-1)
d_out = 2

# Randomly initialised projection matrices, each of shape (d_in, d_out).
W_q = torch.rand((d_in, d_out))
W_k = torch.rand((d_in, d_out))
W_v = torch.rand((d_in, d_out))

# Project every token embedding into query, key and value space.
Q = input_embeddings @ W_q
K = input_embeddings @ W_k
V = input_embeddings @ W_v

In [12]:
# Single-query walkthrough: pick the query vector of "cat" (position 1).
query_index = 1
q_cat = Q[query_index]

# Score the chosen query against every key: one dot product per token.
dot_products = K @ q_cat
print("Dot Products:", dot_products)

Dot Products: tensor([-0.0420,  0.8301,  0.2412,  2.1782], grad_fn=<MvBackward0>)


In [14]:
# Scale the raw scores by sqrt(d_k) before the softmax.
# (`math` is imported in the notebook's top import cell.)
d_k = Q.size(-1)  # size of each query/key vector = 2
scaled_dots = dot_products / math.sqrt(d_k)
print("Scaled Dot Products:", scaled_dots)

Scaled Dot Products: tensor([-0.0297,  0.5869,  0.1705,  1.5402], grad_fn=<DivBackward0>)


In [15]:
# Turn the scaled scores into attention weights: softmax over the only axis
# (one entry per key), so the weights are positive and sum to 1.
# (`F` is imported in the notebook's top import cell.)
attention_weights = F.softmax(scaled_dots, dim=0)
print("Attention Weights:", attention_weights)

Attention Weights: tensor([0.1126, 0.2086, 0.1376, 0.5412], grad_fn=<SoftmaxBackward0>)


In [None]:
# Scaled dot-product attention for all tokens at once (matrix form).

# 1) Similarity of every query with every key -> (4, 4)
dot_products_all = Q @ K.T

# 2) Divide by sqrt of the query/key vector length -> (4, 4)
dk = Q.shape[-1]
scaled_dot_products = dot_products_all / math.sqrt(dk)

# 3) Softmax along each row, giving one weight distribution per query -> (4, 4)
attention_weights_all = F.softmax(scaled_dot_products, dim=1)

# 4) Each output row is the weighted sum of the value vectors:
#    attention_output[i] = sum_j(attention_weights_all[i][j] * V[j]) -> (4, 2)
attention_output = attention_weights_all @ V

# Report the resulting representation of each token.
print("Attention Output Vectors:")
rows = attention_output.tolist()
for i, token in enumerate(tokens):
    print(f"{token}: {rows[i]}")

In [16]:
# Step 5: weight each value vector by its attention weight, then add them up.
# The weights get a trailing axis so they broadcast across V's columns.
weighted_values = attention_weights[:, None] * V
attention_output_cat = weighted_values.sum(dim=0)

In [17]:
# Show the attention output vector computed for the single "cat" query.
attention_output_cat

tensor([-0.2068,  1.6595], grad_fn=<SumBackward1>)

In [18]:
# Summarise the single-query result as plain Python lists.
for label, tensor in (
    ("Attention Weights for 'cat':", attention_weights),
    ("Final Attention Output for 'cat':", attention_output_cat),
):
    print(label, tensor.tolist())

Attention Weights for 'cat': [0.1126038059592247, 0.20862381160259247, 0.1375701129436493, 0.5412022471427917]
Final Attention Output for 'cat': [-0.2068280726671219, 1.6594971418380737]
