# Lab Exercise: Computing Attention Weights and Validating Matrix Multiplication


# Setup
Assume you have n_tokens tokens and each token is embedded into a vector of dimension embedding_dim.


You have a tensor embedded_tokens of shape (n_tokens, embedding_dim) containing these embeddings.

# Task 1: Initialize an empty tensor for attention weights
Use torch.empty() to create an empty tensor of shape (n_tokens, n_tokens) called attn_weights.



# Task 2: Compute attention weights using nested loops
Use two for loops over the indices i and j ranging from 0 to n_tokens - 1.

For each pair (i, j), compute the dot product between the embedding vectors embedded_tokens[i] and embedded_tokens[j].

Store the result in attn_weights[i, j].



# Task 3: Compute attention weights using matrix multiplication
Use the matrix multiplication operator to compute attn_weights_matmul as the product of embedded_tokens and the transpose of embedded_tokens.



# Task 4: Verify that both methods give nearly the same results
Use torch.allclose() to compare attn_weights and attn_weights_matmul within a tolerance (e.g., atol=1e-6).



# Print whether the two matrices are approximately equal or not.

In [1]:
#Setup 

import math

# We use PyTorch for tensor computations; it provides the building blocks for neural networks.
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
# Raw input text used for the rest of the exercise.
sentence = "My name is Sajag mathur. He likes classical music."

tensor([[9.7444e-27, 1.6774e-42, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+0

In [5]:
def tokenize(text: str, vocab_size: int) -> torch.Tensor:
    """Dummy text tokenizer: assign a random id to each whitespace-separated word.

    The ids are drawn with ``torch.randint`` and are not tied to the words
    themselves, so the same word gets a different id on every call. This is
    meant for experimentation only; use a real tokenizer (for example one
    from Hugging Face) when consistent ids are needed.

    Args:
        text: Input text to break into words.
        vocab_size: Exclusive upper bound for the generated token ids.

    Returns:
        A 1-D integer tensor with one random id in ``[0, vocab_size)`` per
        word; it is empty when ``text`` contains no words.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so stray spaces do not produce empty
    # "words" that would be counted as extra tokens.
    words = text.split()
    return torch.randint(0, vocab_size, [len(words)])

# Hyperparameter: the total number of unique token ids the model can
# recognize and process.
VOCAB_SIZE = 20_000

tokenized_sentence = tokenize(sentence, VOCAB_SIZE)
# The tokenizer returns one id per word, so the first dimension is the token count.
n_tokens = tokenized_sentence.shape[0]
# Display the generated token ids (notebook cell output).
tokenized_sentence

tensor([ 1249, 19363, 15081,  9622,  4490, 13983,  3181,   955, 17675])

In [6]:
# Convert each token id into a dense vector by passing the ids through an
# nn.Embedding lookup table with VOCAB_SIZE entries of EMBEDDING_DIM values.
EMBEDDING_DIM = 32

embedding_layer = nn.Embedding(
    num_embeddings=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
)
embedded_tokens = embedding_layer(tokenized_sentence)
embedded_tokens.shape

# Resulting shape: torch.Size([number of input tokens, EMBEDDING_DIM])

torch.Size([9, 32])

In [7]:
# Attention weight (i, j) is the dot product of token i's embedding with
# token j's embedding, computed here one pair at a time.

# torch.empty leaves the storage uninitialized; every entry is overwritten
# by the loops below.
attn_weights = torch.empty(n_tokens, n_tokens)

for i in range(n_tokens):
    row_embedding = embedded_tokens[i]
    for j in range(n_tokens):
        attn_weights[i, j] = row_embedding.dot(embedded_tokens[j])

attn_weights.shape

torch.Size([9, 9])

In [8]:
# Display the attention weights computed with the nested loops.
attn_weights

tensor([[ 3.4985e+01,  1.2610e+01,  2.7600e+00, -2.5685e-01, -2.1084e+00,
          2.9533e+00, -3.7966e+00,  5.4972e+00,  5.6959e+00],
        [ 1.2610e+01,  3.7651e+01,  2.9662e+00, -8.0765e-01, -9.3061e+00,
         -2.9145e+00,  4.3191e+00, -1.8725e+00,  1.0776e+01],
        [ 2.7600e+00,  2.9662e+00,  1.7985e+01,  2.0308e+00, -8.3572e+00,
         -7.7152e-01, -1.1566e+00,  8.1203e+00,  5.8534e-01],
        [-2.5685e-01, -8.0765e-01,  2.0308e+00,  2.7836e+01,  3.3133e-01,
         -6.4917e+00,  4.7592e+00,  7.7281e-01,  3.0164e+00],
        [-2.1084e+00, -9.3061e+00, -8.3572e+00,  3.3133e-01,  3.2199e+01,
         -6.0480e+00,  1.8887e+00, -6.9616e+00,  5.9405e+00],
        [ 2.9533e+00, -2.9145e+00, -7.7152e-01, -6.4917e+00, -6.0480e+00,
          2.3370e+01, -4.6410e+00, -5.3984e+00, -3.2613e-02],
        [-3.7966e+00,  4.3191e+00, -1.1566e+00,  4.7592e+00,  1.8887e+00,
         -4.6410e+00,  3.7193e+01, -5.4652e+00,  9.4490e+00],
        [ 5.4972e+00, -1.8725e+00,  8.1203e+00, 

In [9]:
# Same scores in one step: row i of embedded_tokens times column j of its
# transpose is the dot product of token i's and token j's embeddings.
attn_weights_matmul = embedded_tokens @ embedded_tokens.T

In [10]:
# Display the attention weights computed with matrix multiplication.
attn_weights_matmul

tensor([[ 3.4985e+01,  1.2610e+01,  2.7600e+00, -2.5685e-01, -2.1084e+00,
          2.9533e+00, -3.7966e+00,  5.4972e+00,  5.6959e+00],
        [ 1.2610e+01,  3.7651e+01,  2.9662e+00, -8.0765e-01, -9.3061e+00,
         -2.9145e+00,  4.3191e+00, -1.8725e+00,  1.0776e+01],
        [ 2.7600e+00,  2.9662e+00,  1.7985e+01,  2.0308e+00, -8.3572e+00,
         -7.7152e-01, -1.1566e+00,  8.1203e+00,  5.8534e-01],
        [-2.5685e-01, -8.0765e-01,  2.0308e+00,  2.7836e+01,  3.3133e-01,
         -6.4917e+00,  4.7592e+00,  7.7281e-01,  3.0164e+00],
        [-2.1084e+00, -9.3061e+00, -8.3572e+00,  3.3133e-01,  3.2199e+01,
         -6.0480e+00,  1.8887e+00, -6.9616e+00,  5.9405e+00],
        [ 2.9533e+00, -2.9145e+00, -7.7152e-01, -6.4917e+00, -6.0480e+00,
          2.3370e+01, -4.6410e+00, -5.3984e+00, -3.2613e-02],
        [-3.7966e+00,  4.3191e+00, -1.1566e+00,  4.7592e+00,  1.8887e+00,
         -4.6410e+00,  3.7193e+01, -5.4652e+00,  9.4490e+00],
        [ 5.4972e+00, -1.8725e+00,  8.1203e+00, 

In [11]:
# Check that the loop-based and matmul-based results agree within an
# absolute tolerance of 1e-6 (rtol is left at its default), then report the
# outcome explicitly as the exercise asks.
matrices_match = torch.allclose(attn_weights_matmul, attn_weights, atol=1e-6)
if matrices_match:
    print("The two attention weight matrices are approximately equal.")
else:
    print("The two attention weight matrices are NOT approximately equal.")
# Keep the boolean as the cell's final value so it is still displayed.
matrices_match

True