In [1]:
import torch

In [2]:
# Display the installed PyTorch version so the outputs below can be tied to it.
torch.__version__

'2.7.0'

In [3]:
# Toy input sequence: six tokens, each already embedded as a 3-dimensional vector.
# One row per token, so the resulting tensor has shape (6, 3).
inputs = torch.tensor(
    [
        [0.43, 0.15, 0.89],  # Your     (x^1)
        [0.55, 0.87, 0.66],  # journey  (x^2)
        [0.57, 0.85, 0.64],  # starts   (x^3)
        [0.22, 0.58, 0.33],  # with     (x^4)
        [0.77, 0.25, 0.10],  # one      (x^5)
        [0.05, 0.80, 0.55],  # step     (x^6)
    ]
)
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])

## Attention Scores and Weights for a Single Query

In [4]:
# Pick the second token ("journey", x^2) as the query; indexing is 0-based.
query = inputs[1]
# Show the query alongside its shape and the shape of the full input matrix.
query, query.shape, inputs.shape

(tensor([0.5500, 0.8700, 0.6600]), torch.Size([3]), torch.Size([6, 3]))

In [6]:
# Unnormalised attention scores: dot product of the query with every input row.
attn_scores_2 = torch.empty(len(inputs))
for i in range(len(inputs)):
    x_i = inputs[i]  # i-th row of the input matrix
    print(f'{x_i=}')
    # Both operands are 1-D vectors, so no transpose is needed for the dot product.
    attn_scores_2[i] = x_i.dot(query)

attn_scores_2

x_i=tensor([0.4300, 0.1500, 0.8900])
x_i=tensor([0.5500, 0.8700, 0.6600])
x_i=tensor([0.5700, 0.8500, 0.6400])
x_i=tensor([0.2200, 0.5800, 0.3300])
x_i=tensor([0.7700, 0.2500, 0.1000])
x_i=tensor([0.0500, 0.8000, 0.5500])


tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

In [9]:
# Same scores, written as an explicit x_i^T q matrix product to practise shapes.
# A 1-D tensor has no row/column orientation, so `x_i.T` would be a no-op here;
# PyTorch also deprecates `.T` on tensors that are not 2-D. Give both vectors
# explicit 2-D shapes instead.
query_col = query.unsqueeze(-1)  # (3,) -> (3, 1) column vector
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    x_i_row = x_i.unsqueeze(0)  # (3,) -> (1, 3), i.e. x_i^T
    attn_scores_2[i] = (x_i_row @ query_col).squeeze()  # (1, 1) -> scalar

attn_scores_2

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

In [11]:
# All six scores in a single matrix-vector product: (6, 3) @ (3,) -> (6,).
torch.matmul(inputs, query)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

In [12]:
# Naive normalisation: divide each score by the total so the weights sum to 1.
attn_weights_2_tmp = attn_scores_2 / torch.sum(attn_scores_2)
# Display the weights together with their sum as a sanity check.
attn_weights_2_tmp, torch.sum(attn_weights_2_tmp)

(tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]), tensor(1.0000))

In [9]:
# Preferred normalisation: softmax copes better with extreme score values and
# has more desirable gradient properties during training.
attn_weights_2 = attn_scores_2.softmax(dim=0)
# Display the weights together with their sum as a sanity check.
attn_weights_2, torch.sum(attn_weights_2)

(tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581]), tensor(1.))

In [10]:
# Efficient version: one matrix-vector product for the scores, then softmax over them.
_attn_scores_2 = torch.matmul(inputs, query)
torch.softmax(_attn_scores_2, dim=0)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])

## A Single Context Vector

In [11]:
# Context vector for the query: the attention-weighted sum of all input rows.
context_vec_2 = torch.zeros(query.shape)
for i in range(inputs.shape[0]):
    context_vec_2 += attn_weights_2[i] * inputs[i]

context_vec_2

tensor([0.4419, 0.6515, 0.5683])

In [12]:
# Repeats the weighted sum from the previous cell, pairing each weight with its row directly.
context_vec_2 = torch.zeros(query.shape)
for weight_i, x_i in zip(attn_weights_2, inputs):
    context_vec_2 += weight_i * x_i

context_vec_2

tensor([0.4419, 0.6515, 0.5683])

In [13]:
# Compare shapes before vectorising: the weights, a single input row, and the full input matrix.
attn_weights_2.shape, inputs[0].shape, inputs.shape

(torch.Size([6]), torch.Size([3]), torch.Size([6, 3]))

In [14]:
# unsqueeze(-1) appends a trailing axis, turning the (6,) weights into a (6, 1) column.
# Docs: https://pytorch.org/docs/stable/generated/torch.unsqueeze.html
weights_col = attn_weights_2.unsqueeze(-1)
weights_col, weights_col.shape, inputs.shape

(tensor([[0.1385],
         [0.2379],
         [0.2333],
         [0.1240],
         [0.1082],
         [0.1581]]),
 torch.Size([6, 1]),
 torch.Size([6, 3]))

In [15]:
# Vectorised context vector: transpose the (6, 1) weight column into a (1, 6) row and
# multiply by the (6, 3) inputs; this is the loop's weighted sum, returned with shape (1, 3).
torch.matmul(attn_weights_2.unsqueeze(-1).T, inputs)

tensor([[0.4419, 0.6515, 0.5683]])