In [1]:
import torch

In [2]:
# your journey starts with one step
torch.manual_seed(123)
inputs = torch.rand((6,3))

In [3]:
# First way
# here x_2 is called the query and we do a dot operation
x_2 = inputs[1] # query
print(f"shape of x_2: {x_2.shape}")
attention_scores1 = torch.empty(len(inputs))
for i, inp in enumerate(inputs):
    attention_scores1[i] = (x_2 @ inp)

shape of x_2: torch.Size([3])


In [4]:
attention_scores1

tensor([0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023])

In [5]:
# Second way
# something that I learned is the dot method below, 
# Computes the dot product of two 1D tensors. and not 2D or more
# for that we can use @ , as you can see below in cell 7
x_2 = inputs[1]
attention_scores2 = torch.empty(len(inputs))
for i, inp in enumerate(inputs):
    attention_scores2[i] = torch.dot(x_2,inp)

In [6]:
attention_scores2

tensor([0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023])

In [7]:
# Third way
attention_scores3= inputs @ x_2
print(attention_scores3)

tensor([0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023])


In [8]:
inputs[0]

tensor([0.2961, 0.5166, 0.2517])

In [9]:
# dot product between 1d tensor
print(f"inputs[0]: {inputs[0]}")
print(f"x_2: {x_2}")
res = 0
for idx, val in enumerate(inputs[0]):
    res += inputs[0][idx] * x_2[idx] # query
print(res)

inputs[0]: tensor([0.2961, 0.5166, 0.2517])
x_2: tensor([0.6886, 0.0740, 0.8665])
tensor(0.4602)


In [10]:
inputs[0].shape

torch.Size([3])

In [11]:
x_2.shape

torch.Size([3])

In [12]:
inputs@ x_2 

tensor([0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023])

In [13]:
inputs.shape

torch.Size([6, 3])

In [14]:
# normalisation of attention scores converts them to attention weights
# at this point they sum to 1 and help in training stability
attention_weights = attention_scores3 / attention_scores3.sum()
print(f"attention scores: {attention_weights}")
# to show that the tensor now sums to 1, we can sum the full tensor
print(f"attention scores sum to one: {attention_weights.sum()}")

attention scores: tensor([0.1043, 0.2788, 0.0592, 0.2535, 0.0772, 0.2271])
attention scores sum to one: 1.0


In [15]:
# this is a full computation of attention scores
attn_scores = inputs @ inputs.T
attn_scores

tensor([[0.4179, 0.4602, 0.1397, 0.5509, 0.2036, 0.3884],
        [0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023],
        [0.1397, 0.2611, 0.0630, 0.2580, 0.0887, 0.2193],
        [0.5509, 1.1189, 0.2580, 1.0992, 0.3343, 0.8977],
        [0.2036, 0.3408, 0.0887, 0.3343, 0.1445, 0.3155],
        [0.3884, 1.0023, 0.2193, 0.8977, 0.3155, 0.8600]])

The following examination tries to understand why dim=0 or dim=1 resulted in the same attention weights. It was found that since we are calculating a matrix product of `inputs` and `inputs transpose` this results in a symmetrical matrix.
Now when we further calculate why the symmetrical matrix generates the same attention weights no matter how we sum it. I found that its primarily dependant on the sum of each row or column, which in this case generates the same ouput vector, see cells 28,29.

In [16]:
# manually calculating softmax
print(inputs[0])
print(torch.exp(inputs[0])/torch.exp(inputs[0]).sum(dim=0))


tensor([0.2961, 0.5166, 0.2517])
tensor([0.3122, 0.3892, 0.2986])


In [17]:
def softmax_naive(x,dim=0):
    return torch.exp(x)/torch.exp(x).sum(dim=dim)

In [18]:
attention_weights_naive_soft = softmax_naive(attention_scores3)

In [19]:
print(attention_weights_naive_soft)

tensor([0.1174, 0.2536, 0.0962, 0.2268, 0.1042, 0.2019])


In [20]:
attention_weights_naive_soft_inputs1 = softmax_naive(attn_scores,dim=1)
print(attention_weights_naive_soft_inputs1)

tensor([[0.1748, 0.1174, 0.1609, 0.1340, 0.1603, 0.1268],
        [0.1824, 0.2536, 0.1817, 0.2365, 0.1838, 0.2342],
        [0.1324, 0.0962, 0.1490, 0.1000, 0.1429, 0.1070],
        [0.1997, 0.2268, 0.1811, 0.2319, 0.1827, 0.2110],
        [0.1411, 0.1042, 0.1529, 0.1079, 0.1511, 0.1179],
        [0.1697, 0.2019, 0.1743, 0.1896, 0.1793, 0.2032]])


In [21]:
attention_weights_naive_soft_inputs1.shape

torch.Size([6, 6])

In [22]:
print(attention_weights_naive_soft_inputs1[:, 0])
print(attention_weights_naive_soft_inputs1[:, 0].sum())

tensor([0.1748, 0.1824, 0.1324, 0.1997, 0.1411, 0.1697])
tensor(1.0000)


In [23]:
attention_weights_naive_soft_inputs2 = softmax_naive(attn_scores,dim=0)

In [24]:
print(attention_weights_naive_soft_inputs2[:, 0])
print(attention_weights_naive_soft_inputs2[:, 0].sum())

tensor([0.1748, 0.1824, 0.1324, 0.1997, 0.1411, 0.1697])
tensor(1.0000)


In [25]:
torch.exp(attn_scores)/torch.exp(attn_scores).sum(dim=1)

tensor([[0.1748, 0.1174, 0.1609, 0.1340, 0.1603, 0.1268],
        [0.1824, 0.2536, 0.1817, 0.2365, 0.1838, 0.2342],
        [0.1324, 0.0962, 0.1490, 0.1000, 0.1429, 0.1070],
        [0.1997, 0.2268, 0.1811, 0.2319, 0.1827, 0.2110],
        [0.1411, 0.1042, 0.1529, 0.1079, 0.1511, 0.1179],
        [0.1697, 0.2019, 0.1743, 0.1896, 0.1793, 0.2032]])

In [26]:
torch.exp(attn_scores)/torch.exp(attn_scores).sum(dim=0)

tensor([[0.1748, 0.1174, 0.1609, 0.1340, 0.1603, 0.1268],
        [0.1824, 0.2536, 0.1817, 0.2365, 0.1838, 0.2342],
        [0.1324, 0.0962, 0.1490, 0.1000, 0.1429, 0.1070],
        [0.1997, 0.2268, 0.1811, 0.2319, 0.1827, 0.2110],
        [0.1411, 0.1042, 0.1529, 0.1079, 0.1511, 0.1179],
        [0.1697, 0.2019, 0.1743, 0.1896, 0.1793, 0.2032]])

In [27]:
print(attn_scores)

tensor([[0.4179, 0.4602, 0.1397, 0.5509, 0.2036, 0.3884],
        [0.4602, 1.2304, 0.2611, 1.1189, 0.3408, 1.0023],
        [0.1397, 0.2611, 0.0630, 0.2580, 0.0887, 0.2193],
        [0.5509, 1.1189, 0.2580, 1.0992, 0.3343, 0.8977],
        [0.2036, 0.3408, 0.0887, 0.3343, 0.1445, 0.3155],
        [0.3884, 1.0023, 0.2193, 0.8977, 0.3155, 0.8600]])


In [28]:
torch.exp(attn_scores).sum(dim=1)

tensor([ 8.6883, 13.4977,  7.1457, 12.9435,  7.6481, 11.6328])

In [29]:

torch.exp(attn_scores).sum(dim=0)

tensor([ 8.6883, 13.4977,  7.1457, 12.9435,  7.6481, 11.6328])

In following 4 cells, I try to compute the attention weight for the first index, so as to understand the process at the more fundamental level, This builds the intuition.

In [30]:
torch.exp(attn_scores)

tensor([[1.5187, 1.5844, 1.1499, 1.7348, 1.2258, 1.4747],
        [1.5844, 3.4227, 1.2984, 3.0615, 1.4061, 2.7247],
        [1.1499, 1.2984, 1.0651, 1.2943, 1.0928, 1.2452],
        [1.7348, 3.0615, 1.2943, 3.0018, 1.3970, 2.4540],
        [1.2258, 1.4061, 1.0928, 1.3970, 1.1555, 1.3709],
        [1.4747, 2.7247, 1.2452, 2.4540, 1.3709, 2.3632]])

In [31]:
round(1.5187/8.688,4)

0.1748

In [32]:
round(1.5844/8.6883,4)

0.1824

In [33]:
round(1.5844/13.4997,4)

0.1174

Now lets use softmax function available within PyTorch, this softmax function is numerically stable, with better implementation when it encounters low values and large values.

In [34]:
attention_weights1 = torch.softmax(attention_scores3,dim=0)
print(attention_weights1)

tensor([0.1174, 0.2536, 0.0962, 0.2268, 0.1042, 0.2019])


In [35]:
attention_weights2 = torch.softmax(attn_scores,dim=0)
print(attention_weights2)

tensor([[0.1748, 0.1174, 0.1609, 0.1340, 0.1603, 0.1268],
        [0.1824, 0.2536, 0.1817, 0.2365, 0.1838, 0.2342],
        [0.1324, 0.0962, 0.1490, 0.1000, 0.1429, 0.1070],
        [0.1997, 0.2268, 0.1811, 0.2319, 0.1827, 0.2110],
        [0.1411, 0.1042, 0.1529, 0.1079, 0.1511, 0.1179],
        [0.1697, 0.2019, 0.1743, 0.1896, 0.1793, 0.2032]])


Now to get the context vectors, we need to consider the inputs and attention weights calculated uptil now.

In [36]:
print(attention_weights2.shape)
print(inputs.shape)
attention_weights2 @ inputs

torch.Size([6, 6])
torch.Size([6, 3])


tensor([[0.3150, 0.2043, 0.4230],
        [0.5332, 0.2701, 0.7136],
        [0.2522, 0.1631, 0.3466],
        [0.5071, 0.2725, 0.6718],
        [0.2716, 0.1740, 0.3734],
        [0.4460, 0.2396, 0.6048]])

In [37]:
attention_weights1 @ inputs

tensor([0.4762, 0.2052, 0.6228])

In [38]:
# lets do it on a loop
context_vec = torch.zeros(inputs[0].shape)
for idx, x_i in enumerate(inputs):
        context_vec += attention_weights1[idx] * x_i
        

In [39]:
context_vec

tensor([0.4762, 0.2052, 0.6228])