In [1]:
import torch
import torch.nn as nn
import numpy

import warnings 
warnings.filterwarnings("ignore")

torch.manual_seed(123)

<torch._C.Generator at 0x107d4bf50>

In [2]:
in_tensor = torch.rand(5, 5)
in_tensor

tensor([[0.2961, 0.5166, 0.2517, 0.6886, 0.0740],
        [0.8665, 0.1366, 0.1025, 0.1841, 0.7264],
        [0.3153, 0.6871, 0.0756, 0.1966, 0.3164],
        [0.4017, 0.1186, 0.8274, 0.3821, 0.6605],
        [0.8536, 0.5932, 0.6367, 0.9826, 0.2745]])

In [3]:
#attn_score
attn_score = in_tensor @ in_tensor
# norm_attn = attn_score/attn_score.sum(dim=0)
# norm_attn

In [4]:
attn_weight = torch.softmax(attn_score, dim = -1)
attn_weight

tensor([[0.2361, 0.1532, 0.1950, 0.1802, 0.2355],
        [0.2166, 0.1937, 0.1693, 0.2997, 0.1207],
        [0.2752, 0.1601, 0.1601, 0.2004, 0.2042],
        [0.2214, 0.2280, 0.1661, 0.2343, 0.1501],
        [0.2399, 0.1678, 0.1804, 0.2108, 0.2011]])

In [5]:
context_vecs = attn_weight @ in_tensor
context_vecs

tensor([[0.5375, 0.4379, 0.3889, 0.5294, 0.3741],
        [0.5088, 0.3618, 0.4120, 0.4512, 0.4414],
        [0.5255, 0.4189, 0.3936, 0.5277, 0.3757],
        [0.5378, 0.3765, 0.3811, 0.4641, 0.4305],
        [0.5296, 0.4151, 0.3937, 0.5097, 0.3911]])

### Attention with  Trainable weights

In [6]:
x_2 = in_tensor[1]
d_in = in_tensor.shape[1]
d_out = 2

In [7]:
#the w, q, v matrices are initialized as parameter of the attention module

Q = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
K = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
V = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

#setting require_grad = False because not training, if training set as True

print(Q.shape)
print(K.shape)
print(V.shape)

torch.Size([5, 2])
torch.Size([5, 2])
torch.Size([5, 2])


In [8]:
query = in_tensor @ Q
key = in_tensor @ K
value = in_tensor @ V

print(f"Query : {query}")
print(f"Key : {key}")
print(f"Value : {value}")

Query : tensor([[1.2085, 1.3420],
        [1.3403, 1.2556],
        [1.1682, 1.2064],
        [1.1469, 1.8641],
        [2.0150, 2.3172]])
Key : tensor([[0.6785, 0.5486],
        [1.2294, 0.5470],
        [0.7758, 0.3918],
        [1.3894, 0.8629],
        [1.4164, 0.9855]])
Value : tensor([[1.1248, 1.2415],
        [1.2242, 0.9943],
        [0.9601, 1.0932],
        [1.6329, 1.3659],
        [2.0849, 2.0791]])


In [9]:
attn_score = query @ key.T
attn_score

tensor([[1.5562, 2.2198, 1.4633, 2.8371, 3.0344],
        [1.5982, 2.3345, 1.5317, 2.9456, 3.1358],
        [1.4544, 2.0960, 1.3789, 2.6640, 2.8436],
        [1.8008, 2.4295, 1.6200, 3.2019, 3.4615],
        [2.6384, 3.7446, 2.4710, 4.7990, 5.1377]])

In [10]:
#calculating dimension of key to normalize the vector before calculating softmax

# reason for normalizing (dividing by key dimension): Improve the training performance, by avoiding small gradients. Let's say when scaling up the embedding dim
# which is typically greater in LLM ~1000 like in GPT, the dot product can result in very small gradients during backprop because of the softmax applied to them.

dim = key.shape[-1]
attn_weight2 = torch.softmax(attn_score/(dim ** 0.5), dim=-1)
attn_weight2

tensor([[0.1130, 0.1806, 0.1058, 0.2794, 0.3213],
        [0.1087, 0.1830, 0.1037, 0.2819, 0.3225],
        [0.1170, 0.1842, 0.1109, 0.2753, 0.3125],
        [0.1067, 0.1665, 0.0939, 0.2875, 0.3454],
        [0.0688, 0.1504, 0.0611, 0.3170, 0.4027]])

In [12]:
context_vec = attn_weight2 @ value
context_vec

tensor([[1.5758, 1.4850],
        [1.5789, 1.4861],
        [1.5648, 1.4755],
        [1.6036, 1.5115],
        [1.6774, 1.5720]])