### Simple calculation of attention scores

In [1]:
import torch
# Toy embeddings: one 3-dimensional row per token of the sentence
# "Your journey starts with one step".
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1: Your
    [0.55, 0.87, 0.66],  # x^2: journey
    [0.57, 0.85, 0.64],  # x^3: starts
    [0.22, 0.58, 0.33],  # x^4: with
    [0.77, 0.25, 0.10],  # x^5: one
    [0.05, 0.80, 0.55],  # x^6: step
])

### Just for `journey`

In [2]:
# Use the second token ("journey") as the query.
query = inputs[1]
# Raw attention scores: dot product of the query with every input token.
attention_score = torch.stack([torch.dot(query, token) for token in inputs])
# Sum of the raw scores
print(attention_score.sum())
# Normalize by the total so the weights add up to 1. This is done for
# training stability; in practice softmax is the usual choice.
score_total = attention_score.sum()
attention_score_norm = attention_score / score_total
print(attention_score_norm)
# Sum of the normalized scores
print(attention_score_norm.sum())

tensor(6.5617)
tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
tensor(1.0000)


In [3]:
def softmax_naive(x):
    """Textbook softmax along dimension 0: exp(x) divided by the sum of exp(x).

    No max-shift is applied before exponentiating, so this is the plain
    ("naive") formulation.
    """
    exponentials = torch.exp(x)
    return exponentials / exponentials.sum(dim=0)
# Softmax is the better normalization: it copes with extreme values
# and guarantees that every weight is positive.
atte_score_norm = softmax_naive(attention_score)
atte_score_norm

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])

In [4]:
# Context vector for the query token: the attention-weighted sum of all inputs.
# The accumulator must start at zero. torch.empty returns uninitialized
# memory, so accumulating into it with += can produce arbitrary values.
context_vector = torch.zeros(inputs.shape[1])
for i, x_i in enumerate(inputs):
    # Scale each input by its attention weight and add it to the running sum.
    context_vector += atte_score_norm[i] * x_i
context_vector

tensor([0.4419, 0.6515, 0.5683])

### Attention and context calculation for all tokens (naive loops)

In [5]:
## Normalized attention weights
# att[idx, i] holds the dot product of token idx (as query) with token i.
# torch.empty is safe here because every entry is assigned in the loops below.
att = torch.empty((inputs.shape[0], inputs.shape[0]))
for idx, query in enumerate(inputs):
    for i, x_i in enumerate(inputs):
        att[idx, i] = query @ x_i
att_norm = torch.softmax(att, dim=1)  # normalize each row so it sums to 1
## Context vectors
# The accumulator must start at zero. torch.empty returns uninitialized
# memory, so accumulating into it with += can produce arbitrary values.
context = torch.zeros(inputs.shape)
for idx, a_n in enumerate(att_norm):
    for i, x_i in enumerate(inputs):
        # Row idx is the attention-weighted sum of all input tokens.
        context[idx] += a_n[i] * x_i
context[1]

tensor([0.4419, 0.6515, 0.5683])

### Attention and context calculation for all tokens (matrix form)

In [6]:
# All pairwise dot products in a single matrix multiplication.
att = torch.matmul(inputs, inputs.T)
# Row-wise softmax turns the scores into attention weights.
att_norm = att.softmax(dim=1)
# Each context row is the attention-weighted sum of the input rows.
context = torch.matmul(att_norm, inputs)
context[1]

tensor([0.4419, 0.6515, 0.5683])

### Adding trainable parameters to self-attention

>from the book:
>
>**Weight parameters vs. attention weights**
>
In the weight matrices W, the term “weight” is short for “weight parameters,” the values
of a neural network that are optimized during training. This is not to be confused
with the attention weights. As we already saw, attention weights determine the extent
to which a context vector depends on the different parts of the input (i.e., to what
extent the network focuses on different parts of the input).
In summary, weight parameters are the fundamental, learned coefficients that define
the network’s connections, while attention weights are dynamic, context-specific values.

In [7]:
torch.manual_seed(123)
x_2 = inputs[1]  # second input token ("journey")
d_in = inputs.size(1)  # 3
d_out = 2
# Three random d_in x d_out projection matrices with gradients disabled.
# They are drawn in the order query, key, value.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)

# Project the second token into query space.
query_2 = torch.matmul(x_2, W_query)
print(query_2)

tensor([0.4306, 1.4551])


In [8]:
# Project every input token into key space and value space.
key = torch.matmul(inputs, W_key)
value = torch.matmul(inputs, W_value)

# Attention score of "journey" against its own key only (key[1]).
att_score_22 = torch.dot(query_2, key[1])
att_score_22

tensor(1.8524)

In [14]:
# Attention scores of the `journey` query against every key.
att_score_2 = torch.matmul(query_2, key.T)
# Scale by the square root of the key dimension before the softmax.
d_k = key.size(-1)
attn_weights_2 = (att_score_2 / d_k ** 0.5).softmax(dim=-1)
# Context vector: attention-weighted sum of the value vectors.
context_vec2 = torch.matmul(attn_weights_2, value)
context_vec2

tensor([0.3061, 0.8210])