# **Implementing Self-attention with trainable weights**

In [2]:
import torch

In [4]:
# The example sentence, split into one token per word.
words = "Your journey starts with one step".split()

# One 3-dimensional embedding per token: row i holds the vector for words[i].
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1: Your
    [0.55, 0.87, 0.66],  # x^2: journey
    [0.57, 0.85, 0.64],  # x^3: starts
    [0.22, 0.58, 0.33],  # x^4: with
    [0.77, 0.25, 0.10],  # x^5: one
    [0.05, 0.80, 0.55],  # x^6: step
])

In [5]:
# Show the shape of the embedding matrix: (number of tokens, embedding size).
inputs.shape

torch.Size([6, 3])

In [6]:
# Dimensions of the projection: d_in is read off the input matrix, d_out is
# chosen by hand.
d_in = inputs.size(1)  # length of each token embedding (columns of `inputs`)
d_out = 2              # length of each projected query/key/value vector

# Second token of the sentence (index 1, "journey"), used for the
# single-token worked example.
x_2 = inputs[1]

In [None]:
# Fix the RNG seed so the random weight matrices below are reproducible.
torch.manual_seed(123)

# Each projection matrix has shape (d_in, d_out). The row count must equal the
# input embedding size (3 here) so that `x @ W` is defined; the column count
# d_out is a free choice (2 here).
# The generator is consumed left to right, so the matrices are drawn from the
# RNG in the order query, key, value. requires_grad=False turns off gradient
# tracking for these parameters.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)

In [8]:
# Inspect the query projection matrix, shape (d_in, d_out).
W_query

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])

In [9]:
# Inspect the key projection matrix, shape (d_in, d_out).
W_key

Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])

In [10]:
# Inspect the value projection matrix, shape (d_in, d_out).
W_value

Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])

In [11]:
# Project only the second token, x_2, into query, key and value space.
# x_2 is a 1-D vector of length d_in, so each product with a (d_in, d_out)
# matrix is a 1-D vector of length d_out: (3,) @ (3, 2) -> (2,).
query_2 = torch.matmul(x_2, W_query)
key_2 = torch.matmul(x_2, W_key)
value_2 = torch.matmul(x_2, W_value)

# Display all three projections together.
query_2, key_2, value_2

(tensor([0.4306, 1.4551]), tensor([0.4433, 1.1419]), tensor([0.3951, 1.0037]))

In [12]:
# Project every token at once: one matrix product per projection,
# (6, 3) @ (3, 2) -> (6, 2), giving one query/key/value row per token.
queries = inputs.matmul(W_query)
keys = inputs.matmul(W_key)
values = inputs.matmul(W_value)

# Display the three result shapes together.
tuple(projected.shape for projected in (queries, keys, values))

(torch.Size([6, 2]), torch.Size([6, 2]), torch.Size([6, 2]))