Reference: https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html

In [1]:
import torch
import torch.nn.functional as F

In [2]:
### Creating a dict of word to index

In [21]:
sentence = "Life is short, eat dessert first"

# Drop the comma, split on whitespace, then number the words in sorted order
# so that every word maps to an integer index.
words = sorted(sentence.replace(",", "").split())
dc = dict(zip(words, range(len(words))))

print(dc)

{'Life': 0, 'dessert': 1, 'eat': 2, 'first': 3, 'is': 4, 'short': 5}


In [4]:
### Converting the sentence into a tensor of word indices

In [5]:
# Tokenise the sentence the same way the dict was built (comma removed,
# whitespace split) and look up the integer index of every word, in order.
tokens = sentence.replace(",", "").split()
sentence_int = torch.tensor([dc[token] for token in tokens])
print(sentence_int)

tensor([0, 4, 5, 2, 1, 3])


In [6]:
### Next we will use an embedding layer to encode the inputs into real-valued vector embeddings
# Here we are using 16-dim embeddings so that each input word is represented by a 16-dim vector

In [7]:
# Fix the RNG so the randomly initialised embedding table is reproducible.
torch.manual_seed(123)

# The embedding table needs one row per possible index. Derive that from the
# word-to-index dict instead of hard-coding 6: the largest index + 1 is always
# sufficient, even if a repeated word leaves gaps in the index range.
vocab_size = max(dc.values()) + 1
embedding_dim = 16  # every word index is mapped to a 16-dim vector

embed = torch.nn.Embedding(vocab_size, embedding_dim)

# Look up one row per word of the sentence; detach() returns a tensor that is
# no longer tracked by autograd, since only the values are needed below.
embedded_sentence = embed(sentence_int).detach()

print(embedded_sentence)
print(embedded_sentence.shape)

tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  0.3486,  0.6603, -0.2196, -0.3792,
          0.7671, -1.1925,  0.6984, -1.4097,  0.1794,  1.8951,  0.4954,  0.2692],
        [ 0.5146,  0.9938, -0.2587, -1.0826, -0.0444,  1.6236, -2.3229,  1.0878,
          0.6716,  0.6933, -0.9487, -0.0765, -0.1526,  0.1167,  0.4403, -1.4465],
        [ 0.2553, -0.5496,  1.0042,  0.8272, -0.3948,  0.4892, -0.2168, -1.7472,
         -1.6025, -1.0764,  0.9031, -0.7218, -0.5951, -0.7112,  0.6230, -1.3729],
        [-1.3250,  0.1784, -2.1338,  1.0524, -0.3885, -0.9343, -0.4991, -1.0867,
          0.8805,  1.5542,  0.6266, -0.1755,  0.0983, -0.0935,  0.2662, -0.5850],
        [-0.0770, -1.0205, -0.1690,  0.9178,  1.5810,  1.3010,  1.2753, -0.2010,
          0.4965, -1.5723,  0.9666, -1.1481, -1.1589,  0.3255, -0.6315, -2.8400],
        [ 0.8768,  1.6221, -1.4779,  1.1331, -1.2203,  1.3139,  1.0533,  0.1388,
          2.2473, -0.8036, -0.2808,  0.7697, -0.6596, -0.7979,  0.1838,  0.2293]])
torch.Size([6, 16])


In [8]:
### Defining the weight matrix
"""
Self-attention utilizes 3 weight matrices - Uq, Uk and Uv, which are adjusted as model params during training.
These matrices project the input to query, key and value components of the sequence.

The respective query, key and value sequences are obtained via matrix multiplication between the weight matrices U and the embedded input
x:

Query sequence : q(i) = Uq x(i) for i in [1, T], where T is the length of the input sequence
Key sequence :   k(i) = Uk x(i) for i in [1, T]
Value sequence : v(i) = Uv x(i) for i in [1, T]

q(i) and k(i) are vectors of dimension d(k), so Uq and Uk have the dimension - d(k) * d
v(i) is a vector of dimension d(v), so Uv has the dimension - d(v) * d
For simplicity let us take - d(k) = d(v) = d
where d represents the size of each word vector, which is 16 above.
"""

'\nSelf-attention utilizes 3 weight matrices - Uq, Uk and Uv, which are adjusted as model params during training.\nThese matrices project the input to query, key and value components of the sequence.\n\nThe respective query, key and value sequences are obtained via matrix multiplication between the weight matrices U and the embedded input\nx:\n\nQuery sequence : q(i) = Uq x(i) for i in [1, T], where T is the length of the input sequence\nKey sequence :   k(i) = Uk x(i) for i in [1, T]\nValue sequence : v(i) = Uv x(i) for i in [1, T]\n\nq(i) and k(i) are vectors of dimension d(k), so Uq and Uk have the dimension - d(k) * d\nv(i) is a vector of dimension d(v), so Uv has the dimension - d(v) * d\nFor simplicity let us take - d(k) = d(v) = d\nwhere d represents the size of each word vector, which is 16 above.\n'

In [9]:
# Re-seed so the three projection matrices are the same on every run.
torch.manual_seed(123)

# d is the width of one word embedding (second dimension of embedded_sentence).
d = embedded_sentence.size(1)

# Draw the d x d projection matrices in a fixed order -- query, key, value --
# so each one receives the same random values regardless of how it is written.
U_query, U_key, U_value = (torch.rand(d, d) for _ in range(3))

In [10]:
### Computing the query, key and value vectors for the second input element

In [11]:
# Row 1 of the embedded sentence is the second word (0-based indexing).
x_2 = embedded_sentence[1]

# Project that single embedding with each of the three weight matrices.
query_2 = U_query @ x_2
key_2 = U_key @ x_2
value_2 = U_value @ x_2

In [12]:
# Show the shape of each projected vector, one per line.
print(query_2.shape, key_2.shape, value_2.shape, sep="\n")

torch.Size([16])
torch.Size([16])
torch.Size([16])


In [13]:
"""
Generalizing the above to the entire input
"""

'\nGeneralizing the above to the entire input\n'

In [14]:
# Project every word at once: multiply the weight matrix by the transposed
# sentence (one column per word), then transpose back to one row per word.
keys = torch.matmul(U_key, embedded_sentence.t()).t()
values = torch.matmul(U_value, embedded_sentence.t()).t()

In [15]:
# Show the shapes of the stacked key and value matrices, one per line.
print(keys.shape, values.shape, sep="\n")

torch.Size([6, 16])
torch.Size([6, 16])


In [16]:
### Computing the unnormalized attention weights - w
"""
q(2).T . k(1) -> w 2,1
q(2).T . k(2) -> w 2,2
"""

'\nq(2).T . k(1) -> w 2,1\nq(2).T . k(2) -> w 2,2\n'

In [17]:
# Entry i is the dot product of the second word's query with the key of word i.
omega_2 = torch.matmul(query_2, keys.t())
print(omega_2)

tensor([-1.1880, -1.3225,  6.3126, -6.8690, -5.6129, -6.2390])


In [18]:
### Computing the Attention Scores
"""
a(2,i) = softmax(w(2,i) / sqrt(d(k)))
"""

'\na(2,i) = softmax(w(2,i) / sqrt(d(k)))\n'

In [19]:
# `F` (torch.nn.functional) is imported in the notebook's top import cell
# rather than here, so all dependencies are visible in one place.

# Scale the raw scores by sqrt(d) -- d doubles as the key dimension in this
# notebook -- and normalise them with a softmax over the sequence positions
# (dim=0 is the only dimension of omega_2).
attention_weights_2 = F.softmax(omega_2 / d ** 0.5, dim=0)
print(attention_weights_2)

tensor([0.1070, 0.1035, 0.6980, 0.0259, 0.0354, 0.0303])


In [20]:
# Print a completion marker as the last step of the notebook.
print("Done")

Done
