In [1]:
sentence = 'If something is humanly possible, it is attainable by you too'

# Tokenise: drop the comma, then split on whitespace.
words = sentence.replace(',', '').split()

# Vocabulary: every word -> integer id, assigned by position in the sorted
# word list. "is" appears twice in that list, so its later position (5)
# overwrites the earlier one and id 4 stays unused.
# NOTE: the name `dict` shadows the Python builtin; it is kept because the
# following cell looks words up through this name.
dict = {word: idx for idx, word in enumerate(sorted(words))}

print(dict)

{'If': 0, 'attainable': 1, 'by': 2, 'humanly': 3, 'is': 5, 'it': 6, 'possible': 7, 'something': 8, 'too': 9, 'you': 10}


In [2]:
import torch 
# Look up the integer id of every token, keeping the original word order.
token_ids = [dict[word] for word in sentence.replace(',', '').split()]
sentence_int = torch.tensor(token_ids)

print(sentence_int)  # a tensor of integer IDs

tensor([ 0,  8,  5,  3,  7,  6,  5,  1,  2, 10,  9])


In [3]:
vocab_size = 50000

# Seed first so the randomly initialised embedding table is reproducible.
torch.manual_seed(123)
embed = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=3)

# An embedding layer acts like a look-up table: each word id selects one row
# (a dense vector). detach() returns the vectors cut off from the autograd graph.
embedded_sentence = embed(sentence_int).detach()

print(embedded_sentence)
print(embedded_sentence.shape)

tensor([[ 0.3374, -0.1778, -0.3035],
        [ 0.4965, -1.5723,  0.9666],
        [ 0.2692, -0.0770, -1.0205],
        [-1.1925,  0.6984, -1.4097],
        [ 1.3010,  1.2753, -0.2010],
        [-0.1690,  0.9178,  1.5810],
        [ 0.2692, -0.0770, -1.0205],
        [-0.5880,  0.3486,  0.6603],
        [-0.2196, -0.3792,  0.7671],
        [-0.6315, -2.8400, -1.3250],
        [-1.1481, -1.1589,  0.3255]])
torch.Size([11, 3])


In [4]:
torch.manual_seed(123)

# Input embedding dimension (number of columns of the embedded sentence).
d = embedded_sentence.size(1)

# Output dimensions of the projections. Queries and keys share a size because
# they are dot-multiplied with each other; values may use a different size.
dq = dk = 2
dv = 4

# Learnable projection matrices, drawn uniformly from [0, 1).
# The creation order (query, key, value) fixes which random numbers each gets.
Wquery = torch.nn.Parameter(torch.rand((d, dq)))
Wkey = torch.nn.Parameter(torch.rand((d, dk)))
Wvalue = torch.nn.Parameter(torch.rand((d, dv)))


In [5]:
# Show the three projection matrices in query, key, value order.
for weight_matrix in (Wquery, Wkey, Wvalue):
    print(weight_matrix)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]], requires_grad=True)
Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]], requires_grad=True)
Parameter containing:
tensor([[0.0756, 0.1966, 0.3164, 0.4017],
        [0.1186, 0.8274, 0.3821, 0.6605],
        [0.8536, 0.5932, 0.6367, 0.9826]], requires_grad=True)


In [6]:
# Project the embedding of a single word (row index 1, i.e. the second word)
# into its query, key and value vectors.
x_2 = embedded_sentence[1]
query_2 = torch.matmul(x_2, Wquery)
key_2 = torch.matmul(x_2, Wkey)
value_2 = torch.matmul(x_2, Wvalue)

for projected in (query_2, key_2, value_2):
    print(projected.shape)

torch.Size([2])
torch.Size([2])
torch.Size([4])


In [7]:
# Same projection, now applied to every word of the sentence at once:
# one key row and one value row per word.
keys = torch.matmul(embedded_sentence, Wkey)
values = torch.matmul(embedded_sentence, Wvalue)

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

keys.shape: torch.Size([11, 2])
values.shape: torch.Size([11, 4])


In [8]:
# omega (w): the unnormalized attention weight / attention score, i.e. a
# similarity measure between a query and a key. Here: the query of the second
# word against the key at row index 4.
omega_24 = torch.dot(query_2, keys[4])
print(omega_24)

tensor(-0.0514, grad_fn=<DotBackward0>)


In [9]:
# Attention scores of a single word (the second word) against all words in
# the sentence: one dot product between its query and every key.
omega_2 = torch.matmul(query_2, keys.T)
print(omega_2)

tensor([ 0.0111, -0.0196,  0.0447,  0.0782, -0.0514, -0.0944,  0.0447, -0.0267,
        -0.0226,  0.1473,  0.0390], grad_fn=<SqueezeBackward4>)


In [10]:
import torch.nn.functional as F

# Scale the attention scores (omega) by sqrt(dk), then let softmax convert
# them into a probability distribution over the words of the sentence.
scaled_omega_2 = omega_2 / dk ** 0.5
attention_weights_2 = F.softmax(scaled_omega_2, dim=0)

print(attention_weights_2)

tensor([0.0907, 0.0887, 0.0928, 0.0951, 0.0867, 0.0841, 0.0928, 0.0883, 0.0885,
        0.0998, 0.0925], grad_fn=<SoftmaxBackward0>)


In [11]:
# Context vector of the second word: the value vectors of all words, summed
# with the attention weights as coefficients.
context_vector_2 = torch.matmul(attention_weights_2, values)

print(context_vector_2.shape)
print(context_vector_2)

torch.Size([4])
tensor([-0.1538, -0.3559, -0.2394, -0.3801], grad_fn=<SqueezeBackward4>)


In [12]:
# wrapping the entire SelfAttention mechanism in a class

import torch.nn as nn

class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with learnable projections.

    Args:
        d_in: size of each input embedding vector.
        d_out_qk: size of the query and key projections. They share one size
            because queries are dot-multiplied with keys.
        d_out_v: size of the value projection, and therefore of every
            returned context vector.
    """

    def __init__(self, d_in, d_out_qk, d_out_v):
        super().__init__()
        self.d_out_qk = d_out_qk
        # Weights are drawn uniformly from [0, 1). The creation order
        # (query, key, value) decides which random numbers each matrix
        # receives under a fixed seed, so it must not be reordered.
        self.Wquery = nn.Parameter(torch.rand(d_in, d_out_qk))
        self.Wkey = nn.Parameter(torch.rand(d_in, d_out_qk))
        self.Wvalue = nn.Parameter(torch.rand(d_in, d_out_v))

    def forward(self, x):
        """Compute one context vector per input token.

        Args:
            x: tensor of shape (seq_len, d_in), or batched as
                (..., seq_len, d_in).

        Returns:
            Tensor of shape (seq_len, d_out_v), or (..., seq_len, d_out_v)
            for batched input.
        """
        keys = x @ self.Wkey
        queries = x @ self.Wquery
        values = x @ self.Wvalue

        # Swap only the last two dimensions. `.T` reverses *all* dimensions,
        # which is correct for 2-D input only and breaks batched input.
        attn_scores = queries @ keys.transpose(-2, -1)  # unnormalised attention weights
        # Scale by sqrt(d_out_qk), then normalise each row over the keys.
        attn_weights = torch.softmax(attn_scores / self.d_out_qk**0.5, dim=-1)

        context_vector = attn_weights @ values
        return context_vector
        
        

In [13]:
# Same seed and the same sizes as the manual walkthrough, so the module's
# three weight matrices receive the same random values as Wquery/Wkey/Wvalue.
torch.manual_seed(123)
d_in, d_out_qk, d_out_v = 3, 2, 4

sa = SelfAttention(d_in=d_in, d_out_qk=d_out_qk, d_out_v=d_out_v)
context_vectors = sa(embedded_sentence)

# look at the second row: it exactly matches the context vector computed by hand previously
print(context_vectors)

tensor([[-0.2361, -0.5167, -0.3465, -0.5528],
        [-0.1538, -0.3559, -0.2394, -0.3801],
        [-0.6448, -1.3095, -0.8819, -1.4126],
        [-1.0799, -2.2338, -1.4854, -2.3864],
        [ 0.7037,  0.9007,  0.6967,  1.1096],
        [ 0.8501,  1.0934,  0.8401,  1.3405],
        [-0.6448, -1.3095, -0.8819, -1.4126],
        [ 0.1525,  0.1379,  0.1183,  0.1896],
        [ 0.0212, -0.0659, -0.0313, -0.0484],
        [-1.5055, -3.2335, -2.1128, -3.4056],
        [-1.0046, -2.0491, -1.3719, -2.2012]], grad_fn=<MmBackward0>)
