## ----------------------------Self-Attention Mechanism Trainable Weights--------------------------------

## KEY NOTES

- In this tutorial the weights are going to shift the inputs from the third dimension to the second dimension(R3 --> R2)
- We are going to follow the rules of linear algebra --- kama hujui hii we iza tu but ni easy but its a good background

## Creating the input = token embeddings

In [2]:
# Creating the token embeddings - Randomized
import torch
output_dim = 3

inputs = torch.tensor([
    [0.43, 0.15, 0.89], # Your    # X1
    [0.55, 0.87, 0.66], # journey # x2
    [0.57, 0.85, 0.64], # begins  # X3
    [0.22, 0.58, 0.33], # with    # X4
    [0.77, 0.25, 0.10], # one     # X5
    [0.05, 0.80, 0.55] # step     # X6
])  
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])

## Initialization the key, query and value weights

In [3]:
# Lets check the shape of the inputs

print(f"The inputs has shape = {inputs.shape}")
print("The weights will have a shape of ([3, 2])")
print("The outputs key, value, query vectors shape will be ([6, 3])")

The inputs has shape = torch.Size([6, 3])
The weights will have a shape of ([3, 2])
The outputs key, value, query vectors shape will be ([6, 3])


In [4]:
# Getting the shape of the weight matrices

dimension_in = inputs.shape[-1] 
dimension_out = 2
dimension_in, dimension_out

(3, 2)

- **Note1:** In GPT-Like models the input and output dimensions are usually same
- But for me i want to test the transformation between different dimensional spaces
- **Note2:** We are setting the requires gradients to False because we are not plannning on optimizing the values
- Backpropagation, my friend it is you who might seeing this repo am telling me i already know, is when we calculate the loss and modify the values so that the values converge to zero (minimize loss)

In [5]:
# Initializing the random weights matrices

# Setting the random seed
random_seed = 42
torch.manual_seed(random_seed)
# Initialization
w_query = torch.nn.Parameter(torch.rand(dimension_in, dimension_out), requires_grad=False)
w_key = torch.nn.Parameter(torch.rand(dimension_in, dimension_out), requires_grad=False)
w_value = torch.nn.Parameter(torch.rand(dimension_in, dimension_out), requires_grad=False)
# Checking an example and its shape -  must be viable for matrix mulitiplication with the inputs
w_query.shape, w_query

(torch.Size([3, 2]),
 Parameter containing:
 tensor([[0.8823, 0.9150],
         [0.3829, 0.9593],
         [0.3904, 0.6009]]))

## Calculating Key, Query and Value matrices - matrix multiplication

matrice = inputs * weight_matrix

In [6]:
# Calculating the key, query vector for an individual token

token = inputs[0]

query_v_token = token @ w_query
key_v_token = token @ w_key
value_v_token = token @ w_value

In [7]:
# Performing the matrix multiplications
query_matrix = inputs @ w_query
key_matrix = inputs @ w_key
value_matrix = inputs @ w_value
# Checking the results using one example
query_matrix, query_matrix.shape

(tensor([[0.7843, 1.0721],
         [1.0760, 1.7344],
         [1.0782, 1.7215],
         [0.5450, 0.9560],
         [0.8141, 1.0045],
         [0.5652, 1.1437]]),
 torch.Size([6, 2]))

In [8]:
value_matrix

tensor([[1.2731, 0.8193],
        [1.7073, 1.0646],
        [1.6922, 1.0559],
        [0.9133, 0.5633],
        [0.9433, 0.6019],
        [1.1233, 0.6876]])

In [9]:
# Testing if the complete and individual coincide

query_v_token == query_matrix[0]

tensor([True, True])

## Obtaining attention score

In [10]:
# Let each row represent the query vector for a token

query_matrix

tensor([[0.7843, 1.0721],
        [1.0760, 1.7344],
        [1.0782, 1.7215],
        [0.5450, 0.9560],
        [0.8141, 1.0045],
        [0.5652, 1.1437]])

In [16]:
# Let each row represent the key vector for each token

key_matrix, key_matrix.shape

(tensor([[1.0832, 0.8895],
         [1.5764, 0.9441],
         [1.5440, 0.9455],
         [0.9105, 0.4477],
         [0.5262, 0.7038],
         [1.2795, 0.4727]]),
 torch.Size([6, 2]))

In [17]:
query_matrix[1].shape, key_matrix.T.shape

(torch.Size([2]), torch.Size([2, 6]))

In [28]:
# Lets get the attention score for the second query
token = query_matrix[1]
attention_score_query2 = token @ key_matrix.T
attention_score_query2

tensor([2.7084, 3.3338, 3.3013, 1.7563, 1.7869, 2.1966])

In [29]:
# What the above tensor means
#2.7084 - how much the first token attends to the second token(query)
#3.3338 - how much the second token attends to the query
#and so on so forth

In [30]:
# Lets get the attention scores for all the queries

all_attention_scores = query_matrix @ key_matrix.T
all_attention_scores

tensor([[1.8033, 2.2486, 2.2247, 1.1941, 1.1672, 1.5103],
        [2.7084, 3.3338, 3.3013, 1.7563, 1.7869, 2.1966],
        [2.6993, 3.3251, 3.2925, 1.7525, 1.7789, 2.1933],
        [1.4408, 1.7618, 1.7454, 0.9243, 0.9596, 1.1492],
        [1.7754, 2.2317, 2.2067, 1.1910, 1.1353, 1.5164],
        [1.6295, 1.9707, 1.9539, 1.0266, 1.1023, 1.2637]])

In [31]:
# Each row shows how the other keys relate to the query just like the one token attention score but now for all tokens not just the second

In [27]:
# This is the similar to the above formula just longer

b = torch.empty([6, 6])
for i, x_i in enumerate(query_matrix):
    for j, x_j in enumerate(key_matrix):
        b[i, j] = torch.dot(x_i, x_j)
b == all_attention_scores

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True]])

## Scaled dot product attention

We scale the attention scores by dividing them by the squareroot of the embedding dimension of the keys

## Reasons for scaling

1. Stability in learning - softmax is sensitive to the maginitude of the inputs in that when the inputs are large the difference btwn the exponential values of each input becomes much more pronounce and the highest values receives almost all the probability mass
2. Reduce the values - if the dot product between the query and keys becomes to large it results in a sharp softmax distribution making the model overly confident in one particular key this can make learning unstable

In [49]:
# Stability learning example

x = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])
a = torch.softmax(x, dim=-1)
y = x * 8
b = torch.softmax(y, dim=-1)
a, b

(tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872]),
 tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000]))

## But why squareroot(dimension)

This is to make the variance of the dot product stable

The variance of the dot product of query and key increases because multiplying two random numbers increases variance

The variance increases with dimension 

Dividing the product / sqrt(dimension) keeps the variance close to 1

In [57]:
import numpy as np

def getVariance(dim, num_trials=1000):
    dot_products = []
    scaled_dot_products = []
    for _ in range(num_trials):
        q = np.random.rand(dim)
        k = np.random.rand(dim)
        dot_product = np.dot(q, k)
        scaled_dot_product = dot_product / np.sqrt(dim)
        dot_products.append(dot_product)
        scaled_dot_products.append(scaled_dot_product)
    scaled_var = np.var(scaled_dot_products)
    unscaled_var = np.var(dot_products)
    return scaled_var, unscaled_var

variance_scaled_5, variance_unscaled_5 = getVariance(5)
variance_scaled_20, variance_unscaled_20 = getVariance(20)
print(f"Dimensions = 5 \t: variance after = {variance_scaled_5} variance before = {variance_unscaled_5}")
print(f"Dimensions = 20 : variance after = {variance_scaled_20} variance before = {variance_unscaled_20}")

Dimensions = 5 	: variance after = 0.0457792164085009 variance before = 0.22889608204250453
Dimensions = 20 : variance after = 0.05038295061271449 variance before = 1.0076590122542899


In [39]:
# Getting the dimension of the keys

d_k = key_matrix.shape[-1]
d_k

2

In [44]:
# Scaling the attention scores

scaled_attention_scores = all_attention_scores / d_k**0.5
scaled_attention_scores

tensor([[1.2751, 1.5900, 1.5731, 0.8444, 0.8254, 1.0679],
        [1.9152, 2.3574, 2.3344, 1.2419, 1.2635, 1.5533],
        [1.9087, 2.3512, 2.3281, 1.2392, 1.2579, 1.5509],
        [1.0188, 1.2457, 1.2342, 0.6536, 0.6785, 0.8126],
        [1.2554, 1.5781, 1.5604, 0.8422, 0.8028, 1.0723],
        [1.1523, 1.3935, 1.3816, 0.7259, 0.7794, 0.8936]])

## Attention Weights - Normalizing all attention scores

In [45]:
scaled_attention_scores.shape

torch.Size([6, 6])

In [46]:
attention_weights = torch.softmax(scaled_attention_scores, dim=-1)
attention_weights

tensor([[0.1719, 0.2355, 0.2315, 0.1117, 0.1096, 0.1397],
        [0.1723, 0.2681, 0.2620, 0.0879, 0.0898, 0.1200],
        [0.1721, 0.2679, 0.2618, 0.0881, 0.0898, 0.1203],
        [0.1750, 0.2196, 0.2171, 0.1215, 0.1245, 0.1424],
        [0.1704, 0.2353, 0.2312, 0.1127, 0.1084, 0.1419],
        [0.1772, 0.2255, 0.2228, 0.1157, 0.1220, 0.1368]])

In [47]:
# Testing using the first row
b = 0
for i in attention_weights:
    for j in i:
        b += j
    break    
b

tensor(1.)

## Computing the Context Vector

We compute it as a weighted sum over the value vectors

context vector = attention weights matrix * values matrix

In [61]:
# Computing the full context vector matrix

all_context_vector = attention_weights @ value_matrix
all_context_vector

tensor([[1.3751, 0.8610],
        [1.4201, 0.8892],
        [1.4198, 0.8890],
        [1.3533, 0.8476],
        [1.3746, 0.8606],
        [1.3620, 0.8532]])

In [62]:
# Lets compute for the second input token only

token_attention_weight = attention_weights[1]
context_vec_2 = token_attention_weight @ value_matrix
context_vec_2 == all_context_vector[1]

tensor([True, True])

## Compacting Self attention mechanism into a Pytorch class

In [68]:
from torch import nn
# Defining the class
class SelfAttentionV1(nn.Module):
    # Creating the initializer
    def __init__(self, d_in, d_out):
        # Inheritance
        super().__init__()
        # Initializer the 3 key weight matrices
        self.w_query = torch.nn.Parameter(torch.rand(d_in, d_out))
        self.w_key = torch.nn.Parameter(torch.rand(d_in, d_out))
        self.w_value = torch.nn.Parameter(torch.rand(d_in, d_out))
    # A forward pass for the inputs
    def forward(self, x):
        # Computing the query key and values
        keys = x @ self.w_key 
        values = x @ self.w_value
        queries = x @ self.w_query
        # Computing the attention scores
        attn_score = queries @ keys.T
        # Computing the attention weights
        attn_weights = torch.softmax(
            attn_score / keys.shape[-1]**0.5,
            dim = -1
        )
        # Computing the context vector
        context_vec = attention_weights @ values
        # Returning the context vector
        return context_vec

torch.manual_seed(42)
# Initialization
self_v1 = SelfAttentionV1(d_in=3, d_out=2)
context_vectors = self_v1(inputs)
context_vectors

tensor([[1.3751, 0.8610],
        [1.4201, 0.8892],
        [1.4198, 0.8890],
        [1.3533, 0.8476],
        [1.3746, 0.8606],
        [1.3620, 0.8532]], grad_fn=<MmBackward0>)

## Linear layers improvement

- We can improve further the self attention version 1 by replacing nn.Parameter with nn.Linear which effectively perform matrix multiplication when the bias units are disabled

- Additionally, a significant advantage of use of nn.Linear instead of nn.Parameter(torch.rand()..) is that it produces optimized weights enhancing more stable learning

In [70]:
from torch import nn
# Defining the class
class SelfAttentionV2(nn.Module):
    # Creating the initializer
    def __init__(self, d_in, d_out, bias_units=False):
        # Inheritance
        super().__init__()
        # Initializer the 3 key weight matrices
        self.w_query = torch.nn.Linear(d_in, d_out, bias=bias_units)
        self.w_key = torch.nn.Linear(d_in, d_out, bias=bias_units)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=bias_units)
    # A forward pass for the inputs
    def forward(self, x):
        # Computing the query key and values
        keys = x @ self.w_key 
        values = x @ self.w_value
        queries = x @ self.w_query
        # Computing the attention scores
        attn_score = queries @ keys.T
        # Computing the attention weights
        attn_weights = torch.softmax(
            attn_score / keys.shape[-1]**0.5,
            dim = -1
        )
        # Computing the context vector
        context_vec = attn_weights @ values
        # Returning the context vector
        return context_vec

torch.manual_seed(42)
# Initialization
self_v2 = SelfAttentionV2(d_in=3, d_out=2)
context_vectors = self_v1(inputs)
context_vectors

tensor([[1.3751, 0.8610],
        [1.4201, 0.8892],
        [1.4198, 0.8890],
        [1.3533, 0.8476],
        [1.3746, 0.8606],
        [1.3620, 0.8532]], grad_fn=<MmBackward0>)