<div style="background-color:#ffecd2; color:#355c7d; text-align:center; padding:15px; font-size:25px; border-radius:25px; "> Self Attention Mechanism</div>

In [2]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

    
#A The second input element

#B The input embedding size, d=3


#C The output embedding size, d_out=2

Note that in GPT-like models, the input and output dimensions are usually the same. 

But for illustration purposes, to better follow the computation, we choose different input (d_in=3)
and output (d_out=2) dimensions here.

In [3]:
x_2 = inputs[1] #A
d_in  = inputs.shape[1]#B
d_out = 2 #C

In [4]:
torch.manual_seed(123)

W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False) # requies_grad = False to reduce clutter
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False) # 
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [5]:
print(W_query)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])


In [6]:
print(W_key)

Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])


In [7]:
print(W_value)

Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [10]:
query_2 = x_2@W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value

print(query_2) # d out -2 

tensor([0.4306, 1.4551])


In [13]:
keys = inputs @ W_key
values = inputs @ W_value
query = inputs @ W_query

print(f"Shape of Keys matrix :{keys.shape}")
print(f"Keys matrix :{keys}")

print(f"Shape of Values matrix :{values.shape}")
print(f"Values matrix :{values}")

print(f"Shape of Query matrix :{query.shape}")
print(f"Query matrix :{query}")



Shape of Keys matrix :torch.Size([6, 2])
Keys matrix :tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])
Shape of Values matrix :torch.Size([6, 2])
Values matrix :tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]])
Shape of Query matrix :torch.Size([6, 2])
Query matrix :tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])


In [17]:
keys_2 = keys[1]
attn_score_22 = query_2.dot(keys_2)
print(attn_score_22)

tensor(1.8524)


In [18]:
attn_scores_2 = query_2 @ keys.T
print(attn_scores_2)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [21]:
attn_scores = query @ keys.T
print(f"Attention Scores : {attn_scores}")

Attention Scores : tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],
        [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],
        [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],
        [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],
        [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],
        [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])


The difference to earlier is
that we now scale the attention scores by dividing them by the square root of the
embedding dimension of the keys. 


In [24]:
d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2/d_k*0.5,dim=-1)
print(f"Dimension :{d_k}")
print(attn_weights_2)

Dimension :2
tensor([0.1623, 0.1877, 0.1858, 0.1547, 0.1358, 0.1738])


WHY DIVIDE BY SQRT (DIMENSION)


The softmax function is sensitive to the magnitudes of its inputs. When the inputs are large, the differences between the exponential values of each input become much more pronounced. This causes the softmax output to become "peaky," where the highest value receives almost all the probability mass, and the rest receive very little.

In attention mechanisms, particularly in transformers, if the dot products between query and key vectors become too large (like multiplying by 8 in this example), the attention scores can become very large. This results in a very sharp softmax distribution, making the model overly confident in one particular "key." Such sharp distributions can make learning unstable,



To make the variance of the dot product stable

The dot product of  Q and K increases the variance because multiplying two random numbers increases the variance.

The increase in variance grows with the dimension. 

Dividing by sqrt (dimension) keeps the variance close to 1
    

In [26]:
import torch

tensor = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

softmax_tensor = torch.softmax(tensor,dim =-1)
print(f"Softmax  : {softmax_tensor}")


scaled_tensor = tensor*8
softmax_scaled_tensor = torch.softmax(scaled_tensor,dim = -1)
print(f"Softmax Scaled Tensor : {softmax_scaled_tensor}")

Softmax  : tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
Softmax Scaled Tensor : tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


In [29]:
# To illustate the impact of dividing by the square root of dimension 

import numpy as np

def compute_variance(dim,num_trials = 1000):
    dot_products = []
    scaled_dot_products = []

    for _ in range(num_trials):
        q = np.random.randn(dim)
        k = np.random.randn(dim)

        dot_product = np.dot(q,k)
        dot_products.append(dot_product)

        scaled_dot_product = dot_product /np.sqrt(dim)
        scaled_dot_products.append(scaled_dot_product)
    variance_before_scaling = np.var(dot_products)
    variance_after_scaling = np.var(scaled_dot_products)
    return variance_before_scaling , variance_after_scaling


variance_before_5, variance_after_5 = compute_variance(5)
print(f"Variance before scaling (dim=5): {variance_before_5}")
print(f"Variance after scaling (dim=5): {variance_after_5}")

variance_before_100, variance_after_100 = compute_variance(100)
print(f"Variance before scaling (dim=100): {variance_before_100}")
print(f"Variance after scaling (dim=100): {variance_after_100}")



Variance before scaling (dim=5): 5.003549047510514
Variance after scaling (dim=5): 1.0007098095021028
Variance before scaling (dim=100): 93.78574651821704
Variance after scaling (dim=100): 0.9378574651821705


In [30]:
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

tensor([0.2896, 0.7811])


<div style="background-color:#ffecd2; color:#355c7d; text-align:center; padding:15px; font-size:25px; border-radius:25px; ">Self Attention Mechanism</div>

In [31]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key   = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value
        
        attn_scores = queries @ keys.T 
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )

        context_vec = attn_weights @ values
        return context_vec

In [32]:
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


PyTorch's
nn.Linear layers, which effectively perform matrix multiplication when the bias units are
disabled. 

In [33]:
class SelfAttention_v2(nn.Module):

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        
        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

In [34]:
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


Note that SelfAttention_v1 and SelfAttention_v2 give different outputs because they
use different initial weights for the weight matrices since nn.Linear uses a more
sophisticated weight initialization scheme.