-------------------------------
# SIMPLIFIED ATTENTION MECHANISM
--------------------------------

Consider the following input sentence,
Your JOURNEY starts with one step

We will use a small embedding dimension: each token is represented by a 3-dimensional vector.

In [3]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

##### attention score calculation

In [4]:
# First, compute the attention scores for a single query token: each score is simply the
# dot product between the query and one input token.
query = inputs[1]  # use the 2nd input token ("journey") as the query for now

attention_score_2 = torch.empty(inputs.shape[0])  # one score per input token

for i, x_i in enumerate(inputs):
    # Dot product: a1*b1 + a2*b2 + a3*b3 (algebraic form), equivalently |a||b|cos(theta) (geometric form).
    attention_score_2[i] = torch.dot(x_i, query)

print(attention_score_2)

# The printed values are the (unnormalised) attention scores of every token with respect to the query.
# Since the dot product equals |a||b|cos(theta), a higher attention score means the two vectors are more aligned,
# i.e. the token is more relevant to the query ("relevant" is a better word than "similar" here).

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


#### Implementing Simple Normalisation

In [5]:
# Simple normalisation: divide every score by the sum of all scores so that the resulting weights sum to 1.
attention_score_2_tmp = attention_score_2 / attention_score_2.sum()

print("attention weights: ", attention_score_2_tmp)
print("sum: ", attention_score_2_tmp.sum())


attention weights:  tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
sum:  tensor(1.0000)


#### Softmax function

In [6]:
# In practice we use the softmax function for normalisation; it is better at managing extreme values and offers more favourable gradient properties.

def softmax_naive(x):
    """Softmax of ``x`` along dimension 0.

    Args:
        x: tensor of scores; normalisation runs over its first dimension.

    Returns:
        Tensor with the same shape as ``x`` whose entries along dim 0 are
        non-negative and sum to 1.
    """
    if x.numel() == 0:
        # Nothing to normalise; return an empty tensor of the same shape.
        return torch.exp(x)
    # Subtracting the maximum leaves the result mathematically unchanged
    # (the factor exp(-max) cancels in the ratio) but keeps exp() from
    # overflowing to inf for large scores, which would produce nan weights.
    shifted = x - x.max(dim=0, keepdim=True).values
    exp_x = torch.exp(shifted)
    return exp_x / exp_x.sum(dim=0)

attention_weights_2_naive = softmax_naive(attention_score_2)

print("Attention weights:", attention_weights_2_naive)
print("sum:", attention_weights_2_naive.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
sum: tensor(1.)


In [7]:
# The softmax function ensures that the attention weights are always positive. This makes the output interpretable
# as probabilities or relative importance, where a higher weight indicates greater importance.
# Now we use PyTorch's built-in softmax, which is the implementation to prefer in practice.

attention_weights_2 = torch.softmax(attention_score_2, dim=0)

print("Attention weights:", attention_weights_2)
print("sum:", attention_weights_2.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
sum: tensor(1.)


#### Implementing context vector

In [8]:
query = inputs[1]  # 2nd input token, the same query as before

context_vector_2 = torch.zeros(query.shape)  # accumulator with the shape of a single token embedding

# The context vector is the attention-weighted sum of all input vectors.
for i, x_i in enumerate(inputs):
    context_vector_2 += attention_weights_2[i]*x_i

print(context_vector_2)
print(context_vector_2.shape)

tensor([0.4419, 0.6515, 0.5683])
torch.Size([3])


In [9]:
# Now we repeat this computation to calculate the attention weights for all the input tokens

In [10]:
# First, use an additional (nested) for-loop to compute the dot products for all pairs of inputs.

# Size the score matrix from the data instead of hard-coding 6x6, so the cell keeps
# working if the number of input tokens changes.
attention_scores = torch.empty(inputs.shape[0], inputs.shape[0])
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        # Entry (i, j) is the score of token j with respect to query token i.
        attention_scores[i, j] = torch.dot(x_i, x_j)

print(attention_scores)
print(attention_scores.shape)


tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
torch.Size([6, 6])


In [11]:
# The nested for-loop is computationally slow, so we use a single matrix multiplication instead:
# multiplying the input matrix by its own transpose yields every pairwise dot product at once.

attention_scores = inputs @ inputs.T
print(attention_scores)


tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [12]:
attention_weights = torch.softmax(attention_scores, dim=1)
print(attention_weights)

# When you have a tensor with multiple dimensions (e.g., a matrix or higher-dimensional tensor), dim determines the axis along which the softmax is computed.

# attention_scores is a 2-D tensor here, so dim=1 is also its last dimension (dim=-1): softmax is applied across the
# columns of each row, which makes every row sum to 1.
# Writing dim=-1 is common in attention mechanisms because it always refers to the last dimension, regardless of how
# many dimensions the tensor has (for example an extra leading batch dimension), so each set of scores (each row) is
# still normalised correctly.

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [13]:
print(" all row sum: ", attention_weights.sum(dim=1))

 all row sum:  tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [14]:
# In the last step we simply use these attention weights to compute the context vectors.
all_context_vector = attention_weights @ inputs  # matrix multiplication: each output row is a weighted sum of the input rows
print(all_context_vector)
# This gives us 6 context vectors, one per input token.

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


------------------------------------------------
# Self Attention Mechanism with trainable weights
------------------------------------------------

In [15]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [16]:
# We start by defining a few variables:

# A - the second input element
# B - the input embedding size, d_in=3
# C - the output embedding size, d_out=2

x_2 = inputs[1] # A
d_in = inputs.shape[1] # B
d_out = 2 # C

In [17]:
# Next we initialise three random weight matrices:
# W_query (Wq), W_key (Wk) and W_value (Wv)

torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)


# torch.nn.Parameter: wraps a tensor to indicate that it is a parameter of the model.

# torch.rand(d_in, d_out): creates a tensor of shape (d_in, d_out) filled with random values drawn from the
#                          uniform distribution on [0, 1).
#                          These tensors are the weight matrices that transform inputs into query, key and value vectors.

# requires_grad=False: disables gradient computation for these parameters, so these weights would not be updated
#                      during backpropagation. Typically requires_grad=True is used when the weights are learnable.

print(W_query)

print(W_key)

print(W_value)

# These are random values

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])
Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


## STEP 1 : Convert the input matrix into Key, Query and Value matrix

In [18]:
# Now we obtain the key, query and value matrices via matrix multiplication:

keys = inputs @ W_key
queries = inputs @ W_query
values = inputs @ W_value

print("keys: ", keys)
print("keys: ", keys.shape)

print("queries: ", queries)
print("queries: ", queries.shape)

print("values: ", values)
print("values: ", values.shape)

# Remember: W_query, W_key and W_value are the random matrices we created above; here we have
# matrix-multiplied them with our input vectors.



keys:  tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])
keys:  torch.Size([6, 2])
queries:  tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])
queries:  torch.Size([6, 2])
values:  tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]])
values:  torch.Size([6, 2])


## STEP 2:  we find the attention scores
We do this by using the key and the query


In [19]:
# For a particular token, we take its query vector and compute its attention scores against all the keys.
# The unscaled attention score is the dot product between the query vector and a key vector.

In [20]:
query_2 = x_2 @ W_query  # query vector of the 2nd token
key_2 = x_2 @ W_key  # key vector of the 2nd token
value_2 = x_2 @ W_value  # value vector of the 2nd token
print(query_2)

tensor([0.4306, 1.4551])


In [21]:
# Dot product between query_2 and every key vector (keys = inputs @ W_key, computed above).
# keys has 6 rows and 2 columns, which cannot be multiplied with query_2 (2 elements) directly,
# so we transpose it to 2 rows and 6 columns to make the shapes compatible; the result holds one score per key.
attention_score_2 =  query_2 @ keys.T
print(attention_score_2)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [22]:
# omega: multiply all queries with the transposed keys to get the full attention-score matrix, i.e.
#   - the first row holds the attention scores between the first query and all the keys,
#   - the second row holds the attention scores between the second query and all the keys,
#   - and so on for the remaining rows.
attn_scores = queries @ keys.T
print(attn_scores)

tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],
        [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],
        [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],
        [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],
        [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],
        [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])


## Step 3 : Calculate attention weights, Normalization

In [23]:
# We compute the attention weights by scaling the attention scores and applying the softmax function we used earlier.
# The difference to earlier is that we now scale the attention scores by dividing them by the square root of the
# embedding dimension of the keys.
# Note that taking the square root is mathematically the same as raising to the power 0.5:

# keys.shape is the shape (dimensions) of the keys tensor; index -1 selects its last dimension,
# i.e. the number of columns, which is the key embedding size.
d_keys = keys.shape[-1]
# For now we only consider the second token and compute its attention weights.
attn_weights_2 = torch.softmax(attention_score_2 / d_keys**0.5, dim=-1)
print(attn_weights_2)
print(attn_weights_2.shape)
print(d_keys)
print(keys.shape)



tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])
torch.Size([6])
2
torch.Size([6, 2])


### Why divide by sqrt(dimension)?

Reason 1: For stability in learning

The softmax function is sensitive to the magnitudes of its inputs. When the inputs are large, the differences between the exponential values of each input become much more pronounced. This causes the softmax output to become "peaky," where the highest value receives almost all the probability mass, and the rest receive very little.

In attention mechanisms, particularly in transformers, if the dot products between query and key vectors become too large (like multiplying by 8 for example), the attention scores can become very large. This results in a very sharp softmax distribution, making the model overly confident in one particular "key." Such sharp distributions can make learning unstable

In [24]:
# Example of why to divide by sqrt

import torch

# Define the tensor
tensor = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

# Apply softmax without scaling
softmax_result = torch.softmax(tensor, dim=-1)
print("Softmax without scaling:", softmax_result)

# Multiply the tensor by 8 and then apply softmax
scaled_tensor = tensor * 8 # We are just taking an example
softmax_scaled_result = torch.softmax(scaled_tensor, dim=-1)
print("Softmax after scaling (tensor * 8):", softmax_scaled_result)

Softmax without scaling: tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
Softmax after scaling (tensor * 8): tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


### BUT WHY SQRT?

Reason 2: To keep the variance of the dot product stable.
- The dot product of Q and K is a sum of d element-wise products. If the components of Q and K are independent with mean 0 and variance 1, each product has variance 1, so the dot product has variance d.
- The variance therefore grows linearly with the dimension d.
- Dividing by sqrt(d) scales the variance by 1/d, which brings it back close to 1.

Variance is a measure of how spread out or different the numbers in a dataset are from their average (mean).

The reason for keeping the variance close to 1 is that otherwise learning becomes very unstable.

In [25]:
# Example for using a sqrt

import numpy as np

# Function to compute variance before and after scaling
def compute_variance(dim, num_trials=1000, seed=None):
    """Estimate the variance of random dot products before and after scaling.

    Draws ``num_trials`` pairs of standard-normal vectors ``q`` and ``k`` of
    length ``dim``, computes their dot products, and compares the variance of
    the raw dot products with the variance after dividing by ``sqrt(dim)``.

    Args:
        dim: length of each random vector.
        num_trials: number of (q, k) pairs to sample.
        seed: optional integer seed for a private random generator, making the
            result reproducible. When ``None`` (the default) the global NumPy
            random state is used, as before.

    Returns:
        Tuple ``(variance_before_scaling, variance_after_scaling)``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One draw for all trials. Axis 1 separates q (index 0) from k (index 1),
    # so values are consumed in the same q-then-k order per trial as a loop.
    samples = rng.randn(num_trials, 2, dim)
    q, k = samples[:, 0, :], samples[:, 1, :]

    # Row-wise dot products: one value per trial.
    dot_products = (q * k).sum(axis=1)

    # Scale the dot products by sqrt(dim).
    scaled_dot_products = dot_products / np.sqrt(dim)

    variance_before_scaling = np.var(dot_products)
    variance_after_scaling = np.var(scaled_dot_products)

    return variance_before_scaling, variance_after_scaling

# For dimension 5
variance_before_5, variance_after_5 = compute_variance(5)
print(f"Variance before scaling (dim=5): {variance_before_5}")
print(f"Variance after scaling (dim=5): {variance_after_5}")

# For dimension 100
variance_before_100, variance_after_100 = compute_variance(100)
print(f"Variance before scaling (dim=100): {variance_before_100}")
print(f"Variance after scaling (dim=100): {variance_after_100}")

# As you can see from the results, no matter how much you increase the dimension, the variance stays close to 1 if you divide the dot product by the square root of its dimension

Variance before scaling (dim=5): 5.141298924941413
Variance after scaling (dim=5): 1.0282597849882826
Variance before scaling (dim=100): 89.2469177247625
Variance after scaling (dim=100): 0.8924691772476248


# STEP 4 : Final Step : Computing the Context Vector
this can simply be done by multiplying the attention weights we just calculated with the "value tensor"


In [26]:
context_vector_2 = attn_weights_2 @ values # We are rn considering the second vector
print(context_vector_2) 

tensor([0.3061, 0.8210])


# Implementing a compact self attention python class

In [27]:
import torch.nn as nn # torch.nn is a module in PyTorch that provides pre-defined building blocks for constructing and training neural networks. 
                      # These components include layers, activation functions, loss functions, and utilities to define deep learning models efficiently.

class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention with raw ``nn.Parameter`` weights.

    Args:
        d_in: size of each input token embedding.
        d_out: size of the query/key/value vectors and of each output context
            vector (in GPT-like LLMs d_in and d_out are usually equal).
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Randomly initialised weight matrices. They are created in the order
        # query, key, value so that a fixed torch.manual_seed gives the same weights.
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Compute one context vector per input token.

        Args:
            x: input embeddings of shape (num_tokens, d_in), optionally with
                leading batch dimensions, e.g. (batch, num_tokens, d_in).

        Returns:
            Context vectors with the same leading dimensions as ``x`` and a
            last dimension of size d_out.
        """
        # STEP 1 - project the inputs into keys, queries and values
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value

        # STEP 2 - attention scores: every query against every key.
        # transpose(-2, -1) swaps only the last two dimensions, so this works
        # for 2-D input (where it equals keys.T) and for batched input alike.
        attn_scores = queries @ keys.transpose(-2, -1)

        # STEP 3 - attention weights: scale by sqrt(d_k) to keep the scores
        # (and their variance) small, then normalise each row with softmax
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )

        # STEP 4 - context vectors: attention-weighted sum of the values
        context_vector = attn_weights @ values
        return context_vector





In [28]:
torch.manual_seed(123)
sv_v1 = SelfAttention_v1(d_in, d_out)
print(sv_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [29]:
# Since inputs contains 6 embedding vectors, we get a matrix storing the 6 context vectors

We can improve the SelfAttention_v1 implementation further by utilizing PyTorch's nn.Linear layers, which effectively perform matrix multiplication when the bias units are disabled.

Additionally, a significant advantage of using nn.Linear instead of manually implementing nn.Parameter(torch.rand(...)) is that nn.Linear has an optimized weight initialization scheme, contributing to more stable and effective model training.

In [30]:
class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention built from ``nn.Linear`` layers.

    Args:
        d_in: size of each input token embedding.
        d_out: size of the query/key/value vectors and of each output context vector.
        qkv_bias: whether the query/key/value projections include a bias term.
    """

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        # NOTE: this attribute is called W_keys (plural); other cells access it by that name.
        self.W_keys = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        """Compute one context vector per input token.

        Args:
            x: input embeddings of shape (num_tokens, d_in), optionally with
                leading batch dimensions, e.g. (batch, num_tokens, d_in).

        Returns:
            Context vectors with the same leading dimensions as ``x`` and a
            last dimension of size d_out.
        """
        keys = self.W_keys(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # transpose(-2, -1) swaps only the last two dimensions, so this works
        # for 2-D input (where it equals keys.T) and for batched input alike.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_k) before the row-wise softmax.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

In [31]:
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


In [32]:
# Note that SelfAttention_v1 and SelfAttention_v2 give different outputs because they use different initial weights for the weight matrices since
# nn.Linear uses a more sophisticated weight initialization scheme.

---
# hiding future words with causal attention
---

In [33]:
# Reuse the query and key projections of the sa_v2 instance created above and recompute the attention weights.
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_keys(inputs)
attn_scores = queries @ keys.T
attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim = 1)  # dim=1: normalise each row
print(attn_weights)

tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],
        [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],
        [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.1661, 0.1564],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.1585],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


In [34]:
context_length = attn_scores.shape[0]  # size of the first dimension (number of rows) of attn_scores
mask_simple = torch.tril(torch.ones(context_length, context_length))
print(mask_simple)

# torch.ones(context_length, context_length):
# creates a square matrix of shape (context_length, context_length) filled with ones.

# torch.tril(...):
# "tril" stands for "lower triangular" (torch.triu is the "upper triangular" counterpart).
# It takes the square matrix of ones and zeroes out all elements above the main diagonal.
# The resulting matrix has ones on and below the main diagonal and zeros elsewhere.


tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [35]:
masked_simple = attn_weights*mask_simple
print(masked_simple)

tensor([[0.1921, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2041, 0.1659, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2036, 0.1659, 0.1662, 0.0000, 0.0000, 0.0000],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.0000, 0.0000],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<MulBackward0>)


In [36]:
# Good, but these cannot be our attention weights yet because the rows no longer sum to 1, so we need to normalise again:
# we take the sum of each row and divide every element of the row by that sum.
row_sum = masked_simple.sum(dim=1, keepdim=True)
masked_simple_normal = masked_simple / row_sum
print(masked_simple_normal)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<DivBackward0>)


In [37]:
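# This gives valid causal attention weights (masked_simple_normal), but it takes a detour: softmax over all scores, then masking, then renormalising each row.
# (No information from future tokens leaks this way: renormalising the masked weights is mathematically the same as applying softmax over the unmasked positions only - compare the identical result obtained below.)

# A more direct approach is to mask the attention *scores* with -inf first and then apply softmax only once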

In [38]:
# Upper-triangular mask: the ones strictly above the main diagonal mark the "future" positions.
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
# Replace the scores at the masked positions with -inf (out of place; attn_scores itself is not modified).
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
print(masked)


tensor([[0.2899,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.4656, 0.1723,   -inf,   -inf,   -inf,   -inf],
        [0.4594, 0.1703, 0.1731,   -inf,   -inf,   -inf],
        [0.2642, 0.1024, 0.1036, 0.0186,   -inf,   -inf],
        [0.2183, 0.0874, 0.0882, 0.0177, 0.0786,   -inf],
        [0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],
       grad_fn=<MaskedFillBackward0>)


1. torch.ones(context_length, context_length): This creates a 2D tensor filled with ones, where the dimensions are determined by context_length. This tensor will be used to define the mask.
torch.triu(..., diagonal=1): This function keeps the upper triangular part of the tensor, starting from the first diagonal above the main diagonal (the main diagonal itself is not included).
In simpler terms, it creates a mask where the elements on and below the main diagonal are set to zero.

2. masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
mask.bool(): Converts the tensor mask into a boolean tensor, where elements with a value of 1 become True and elements with a value of 0 become False.
attn_scores.masked_fill(...): This operation modifies the attn_scores tensor based on the mask.
For elements in attn_scores where the corresponding element in mask.bool() is True, the value in attn_scores is replaced with -torch.inf.
This effectively masks out the unwanted parts of the attention scores, preventing them from influencing the attention mechanism.

In [40]:
# Now we simply apply the (scaled) softmax function to these masked scores, and we are done.

attn_weights = torch.softmax(masked / keys.shape[-1]**0.5, dim=-1)
print(attn_weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


## Masking attention weights with dropout

In [41]:
# Here we use a dropout rate of 50%, which means masking out about half of the attention weights
# GPT-style models use a lower dropout rate, around 10% to 20%


In [42]:
# example
torch.manual_seed(123)
dropout = torch.nn.Dropout(0.5)
example = torch.ones(6, 6)
print(dropout(example))

tensor([[2., 2., 2., 2., 2., 2.],
        [0., 2., 0., 0., 0., 0.],
        [0., 0., 2., 0., 2., 0.],
        [2., 2., 0., 0., 0., 2.],
        [2., 0., 0., 0., 0., 2.],
        [0., 2., 0., 0., 0., 0.]])


When applying dropout to an attention weight matrix with a rate of 50%, each element is independently set to zero with probability 0.5, so on average (not exactly) half of the elements in the matrix are zeroed.

To compensate for the reduction in active elements, the values of the remaining elements in the matrix are scaled up by a factor of 1/0.5 =2.

This scaling is crucial to maintain the overall balance of the attention weights, 
ensuring that the average influence of the attention mechanism remains consistent during both the training and inference phases.

In [44]:
# Apply dropout to the attention weights (re-seeding first so that the dropout pattern is reproducible)
torch.manual_seed(123)
print(dropout(attn_weights))

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.8966, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.6206, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4921, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4350, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3327, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


### causal attention class

Before we begin, one more thing is to ensure that the code can handle batches consisting of more than one input.

This will ensure that the CausalAttention class supports the batch outputs produced by the data loader we implemented earlier.

For simplicity, to simulate such batch inputs, we duplicate the input text example:

2 inputs with 6 tokens each, and each token has embedding dimension 3

In [46]:
batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape) 

# This results in a 3-D tensor: 2 input texts with 6 tokens each, where each token is a 3-dimensional embedding vector

torch.Size([2, 6, 3])


In [47]:
# The causal attention class (named CasualAttention in the code below) is similar to SelfAttention, except that we now add a dropout layer and a causal mask

class CasualAttention(nn.Module):
    """Single-head causal (masked) scaled dot-product self-attention with dropout.

    Args:
        d_in: size of each input token embedding.
        d_out: size of the query/key/value vectors and of each output context vector.
        context_length: maximum number of tokens supported; sets the size of the causal mask.
        dropout: dropout probability applied to the attention weights.
        qkv_bias: whether the linear projections for query (Q), key (K) and value (V)
            include a bias term, which can give the learned representations extra flexibility.
    """

    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)  # new compared with the self-attention classes
        # Causal mask: ones strictly above the main diagonal mark the future positions.
        # It is registered as a buffer rather than stored as a plain attribute.
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        """Compute causally masked context vectors.

        Args:
            x: input embeddings of shape (batch, num_tokens, d_in); an unbatched
                (num_tokens, d_in) tensor is accepted as well. num_tokens must
                not exceed context_length.

        Returns:
            Context vectors with the same leading dimensions as ``x`` and a
            last dimension of size d_out.
        """
        # The token dimension is second to last, with or without a batch dimension.
        num_tokens = x.shape[-2]
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Swap only the last two dimensions so any leading batch dimension is kept.
        attn_scores = queries @ keys.transpose(-2, -1)

        # `:num_tokens` covers inputs with fewer tokens than the supported context_length.
        causal_mask = self.mask[:num_tokens, :num_tokens].bool()
        # Out-of-place fill: future positions get -inf so that softmax gives them zero weight.
        attn_scores = attn_scores.masked_fill(causal_mask, -torch.inf)

        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )
        attn_weights = self.dropout(attn_weights)

        context_vec = attn_weights @ values
        return context_vec


The use of register_buffer in PyTorch is not strictly necessary for all use cases but offers several advantages here.

For instance, when we use the CausalAttention class in our LLM, buffers are automatically moved to the appropriate device (CPU or GPU) along with our model, 
which will be relevant when training the LLM in future chapters.

This means we don't need to manually ensure these tensors are on the same device as your model parameters, avoiding device mismatch errors.

Apply the causal mask to the attention scores to block future token dependencies.
`self.mask.bool()` converts the mask to a boolean matrix, where `True` means the corresponding position should be masked.
`[:num_tokens, :num_tokens]` ensures the mask is adjusted for sequences shorter than the maximum context length.
`masked_fill_` sets the masked positions in `attn_scores` to `-torch.inf`, effectively removing their influence during the softmax step.
attn_scores.masked_fill_(
    self.mask.bool()[:num_tokens, :num_tokens], -torch.inf
)


In [49]:
torch.manual_seed(123)
context_length = batch.shape[1]
ca = CasualAttention(d_in, d_out, context_length, 0.0)
context_vector = ca(batch)
print(context_vector)


tensor([[[-0.4519,  0.2216],
         [-0.5874,  0.0058],
         [-0.6300, -0.0632],
         [-0.5675, -0.0843],
         [-0.5526, -0.0981],
         [-0.5299, -0.1081]],

        [[-0.4519,  0.2216],
         [-0.5874,  0.0058],
         [-0.6300, -0.0632],
         [-0.5675, -0.0843],
         [-0.5526, -0.0981],
         [-0.5299, -0.1081]]], grad_fn=<UnsafeViewBackward0>)


-----
# Implemmention Multihead Attention Mechanism
-----

In [50]:
# In practice, implementing the multi-head attention mechanism involves creating multiple instances of the attention mechanism, each with its own weights, and then combining their outputs

# To do that, we stack multiple instances of our previously created CasualAttention class

In [51]:
class MultiHeadAttentionWrapper(nn.Module):
    """Multi-head attention built by stacking independent single-head modules.

    Holds ``num_heads`` separate ``CasualAttention`` instances, each created
    with the same constructor arguments, and joins their outputs along the
    last dimension.

    Args:
        d_in: Forwarded to every head as its first argument.
        d_out: Forwarded to every head as its second argument.
        context_length: Forwarded to every head.
        dropout: Forwarded to every head.
        num_heads: How many heads to create.
        qkv_bias: Forwarded to every head (defaults to ``False``).
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        # Build the heads one at a time, in order, then register them all
        # through nn.ModuleList so they become sub-modules of this wrapper.
        single_heads = []
        for _ in range(num_heads):
            single_heads.append(
                CasualAttention(d_in, d_out, context_length, dropout, qkv_bias)
            )
        self.heads = nn.ModuleList(single_heads)

    def forward(self, x):
        """Run every head on ``x`` and concatenate the results on the last axis."""
        per_head_outputs = []
        for attention_head in self.heads:
            per_head_outputs.append(attention_head(x))
        return torch.cat(per_head_outputs, dim=-1)


self.heads holds a list of CausalAttention layers wrapped in nn.ModuleList.
nn.ModuleList ensures that all the layers are properly registered as part of the model.
Each CausalAttention layer is initialized with the given parameters (d_in, d_out, context_length, dropout, qkv_bias).
The list comprehension creates 'num_heads' instances, implementing the multi-head attention mechanism,
where each head learns to focus on different parts of the input.

For example, if we use this MultiHeadAttentionWrapper class with two attention heads (via num_heads=2) and a CausalAttention output dimension of d_out=2, 
this results in 4-dimensional context vectors (d_out*num_heads=4).

In [53]:
# Toy embeddings: one 3-dimensional row per token of the example sentence.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])

# Stack the same sequence twice along a new leading axis to form a batch of two.
batch = torch.stack([inputs, inputs])
print(batch.shape)
print(batch)




torch.Size([2, 6, 3])
tensor([[[0.4300, 0.1500, 0.8900],
         [0.5500, 0.8700, 0.6600],
         [0.5700, 0.8500, 0.6400],
         [0.2200, 0.5800, 0.3300],
         [0.7700, 0.2500, 0.1000],
         [0.0500, 0.8000, 0.5500]],

        [[0.4300, 0.1500, 0.8900],
         [0.5500, 0.8700, 0.6600],
         [0.5700, 0.8500, 0.6400],
         [0.2200, 0.5800, 0.3300],
         [0.7700, 0.2500, 0.1000],
         [0.0500, 0.8000, 0.5500]]])


In [54]:
# Seed the RNG before constructing the module so the printed result is reproducible.
torch.manual_seed(123)
context_length = batch.shape[1] # number of tokens per sequence in the batch (6 here, see the printed batch shape)
                                # context length = the number of tokens of the input sequence that the model can "see" or process at once.
# Input embedding size is 3; each individual head produces 2 output columns.
d_in, d_out = 3, 2
# Dropout rate 0.0 (disabled); two heads, so the concatenated output has 2 * 2 = 4 columns.
mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("Context_vectors_shape:", context_vecs.shape)


tensor([[[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]],

        [[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]]], grad_fn=<CatBackward0>)
Context_vectors_shape: torch.Size([2, 6, 4])


Understanding the shape of the context vector (2, 6, 4)
2 because the batch contains 2 sequences (batch size = 2)
6 because each sequence has 6 tokens (rows), and we get one context vector per token
But if we look at the columns, it is 4,
It is 4 because we concatenated the outputs of 2 attention heads, each of which has 2 columns (d_out = 2), so num_heads * d_out = 2 * 2 = 4 columns

BTW, just to clarify, these values come from randomly initialized weights; later on we will train the model and update them.

Also, what is currently happening is that we calculate the context vectors of the heads one by one (sequentially, in the loop inside `forward`), not simultaneously, which becomes inefficient as the number of heads and the input size grow.


-----
# Implementing multi head attention with weight split
-----

Instead of maintaining two separate classes, MultiHeadAttentionWrapper and CausalAttention, we can combine both of these concepts into a single MultiHeadAttention class.
Also, in addition to just merging the MultiHeadAttentionWrapper with the CausalAttention code, we will make some other modifications to implement multi-head attention more efficiently.

In the MultiHeadAttentionWrapper, multiple heads are implemented by creating a list of CausalAttention objects (self.heads), each representing a separate attention head.
The CausalAttention class independently performs the attention mechanism, and the results from each head are concatenated.

In contrast, the following MultiHeadAttention class integrates the multi-head functionality within a single class.
It splits the input into multiple heads by reshaping the projected query, key, and value tensors and then combines the results from these heads after computing attention.

In [58]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention using one projection each for Q, K and V.

    The projected queries, keys and values (width ``d_out``) are reshaped into
    ``num_heads`` heads of width ``head_dim = d_out // num_heads``, so all
    heads are evaluated together with batched matrix multiplications instead
    of looping over separate single-head modules.

    Args:
        d_in: Size of the last dimension of the input tensor.
        d_out: Total output width across all heads; must be divisible by
            ``num_heads``.
        context_size: Side length of the square causal mask registered as the
            ``mask`` buffer. ``forward`` slices this mask to the actual
            sequence length.
        dropout: Probability given to ``nn.Dropout``, applied to the
            attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value linear layers use a bias term.
    """

    def __init__(self, d_in, d_out, context_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out  # kept for the final reshape in forward()
        self.num_heads = num_heads  # kept for the head split in forward()
        self.head_dim = d_out // num_heads  # floor division; exact because of the assert above

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        # Final linear layer applied to the concatenated head outputs (d_out -> d_out).
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Upper-triangular matrix of ones above the diagonal: entry (i, j) is 1
        # when j > i, i.e. when token j lies in the future of token i.
        # The mask is sized from the constructor argument `context_size`, not
        # from a notebook-level global, so the class works on a fresh kernel
        # and always honours the value the caller passed in.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_size, context_size),
                       diagonal=1)
        )

    def forward(self, x):
        """Compute causal multi-head attention.

        Args:
            x: Tensor of shape ``(b, num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(b, num_tokens, d_out)``.
        """
        b, num_tokens, d_in = x.shape

        # Project every token into key / query / value space: (b, num_tokens, d_out)
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Split the last dimension into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Move the head axis in front of the token axis:
        # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Dot product of every query with every key, separately for each head:
        # (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)

        # Slice the stored mask down to the current sequence length and
        # convert it to booleans (True = position must be hidden).
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Future positions get -inf so that softmax assigns them zero weight.
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        attn_weights = torch.softmax(attn_scores / self.head_dim**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Weighted sum of values, then move the head axis back:
        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Merge the heads, where self.d_out = self.num_heads * self.head_dim.
        # .contiguous() is needed because .view cannot operate on the
        # transposed (non-contiguous) tensor.
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # output projection over the merged heads

        return context_vec




- Step 1: Reduce the projection dim to match desired output dim

- Step 2: Use a Linear layer to combine head outputs

- Step 3: Tensor shape: (b, num_tokens, d_out)

- Step 4: We implicitly split the matrix by adding a num_heads dimension. Then we unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)

- Step 5: Transpose from shape (b, num_tokens, num_heads, head_dim) to (b, num_heads, num_tokens, head_dim)

- Step 6: Compute dot product for each head

- Step 7: Mask truncated to the number of tokens

- Step 8: Use the mask to fill attention scores

- Step 9: Tensor shape: (b, num_tokens, n_heads, head_dim)

- Step 10: Combine heads, where self.d_out = self.num_heads * self.head_dim

- Step 11: Add an optional linear projection

In [60]:
# Even though the reshaping (.view) and transposing (.transpose) of tensors inside the MultiHeadAttention class looks very complicated, mathematically, 
# the MultiHeadAttention class implements the same concept as the MultiHeadAttentionWrapper earlier.


# On a big-picture level, in the previous MultiHeadAttentionWrapper, we stacked multiple single-head attention layers that we combined into a multi-head attention layer.
# The MultiHeadAttention class takes an integrated approach.
# It starts with a multi-head layer and then internally splits this layer into individual attention heads

## DETAILED EXPLANATION OF THE MULTI-HEAD ATTENTION CLASS

The splitting of the query, key, and value tensors, is achieved through tensor reshaping and transposing operations using PyTorch's .view and .transpose methods.

The input is first transformed (via linear layers for queries, keys, and values) and then reshaped to represent multiple heads.

The key operation is to split the d_out dimension into num_heads and head_dim, where head_dim = d_out / num_heads.

This splitting is then achieved using the .view method: a tensor of dimensions (b, num_tokens, d_out) is reshaped to dimension (b, num_tokens, num_heads, head_dim).

The tensors are then transposed to bring the num_heads dimension before the num_tokens dimension, resulting in a shape of (b, num_heads, num_tokens, head_dim).

This transposition is crucial for correctly aligning the queries, keys, and values across the different heads and performing batched matrix multiplications efficiently.

To illustrate this batched matrix multiplication, suppose we have the following example tensor:

In [61]:
# Example tensor laid out as (b, num_heads, num_tokens, head_dim) = (1, 2, 3, 4).
a = torch.tensor([
    [  # single batch entry
        [  # first head
            [0.2745, 0.6584, 0.2775, 0.8573],
            [0.8993, 0.0390, 0.9268, 0.7388],
            [0.7179, 0.7058, 0.9156, 0.4340],
        ],
        [  # second head
            [0.0772, 0.3565, 0.1479, 0.5331],
            [0.4066, 0.2318, 0.4545, 0.9737],
            [0.4606, 0.5159, 0.4220, 0.5786],
        ],
    ]
])

The shape of this tensor is (b, num_heads, num_tokens, head_dim) = (1, 2, 3, 4)

Now, we perform a batched matrix multiplication between the tensor itself and a view of the tensor where we transposed the last two dimensions, num_tokens and head_dim:

In [62]:
# Swap the last two axes (num_tokens, head_dim) and multiply; `@` treats the
# two leading axes as batch dimensions, giving one (3, 3) product per head.
print(a @ a.transpose(2, 3))

tensor([[[[1.3208, 1.1631, 1.2879],
          [1.1631, 2.2150, 1.8424],
          [1.2879, 1.8424, 2.0402]],

         [[0.4391, 0.7003, 0.5903],
          [0.7003, 1.3737, 1.0620],
          [0.5903, 1.0620, 0.9912]]]])


In this case, the matrix multiplication implementation in PyTorch handles the 4-dimensional input tensor so that the matrix multiplication is carried out between the 2 last dimensions (num_tokens, head_dim) and then repeated for the individual heads.

For instance, the above becomes a more compact way to compute the matrix multiplication for each head separately:

In [63]:
# Same products as the batched version, computed one head at a time.
# Each head is a 2-D (num_tokens, head_dim) slice of `a`.
first_head = a[0, 0]
first_res = torch.matmul(first_head, first_head.t())
print("First head:\n", first_res)
second_head = a[0, 1]
second_res = torch.matmul(second_head, second_head.t())
print("\nSecond head:\n", second_res)

First head:
 tensor([[1.3208, 1.1631, 1.2879],
        [1.1631, 2.2150, 1.8424],
        [1.2879, 1.8424, 2.0402]])

Second head:
 tensor([[0.4391, 0.7003, 0.5903],
        [0.7003, 1.3737, 1.0620],
        [0.5903, 1.0620, 0.9912]])


The results are exactly the same as those we obtained earlier with the batched matrix multiplication print(a @ a.transpose(2, 3)).

Continuing with MultiHeadAttention, after computing the attention weights and context vectors, the context vectors from all heads are transposed back to the shape (b, num_tokens, num_heads, head_dim).

These vectors are then reshaped (flattened) into the shape (b, num_tokens, d_out), effectively combining the outputs from all heads

Additionally, we added a so-called output projection layer (self.out_proj) to MultiHeadAttention after combining the heads, which is not present in the CausalAttention class.

This output projection layer is not strictly necessary, but it is commonly used in many LLM architectures, which is why we added it here for completeness.

Even though the MultiHeadAttention class looks more complicated than the MultiHeadAttentionWrapper due to the additional reshaping and transposition of tensors, it is more efficient.

The reason is that we only need one matrix multiplication to compute the keys, for instance, keys = self.W_key(x) (the same is true for the queries and values).

In the MultiHeadAttentionWrapper, we needed to repeat this matrix multiplication, which is computationally one of the most expensive steps, for each attention head.

The MultiHeadAttention class can be used similar to the SelfAttention and CausalAttention classes we implemented earlier:

In [64]:
# Seed the RNG before constructing the module so the printed result is reproducible.
torch.manual_seed(123)
# Unpack (batch size, tokens per sequence, embedding size) from the batch tensor.
batch_size, context_length, d_in = batch.shape
# Total output width across all heads; with num_heads=2 each head gets d_out // num_heads = 1 dimension.
d_out = 2
# Dropout rate 0.0 (disabled).
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]],

        [[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])


As we can see based on the results, the output dimension is directly controlled by the d_out argument:

In this section, we implemented the MultiHeadAttention class that we will use in the upcoming sections when implementing and training the LLM itself.

Note that while the code is fully functional, we used relatively small embedding sizes and numbers of attention heads to keep the outputs readable.

For comparison, the smallest GPT-2 model (117 million parameters) has 12 attention heads and a context vector embedding size of 768.

The largest GPT-2 model (1.5 billion parameters) has 25 attention heads and a context vector embedding size of 1600.

Note that the embedding sizes of the token inputs and context embeddings are the same in GPT models (d_in = d_out).