## ---------------------------Casual Attention(Masking)-----------------------------

## Getting attention scores(borrowing from self attention)

![image.png](attachment:0521c485-99b3-4aa8-85ab-71b149f62e5a.png)

In [26]:
# Creating the token embeddings - Randomized
import torch
output_dim = 3

inputs = torch.tensor([
    [0.43, 0.15, 0.89], # Your    # X1
    [0.55, 0.87, 0.66], # journey # x2
    [0.57, 0.85, 0.64], # begins  # X3
    [0.22, 0.58, 0.33], # with    # X4
    [0.77, 0.25, 0.10], # one     # X5
    [0.05, 0.80, 0.55] # step     # X6
])  
inputs

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])

In [27]:
def getAttentionWeights(inputs, d_in, d_out):
    key_w = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad = False)
    query_w = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad = False)
    value_w = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad = False)
    keys = inputs @ key_w
    queries = inputs @ query_w
    values = inputs @ value_w
    attention_scores = queries @ keys.T
    scaled_attention_scores = attention_scores / keys.shape[-1]**0.5
    attention_weights = torch.softmax(scaled_attention_scores, dim=-1)
    return attention_weights, attention_scores, keys.shape[-1]
attention_weights, attention_scores, dimension_keys = getAttentionWeights(inputs, 3, 2)
attention_weights, attention_scores, dimension_keys

(tensor([[0.1336, 0.2200, 0.2157, 0.1414, 0.1111, 0.1782],
         [0.1260, 0.2444, 0.2375, 0.1257, 0.0911, 0.1752],
         [0.1264, 0.2433, 0.2366, 0.1264, 0.0920, 0.1754],
         [0.1465, 0.2094, 0.2061, 0.1440, 0.1210, 0.1731],
         [0.1443, 0.2024, 0.1997, 0.1503, 0.1276, 0.1757],
         [0.1415, 0.2225, 0.2180, 0.1361, 0.1091, 0.1728]]),
 tensor([[0.8373, 1.5428, 1.5144, 0.9172, 0.5769, 1.2445],
         [1.3321, 2.2685, 2.2281, 1.3286, 0.8737, 1.7979],
         [1.3110, 2.2370, 2.1972, 1.3107, 0.8609, 1.7739],
         [0.7639, 1.2695, 1.2472, 0.7399, 0.4936, 1.0003],
         [0.5620, 1.0412, 1.0220, 0.6196, 0.3886, 0.8409],
         [1.0151, 1.6554, 1.6265, 0.9610, 0.6484, 1.2984]]),
 2)

In [28]:
# Confirming normality

torch.allclose(attention_weights[0].sum(), torch.tensor(1.))

True

## Getting the masked matrix

```python
# Getting the masked attention scores
mask = torch.tril(torch.ones(context_length, context_length))
masked_attention_scores = mask * attention_weights

# Getting the masked attention weights
row_sums = masked_attention_scores.sum(dim=1, keepdim=True)
masked_attention_weights = masked_attention_scores / row_sums


In [29]:
# Getting the upper triangular matrix as zero and the rest as ones

context_length = attention_weights.shape[0]
mask_simple = torch.tril(torch.ones(context_length, context_length))
mask_simple

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])

In [30]:
# Multiplication of the attention weights with the mask simple background

masked_attention_scores = attention_scores * mask_simple
masked_attention_scores

tensor([[0.8373, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [1.3321, 2.2685, 0.0000, 0.0000, 0.0000, 0.0000],
        [1.3110, 2.2370, 2.1972, 0.0000, 0.0000, 0.0000],
        [0.7639, 1.2695, 1.2472, 0.7399, 0.0000, 0.0000],
        [0.5620, 1.0412, 1.0220, 0.6196, 0.3886, 0.0000],
        [1.0151, 1.6554, 1.6265, 0.9610, 0.6484, 1.2984]])

In [31]:
# Formula for normalizing masked attention scores - check llm book for reference

masked_attention_weights = masked_attention_scores
row_sums = [lis.sum() for lis in masked_attention_weights]

for i, lis in enumerate(masked_attention_weights):
    for k, val in enumerate(lis):
        masked_attention_weights[i, k] = masked_attention_weights[i, k] / row_sums[i]
torch.allclose(masked_attention_weights[0].sum(), torch.tensor(1.))

True

In [32]:
# Simpler formula

# Getting the row sums
row_sums = masked_attention_scores.sum(dim=-1, keepdim=True)
print(row_sums)
# Getting the attention weights
masked_attention_weights = masked_attention_scores / row_sums
print(masked_attention_weights)
# Testing the normality of the masked attention weights
torch.allclose(masked_attention_weights[0].sum(), torch.tensor(1.))

tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3700, 0.6300, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2282, 0.3894, 0.3824, 0.0000, 0.0000, 0.0000],
        [0.1900, 0.3158, 0.3102, 0.1840, 0.0000, 0.0000],
        [0.1547, 0.2866, 0.2813, 0.1705, 0.1069, 0.0000],
        [0.1409, 0.2298, 0.2258, 0.1334, 0.0900, 0.1802]])


True

## Upper triangular Infinity mask

In [33]:
attention_scores.shape

torch.Size([6, 6])

In [34]:
# Creating the mask - upper triangle = -infinity

context_length = attention_scores.shape[0]
mask = torch.triu(torch.ones(context_length, context_length), diagonal = 1)
mask

tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])

In [35]:
# Getting the masked attention scores

masked_attention_scores = attention_scores.masked_fill(mask.bool(), -torch.inf)
masked_attention_scores

tensor([[0.8373,   -inf,   -inf,   -inf,   -inf,   -inf],
        [1.3321, 2.2685,   -inf,   -inf,   -inf,   -inf],
        [1.3110, 2.2370, 2.1972,   -inf,   -inf,   -inf],
        [0.7639, 1.2695, 1.2472, 0.7399,   -inf,   -inf],
        [0.5620, 1.0412, 1.0220, 0.6196, 0.3886,   -inf],
        [1.0151, 1.6554, 1.6265, 0.9610, 0.6484, 1.2984]])

In [36]:
# Scaling the masked attention scores with the dimension of the keys

scaled_masked_attention_scores = masked_attention_scores / dimension_keys**0.5
scaled_masked_attention_scores

tensor([[0.5920,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.9419, 1.6041,   -inf,   -inf,   -inf,   -inf],
        [0.9270, 1.5818, 1.5537,   -inf,   -inf,   -inf],
        [0.5402, 0.8977, 0.8819, 0.5232,   -inf,   -inf],
        [0.3974, 0.7362, 0.7227, 0.4381, 0.2748,   -inf],
        [0.7178, 1.1705, 1.1501, 0.6795, 0.4585, 0.9181]])

In [37]:
# Normalizing the masked attention scores to get the attention weights

masked_attention_weights = torch.softmax(scaled_masked_attention_scores, dim=-1)
masked_attention_weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3403, 0.6597, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2085, 0.4013, 0.3902, 0.0000, 0.0000, 0.0000],
        [0.2075, 0.2966, 0.2920, 0.2040, 0.0000, 0.0000],
        [0.1750, 0.2456, 0.2423, 0.1823, 0.1548, 0.0000],
        [0.1415, 0.2225, 0.2180, 0.1361, 0.1091, 0.1728]])

## Implementing Dropout Casual Attention - 50 %

In [38]:
# Creating a dropout layer

dropout = torch.nn.Dropout(0.5)

In [39]:
# Example - Demonstration

torch.manual_seed(42)
matrix = torch.ones(6, 6)
dropout(matrix)

tensor([[0., 0., 2., 2., 2., 2.],
        [2., 0., 2., 0., 2., 0.],
        [0., 0., 2., 2., 2., 0.],
        [2., 2., 0., 2., 0., 2.],
        [2., 0., 2., 2., 2., 2.],
        [2., 2., 2., 0., 2., 0.]])

In [40]:
# Applying dropout layer to our masked_attention weights

dropout(masked_attention_weights)

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6805, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4170, 0.0000, 0.7804, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5932, 0.0000, 0.4079, 0.0000, 0.0000],
        [0.3500, 0.4912, 0.4846, 0.0000, 0.0000, 0.0000],
        [0.2829, 0.0000, 0.0000, 0.0000, 0.0000, 0.3457]])

## Implementing compact Casual Attention class

In [41]:
batch = torch.stack((inputs, inputs), dim=0)
batch

tensor([[[0.4300, 0.1500, 0.8900],
         [0.5500, 0.8700, 0.6600],
         [0.5700, 0.8500, 0.6400],
         [0.2200, 0.5800, 0.3300],
         [0.7700, 0.2500, 0.1000],
         [0.0500, 0.8000, 0.5500]],

        [[0.4300, 0.1500, 0.8900],
         [0.5500, 0.8700, 0.6600],
         [0.5700, 0.8500, 0.6400],
         [0.2200, 0.5800, 0.3300],
         [0.7700, 0.2500, 0.1000],
         [0.0500, 0.8000, 0.5500]]])

In [42]:
# Two batches each with 6 X 3
# Two sentences each with six words and 3 dimensions

batch.shape

torch.Size([2, 6, 3])

In [57]:
# Casual Attention Class
from torch import nn
class CasualAttentionV1(nn.Module):
    def __init__(self, d_in, d_out, dropout_rate, context_length, bias_units=False):
        super().__init__()
        # Defining the key query value weights
        self.w_key = torch.nn.Linear(d_in, d_out, bias=bias_units)
        self.w_query = torch.nn.Linear(d_in, d_out, bias=bias_units)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=bias_units)
        self.dropout = torch.nn.Dropout(dropout_rate) # new
        # Creating the masking foundation
        self.register_buffer("mask",torch.triu(torch.ones(context_length, context_length), diagonal=1))
    def forward(self, x):
        # Remember we are dealing with batches
        b, num_tokens, d_in = x.shape
        # Getting the key query value matrices
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)
        # Getting the attention scores - we reshape the inner dimensions in the transpose
        attention_scores = queries @ keys.transpose(1, 2)
        # Upper triangular infinity mask - modify the tensor in place
        attention_scores.masked_fill_(
            # Slicing the mask to match the current input  
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        # Scaling attention scores
        scaled_attention_scores = attention_scores / keys.shape[-1]**0.5
        # Calculating the attention weight
        attention_weights = torch.softmax(scaled_attention_scores, dim=-1)
        # Dropout layer
        attention_weights = self.dropout(attention_weights)
        # Calculating the context vectors
        context_vectors = attention_weights @ values
        return context_vectors
        

In [59]:
# Instatiation using our batch sample
batches, context_length, dimensions = batch.shape
ca = CasualAttentionV1(d_in=3, d_out=2, dropout_rate=0.1 ,context_length=context_length)
context_vectors = ca(batch)
context_vectors, context_vectors.shape

(tensor([[[-0.4893, -0.5397],
          [-0.5609, -0.6736],
          [-0.5849, -0.7253],
          [-0.5404, -0.6736],
          [-0.4644, -0.6259],
          [-0.4819, -0.6202]],
 
         [[-0.4893, -0.5397],
          [-0.5609, -0.6736],
          [-0.5849, -0.7253],
          [-0.5404, -0.6736],
          [-0.4644, -0.6259],
          [-0.4819, -0.6202]]], grad_fn=<UnsafeViewBackward0>),
 torch.Size([2, 6, 2]))