# Attention in Transformers

This notebook implements the attention mechanism, the core building block of transformers, step by step in PyTorch: self-attention, masked self-attention, general (encoder-decoder) attention, and multi-head attention.

> Note: This notebook is inspired by the DeepLearning.AI course "Attention in Transformers".

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Self-Attention

In [33]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all derived from the same token embeddings
    through three independent, bias-free ``d_model -> d_model`` linear layers.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        """
        Args:
            d_model: Number of embedding values per token (a.k.a. embedding dimension)
            row_dim: Index of the tensor axis that runs over the tokens
                (the rows of the embedding matrix)
            col_dim: Index of the tensor axis that runs over the embedding
                values of a token (the columns of the embedding matrix)
        """
        super().__init__()
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, token_embeddings):
        """Compute self-attention for a matrix of token embeddings.

        Args:
            token_embeddings: Tensor whose ``row_dim`` axis indexes tokens and
                whose ``col_dim`` axis holds the ``d_model`` embedding values.

        Returns:
            Tensor with the same layout as ``token_embeddings`` in which every
            token is replaced by the attention-weighted sum of the value vectors.
        """
        q = self.W_q(token_embeddings)
        k = self.W_k(token_embeddings)
        v = self.W_v(token_embeddings)

        # Similarity of every query with every key: q @ k^T
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

        # Scale by sqrt(dim(k)). A plain Python float is used as the divisor:
        # wrapping it in torch.tensor(...) would allocate a tensor on every call
        # and round the scale to the default dtype, losing precision for
        # float64 inputs.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        # Normalise the scores over the keys to obtain the attention weights
        attention_weights = F.softmax(scaled_sims, dim=self.col_dim)

        # Weighted sum of the value vectors
        attention_output = torch.matmul(attention_weights, v)

        return attention_output
        

In [34]:
# Calculate self-attention for three example tokens
# create a matrix of token encodings: one row per token, two embedding values each
encodings_matrix = torch.tensor([[1.16, 0.23],
                                 [0.57, 1.36],
                                 [4.41, -2.16]])

# set the seed for the random number generator so the randomly
# initialised projection weights (and the output) are reproducible
torch.manual_seed(42)

# create the self-attention object
self_attention = SelfAttention(d_model=2, row_dim=0, col_dim=1)

# calculate self-attention for the token encodings
self_attention(encodings_matrix)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [35]:
# show the query weights, transposed so they can be right-multiplied
# onto the encodings (encodings @ W_q^T), as done in the next cell
self_attention.W_q.weight.transpose(0,1)

tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)

In [36]:
# compute and print the query vectors by hand: encodings @ W_q^T
q = torch.matmul(encodings_matrix, self_attention.W_q.weight.transpose(0, 1))
print(q)

# compute and print the key vectors by hand: encodings @ W_k^T
k = torch.matmul(encodings_matrix, self_attention.W_k.weight.transpose(0, 1))
print(k)

# compute and print the value vectors by hand: encodings @ W_v^T
v = torch.matmul(encodings_matrix, self_attention.W_v.weight.transpose(0, 1))
print(v)

tensor([[ 0.7621, -0.0428],
        [ 1.1063,  0.7890],
        [ 1.1164, -2.1336]], grad_fn=<MmBackward0>)
tensor([[-0.1469, -0.3038],
        [ 0.1057,  0.3685],
        [-0.9914, -2.4152]], grad_fn=<MmBackward0>)
tensor([[ 0.6038,  0.7434],
        [-0.3502,  0.5303],
        [ 3.8695,  2.4246]], grad_fn=<MmBackward0>)


## Masked Self Attention

In [37]:
class MaskedSelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with an optional mask.

    Queries, keys and values are all derived from the same token embeddings
    through three independent, bias-free ``d_model -> d_model`` linear layers.
    Positions flagged by the mask receive (effectively) zero attention weight.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        """
        Args:
            d_model: Number of embedding values per token (a.k.a. embedding dimension)
            row_dim: Index of the tensor axis that runs over the tokens
            col_dim: Index of the tensor axis that runs over the embedding
                values of a token
        """
        super().__init__()
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, token_embeddings, mask=None):
        """Compute (optionally masked) self-attention.

        Args:
            token_embeddings: Tensor whose ``row_dim`` axis indexes tokens and
                whose ``col_dim`` axis holds the ``d_model`` embedding values.
            mask: Optional boolean tensor broadcastable to the query-by-key
                similarity matrix. ``True`` marks a (query, key) pair that must
                NOT be attended to. ``None`` disables masking.

        Returns:
            Tensor with the same layout as ``token_embeddings`` holding, for
            each token, the attention-weighted sum of the value vectors.
        """
        q = self.W_q(token_embeddings)
        k = self.W_k(token_embeddings)
        v = self.W_v(token_embeddings)

        # Similarity of every query with every key, scaled by sqrt(dim(k)).
        # A plain float divisor avoids allocating a tensor on every call.
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        if mask is not None:
            # Use the most negative finite value of the score dtype instead of
            # a hard-coded -1e9: -1e9 is not representable in float16, where
            # masked_fill raises an overflow error. Unlike -inf, a finite fill
            # also keeps a fully masked row free of NaNs after the softmax.
            fill_value = torch.finfo(scaled_sims.dtype).min
            scaled_sims = scaled_sims.masked_fill(mask, fill_value)

        # Masked positions end up with a softmax weight of zero
        attention_weights = F.softmax(scaled_sims, dim=self.col_dim)

        attention_output = torch.matmul(attention_weights, v)

        return attention_output


In [38]:
## create a matrix of token encodings: one row per token, two embedding values each
encodings_matrix = torch.tensor([[1.16, 0.23],
                                 [0.57, 1.36],
                                 [4.41, -2.16]])

## set the seed for the random number generator so the randomly
## initialised projection weights are reproducible
torch.manual_seed(42)

## create a masked self-attention object
masked_self_attention = MaskedSelfAttention(d_model=2,
                               row_dim=0,
                               col_dim=1)

## create the mask so that we don't use
## tokens that come after a token of interest:
## start from a lower-triangular matrix of ones (1 = allowed to attend) ...
mask = torch.tril(torch.ones(3, 3))
print(mask)
## ... then flip it to a boolean mask in which True marks the
## positions to hide, which is what masked_fill expects
mask = mask == 0
mask # display the boolean mask

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])


tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])

In [39]:
# calculate masked self-attention: each token may only attend to itself
# and to the tokens before it
masked_self_attention(encodings_matrix, mask)

tensor([[ 0.6038,  0.7434],
        [-0.0062,  0.6072],
        [ 3.4989,  2.2427]], grad_fn=<MmBackward0>)

## Attention

In [40]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with separate q/k/v inputs.

    The query, key and value encodings are passed in separately, so the same
    module covers self-attention (all three identical) as well as
    encoder-decoder attention (queries from one sequence, keys and values
    from another). Each input goes through its own bias-free
    ``d_model -> d_model`` linear layer.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        """
        Args:
            d_model: Number of embedding values per token (a.k.a. embedding dimension)
            row_dim: Index of the tensor axis that runs over the tokens
            col_dim: Index of the tensor axis that runs over the embedding
                values of a token
        """
        super().__init__()
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, encodings_for_query, encodings_for_key, encodings_for_value, mask=None):
        """Compute (optionally masked) attention.

        Args:
            encodings_for_query: Token encodings projected into queries.
            encodings_for_key: Token encodings projected into keys.
            encodings_for_value: Token encodings projected into values; must
                have as many tokens as ``encodings_for_key``.
            mask: Optional boolean tensor broadcastable to the query-by-key
                similarity matrix. ``True`` marks a (query, key) pair that must
                NOT be attended to. ``None`` disables masking.

        Returns:
            Tensor with one row per query token holding the attention-weighted
            sum of the value vectors.
        """
        q = self.W_q(encodings_for_query)
        k = self.W_k(encodings_for_key)
        v = self.W_v(encodings_for_value)

        # Similarity of every query with every key, scaled by sqrt(dim(k)).
        # A plain float divisor avoids allocating a tensor on every call.
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        if mask is not None:
            # Use the most negative finite value of the score dtype instead of
            # a hard-coded -1e9: -1e9 is not representable in float16, where
            # masked_fill raises an overflow error. Unlike -inf, a finite fill
            # also keeps a fully masked row free of NaNs after the softmax.
            fill_value = torch.finfo(scaled_sims.dtype).min
            scaled_sims = scaled_sims.masked_fill(mask, fill_value)

        # Masked positions end up with a softmax weight of zero
        attention_weights = F.softmax(scaled_sims, dim=self.col_dim)
        attention_output = torch.matmul(attention_weights, v)

        return attention_output

Calculate Encoder-Decoder Attention

In [50]:
## create matrices of token encodings for the queries, keys and values.
## All three are identical here, so the result below matches the
## self-attention output computed earlier in the notebook.
encodings_for_q = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])

encodings_for_k = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])

encodings_for_v = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])

## set the seed for the random number generator so the randomly
## initialised projection weights are reproducible
torch.manual_seed(42)

## create an attention object
attention = Attention(d_model=2,
                      row_dim=0,
                      col_dim=1)

## calculate attention through the separate query/key/value interface (no mask)
attention(encodings_for_q, encodings_for_k, encodings_for_v, None)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

## Multi-Head Attention

In [51]:
class MultiHeadAttention(nn.Module):
    """Run several independent ``Attention`` heads and join their outputs.

    Every head receives the same query/key/value encodings and the same mask;
    the per-head results are concatenated along ``col_dim``.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1, num_heads=2):
        """
        Args:
            d_model: Number of embedding values per token
            row_dim: Index of the tensor axis that runs over the tokens
            col_dim: Index of the tensor axis that runs over the embedding
                values; also the axis along which head outputs are joined
            num_heads: Number of attention heads to create
        """
        super().__init__()
        # Build the heads one after another and register them as sub-modules
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(
                Attention(d_model=d_model, row_dim=row_dim, col_dim=col_dim)
            )
        self.heads = nn.ModuleList(head_modules)
        self.col_dim = col_dim

    def forward(self, encodings_for_query, encodings_for_key, encodings_for_value, mask=None):
        """Apply every head to the same inputs and concatenate the results."""
        per_head_outputs = []
        for head in self.heads:
            head_output = head(encodings_for_query, encodings_for_key, encodings_for_value, mask)
            per_head_outputs.append(head_output)

        # Join the head outputs side by side along the embedding axis
        return torch.cat(per_head_outputs, dim=self.col_dim)

Calculate multi-head attention


with 1 head

In [54]:
## set the seed for the random number generator so the randomly
## initialised projection weights are reproducible
torch.manual_seed(42)

## create a multi-head attention object with a single head
multi_head_attention = MultiHeadAttention(d_model=2,
                                        row_dim=0,
                                        col_dim=1,
                                        num_heads=1)

# calculate multi-head attention, reusing the query/key/value encodings
# defined in the attention example above
multi_head_attention(encodings_for_q, encodings_for_k, encodings_for_v)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<CatBackward0>)

Multihead attention with 2 heads

In [56]:
# reset the seed so the randomly initialised weights are reproducible
torch.manual_seed(42)

# create a multi-head attention object with two heads
multi_head_attention = MultiHeadAttention(d_model=2,
                                        row_dim=0,
                                        col_dim=1,
                                        num_heads=2)

# calculate multi-head attention; the outputs of the two heads are
# concatenated along the columns, reusing the encodings defined earlier
multi_head_attention(encodings_for_q, encodings_for_k, encodings_for_v)

tensor([[ 1.0100,  1.0641, -0.7081, -0.8268],
        [ 0.2040,  0.7057, -0.7417, -0.9193],
        [ 3.4989,  2.2427, -0.7190, -0.8447]], grad_fn=<CatBackward0>)