# Coding Self-Attention in PyTorch!!!


In [1]:
import torch ## torch let's us create tensors and also provides helper functions
import torch.nn as nn ## torch.nn gives us nn.module() and nn.Linear()
import torch.nn.functional as F # This gives us the softmax()

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [11]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are three separate bias-free linear projections
    of the same token encodings.

    Args:
        d_model: Number of values per token encoding; also the output width
            of each projection.
        row_dim: Index of the token (row) dimension of the input.
        column_dim: Index of the encoding (column) dimension of the input.
    """

    def __init__(self, d_model=2, row_dim=0, column_dim=1):
        super().__init__()

        # Built in query, key, value order: each nn.Linear draws its initial
        # weights from the global RNG, so the order matters for seeded runs.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

        self.row_dim = row_dim
        self.column_dim = column_dim

    def forward(self, token_encoding):
        """Return the attention output for ``token_encoding``.

        Computes ``softmax(Q @ K^T / sqrt(d_k)) @ V`` where ``d_k`` is the size
        of the keys along ``column_dim`` and the softmax runs along
        ``column_dim``.
        """
        queries = self.W_q(token_encoding)
        keys = self.W_k(token_encoding)
        values = self.W_v(token_encoding)

        # Swap the token and encoding dimensions of the keys so that
        # queries @ keys_t holds one similarity per (query, key) pair.
        keys_t = keys.transpose(self.row_dim, self.column_dim)
        scale = torch.tensor(keys.size(self.column_dim) ** 0.5)

        weights = F.softmax((queries @ keys_t) / scale, dim=self.column_dim)
        return weights @ values


In [None]:
# Toy input: a 3 x 2 matrix, i.e. three token encodings of d_model = 2 values each.
encoding_matrix = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])
print(f"encoding_matrix:\n ",encoding_matrix)

# Seed the RNG before building the module so that its randomly initialised
# weights, and therefore the output displayed below, are reproducible.
torch.manual_seed(42)
selfAttention = SelfAttention(d_model=2, row_dim=0, column_dim=1)

# Last expression of the cell, so the notebook displays the attention output.
selfAttention(encoding_matrix)

encoding_matrix:
  tensor([[ 1.1600,  0.2300],
        [ 0.5700,  1.3600],
        [ 4.4100, -2.1600]])


tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [14]:
# Inspect the query weights created above. nn.Linear stores its weight as
# (out_features, in_features), so the transpose is the matrix that the
# encodings are right-multiplied by.
selfAttention.W_q.weight.transpose(0,1)

tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)

# Coding Masked Self-Attention in PyTorch!!!


In [None]:
import torch ## torch let's us create tensors and also provides helper functions
import torch.nn as nn ## torch.nn gives us nn.module() and nn.Linear()
import torch.nn.functional as F # This gives us the softmax()

In [17]:
class MaskedSelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with an optional mask.

    Queries, keys and values are three separate bias-free linear projections
    of the same token encodings. Positions where the mask is True are excluded
    from the softmax, e.g. to stop a token from attending to later tokens.

    Args:
        d_model: Number of values per token encoding; also the output width
            of each projection.
        row_dim: Index of the token (row) dimension of the input.
        column_dim: Index of the encoding (column) dimension of the input.
    """

    def __init__(self, d_model=2, row_dim=0, column_dim=1):
        super().__init__()

        # Built in query, key, value order: each nn.Linear draws its initial
        # weights from the global RNG, so the order matters for seeded runs.
        self.W_q = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)
        self.W_k = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)
        self.W_v = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)

        self.row_dim = row_dim
        self.column_dim = column_dim

    def forward(self, token_encoding, mask=None):
        """Return the (optionally masked) attention output.

        Args:
            token_encoding: Token encodings to attend over.
            mask: Optional boolean tensor broadcastable to the similarity
                matrix; True marks the positions to hide. ``None`` disables
                masking.

        Returns:
            ``softmax(Q @ K^T / sqrt(d_k)) @ V`` with masked positions given
            zero weight, where the softmax runs along ``column_dim``.
        """
        q = self.W_q(token_encoding)
        k = self.W_k(token_encoding)
        v = self.W_v(token_encoding)

        sims = torch.matmul(q, k.transpose(dim0=self.row_dim,
                                           dim1=self.column_dim))

        # A plain Python float is enough here; no need to allocate a tensor
        # for the scale on every call.
        scaled_sims = sims / (k.size(self.column_dim) ** 0.5)

        if mask is not None:
            # Use the most negative finite value of the score dtype rather
            # than a hard-coded -1e9, which cannot be represented in float16.
            # Staying finite (instead of -inf) keeps a fully masked row from
            # turning into NaN after the softmax.
            fill_value = torch.finfo(scaled_sims.dtype).min
            scaled_sims = scaled_sims.masked_fill(mask=mask,
                                                  value=fill_value)

        attention_percents = F.softmax(scaled_sims, dim=self.column_dim)
        attention_scores = torch.matmul(attention_percents, v)

        return attention_scores


In [None]:
# Toy input: a 3 x 2 matrix, i.e. three token encodings of d_model = 2 values each.
encoding_matrix = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])
print(f"encoding_matrix:\n ",encoding_matrix)

# Seed the RNG before building the module so that its randomly initialised
# weights, and therefore the output displayed below, are reproducible.
torch.manual_seed(42)
maskedSelfAttention = MaskedSelfAttention(d_model=2, row_dim=0, column_dim=1)
# tril keeps ones on and below the diagonal; comparing with 0 turns that into
# a boolean mask that is True strictly above the diagonal. Those True entries
# are the ones masked_fill overwrites, so each token ignores later tokens.
mask = torch.tril(torch.ones(3,3))
mask=mask==0

# Last expression of the cell, so the notebook displays the masked output.
maskedSelfAttention(encoding_matrix, mask)

encoding_matrix:
  tensor([[ 1.1600,  0.2300],
        [ 0.5700,  1.3600],
        [ 4.4100, -2.1600]])


tensor([[ 0.6038,  0.7434],
        [-0.0062,  0.6072],
        [ 3.4989,  2.2427]], grad_fn=<MmBackward0>)

# Coding Encoder-Decoder Attention and Multi-Head Attention

In [1]:
import torch ## torch let's us create tensors and also provides helper functions
import torch.nn as nn ## torch.nn gives us nn.module() and nn.Linear()
import torch.nn.functional as F # This gives us the softmax()

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with separate Q/K/V inputs.

    Because the encodings used for the queries, keys and values are passed in
    separately, the same module covers self-attention (all three identical),
    masked self-attention (with ``mask``) and encoder-decoder attention
    (queries from one sequence, keys and values from another).

    Args:
        d_model: Number of values per token encoding; also the output width
            of each projection.
        row_dim: Index of the token (row) dimension of the inputs.
        column_dim: Index of the encoding (column) dimension of the inputs.
    """

    def __init__(self, d_model=2, row_dim=0, column_dim=1):
        super().__init__()

        # Built in query, key, value order: each nn.Linear draws its initial
        # weights from the global RNG, so the order matters for seeded runs.
        self.W_q = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)
        self.W_k = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)
        self.W_v = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)

        self.row_dim = row_dim
        self.column_dim = column_dim

    def forward(self,
                encoding__for_q,
                encoding__for_k,
                encoding__for_v,
                mask=None):
        """Return the (optionally masked) attention output.

        Args:
            encoding__for_q: Encodings projected into queries.
            encoding__for_k: Encodings projected into keys.
            encoding__for_v: Encodings projected into values.
            mask: Optional boolean tensor broadcastable to the similarity
                matrix; True marks the positions to hide. ``None`` disables
                masking.

        Returns:
            ``softmax(Q @ K^T / sqrt(d_k)) @ V`` with masked positions given
            zero weight, where the softmax runs along ``column_dim``.
        """
        q = self.W_q(encoding__for_q)
        k = self.W_k(encoding__for_k)
        v = self.W_v(encoding__for_v)

        sims = torch.matmul(q, k.transpose(dim0=self.row_dim,
                                           dim1=self.column_dim))

        # A plain Python float is enough here; no need to allocate a tensor
        # for the scale on every call.
        scaled_sims = sims / (k.size(self.column_dim) ** 0.5)

        if mask is not None:
            # Use the most negative finite value of the score dtype rather
            # than a hard-coded -1e9, which cannot be represented in float16.
            # Staying finite (instead of -inf) keeps a fully masked row from
            # turning into NaN after the softmax.
            fill_value = torch.finfo(scaled_sims.dtype).min
            scaled_sims = scaled_sims.masked_fill(mask=mask,
                                                  value=fill_value)

        attention_percents = F.softmax(scaled_sims, dim=self.column_dim)
        attention_scores = torch.matmul(attention_percents, v)

        return attention_scores

In [3]:
# The same 3 x 2 encodings are used for queries, keys and values, so this call
# reduces to plain self-attention.
encoding_for_q = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])
encoding_for_k = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])
encoding_for_v = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])

# Seed the RNG before building the module so that its randomly initialised
# weights, and therefore the output displayed below, are reproducible.
torch.manual_seed(42)
attention = Attention(d_model=2, row_dim=0, column_dim=1)
# Boolean mask that is True strictly above the diagonal.
# NOTE(review): this mask is built but not passed to the call below, so the
# displayed result is the unmasked attention output.
mask = torch.tril(torch.ones(3,3))
mask=mask==0

attention(encoding_for_q,
          encoding_for_k,
          encoding_for_v)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [4]:
# Now implement multihead class
class MultiHeadAttention(nn.Module):
    """Run several independent ``Attention`` heads and concatenate their outputs.

    Every head receives the same inputs and has its own query, key and value
    weights. The per-head outputs are concatenated along ``column_dim``, so
    the result is ``num_heads`` times as wide as a single head's output.

    Args:
        d_model: Number of values per token encoding, passed to every head.
        num_heads: Number of attention heads to create.
        row_dim: Index of the token (row) dimension of the inputs.
        column_dim: Index of the encoding (column) dimension of the inputs;
            also the dimension along which head outputs are concatenated.
    """

    def __init__(self, d_model=2, num_heads=2, row_dim=0, column_dim=1):

        super().__init__()

        # nn.ModuleList (rather than a plain list) registers each head as a
        # submodule so its parameters are tracked by this module.
        self.heads = nn.ModuleList(
            [Attention(d_model,
                       row_dim,
                       column_dim)
             for _ in range(num_heads)])
        self.column_dim = column_dim

    def forward(self,
                encoding__for_q,
                encoding__for_k,
                encoding__for_v,
                mask=None,
                ):
        """Return the concatenated outputs of all heads.

        Args:
            encoding__for_q: Encodings projected into queries by each head.
            encoding__for_k: Encodings projected into keys by each head.
            encoding__for_v: Encodings projected into values by each head.
            mask: Optional boolean mask forwarded unchanged to every head;
                True marks the positions to hide. ``None`` (the default)
                disables masking.
        """
        head_outputs = [head(encoding__for_q,
                             encoding__for_k,
                             encoding__for_v,
                             mask=mask)
                        for head in self.heads]

        return torch.cat(head_outputs, dim=self.column_dim)

In [None]:
# Sanity check with a single head: re-seeding with the same value before
# construction reproduces the output of the standalone Attention module above.
torch.manual_seed(42)
multiHeadAttention=MultiHeadAttention(d_model=2, num_heads=1, row_dim=0, column_dim=1)
multiHeadAttention(encoding_for_q,
                   encoding_for_k,
                   encoding_for_v)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<CatBackward0>)

In [6]:
# Now with two heads: the heads' outputs are concatenated along column_dim,
# so the result has 4 columns and the first two match the single-head output.
torch.manual_seed(42)
multiHeadAttention=MultiHeadAttention(d_model=2, num_heads=2, row_dim=0, column_dim=1)
multiHeadAttention(encoding_for_q,
                   encoding_for_k,
                   encoding_for_v)  

tensor([[ 1.0100,  1.0641, -0.7081, -0.8268],
        [ 0.2040,  0.7057, -0.7417, -0.9193],
        [ 3.4989,  2.2427, -0.7190, -0.8447]], grad_fn=<CatBackward0>)