# Build a transformer encoder block from scratch
We are going to reproduce most (maybe about 70%) of what the following gives you:  
"""  
from transformers import BertModel  
model = BertModel.from_pretrained('bert-base-uncased')  
"""  
Let's start with a naive version first!

In [1]:
import torch 
import torch.nn as nn
class NaiveTransformerLayer(nn.Module):
    """Bare skeleton of a PyTorch module: a constructor and a ``forward`` method."""

    def __init__(self):
        # nn.Module keeps its own bookkeeping (parameters, sub-modules, hooks),
        # which must be initialised before the instance can be used.
        super().__init__()

    def forward(self, x):
        """Placeholder for the layer's computation; does nothing yet."""
        # The method must be named exactly ``forward``: calling ``layer(x)``
        # dispatches to it, whereas a ``__forward`` method would be
        # name-mangled by Python and never reached.
        pass

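Every PyTorch module follows this basic structure: a class derived from `nn.Module`, with an `__init__` method that creates the layers and a forward method that defines the computation.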

In [None]:
import torch 
import torch.nn as nn
class NaiveTransformerLayer(nn.Module):
    """Outline of one transformer layer: self-attention, then a feed-forward network.

    ``SelfAttention`` and ``FFN`` are not defined on this version of the
    class, so ``forward`` only documents the intended data flow and cannot
    run yet.
    """

    def __init__(self):
        super().__init__()
        self.dim = 768  # hidden dimension d of every token vector

    def forward(self, x):
        """Run the input through both sub-layers.

        input tensor x: nxd
        output tensor out: nxd
        """
        # Named ``forward`` (not ``__forward``) so that ``layer(x)`` reaches it.
        x = self.SelfAttention(x)
        x = self.FFN(x)
        out = x
        return out

In a transformer block, the input passes through two sub-layers: a self-attention layer and a feed-forward network (FFN).  
Let's build the self-attention layer first.

In [None]:
import torch 
import torch.nn as nn
import math
class NaiveTransformerLayer(nn.Module):
    """Single-head transformer layer with the self-attention sub-layer in place.

    ``forward`` already chains ``SelfAttention`` and ``FFN``, but ``FFN`` is
    not defined on this version of the class, so only ``SelfAttention`` can
    be called at this stage.
    """

    def __init__(self):
        super().__init__()
        self.dim = 768  # hidden dimension d of every token vector
        # Bias-free projections that produce queries, keys and values.
        self.Wq = nn.Linear(self.dim, self.dim, bias=False)
        self.Wk = nn.Linear(self.dim, self.dim, bias=False)
        self.Wv = nn.Linear(self.dim, self.dim, bias=False)
        # Normalisation applied after the residual connection.
        self.lm = nn.LayerNorm(self.dim)

    def SelfAttention(self, x):
        """Single-head scaled dot-product self-attention with residual + LayerNorm.

        input x: nxd
        output nxd
        """
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)
        # n x n matrix of query/key similarities, scaled by sqrt(d).
        scores = torch.mm(Q, K.transpose(0, 1)) / math.sqrt(self.dim)
        # Normalise over the keys so that every query row sums to 1.
        weights = torch.softmax(scores, dim=1)
        attended = torch.mm(weights, V)
        return self.lm(x + attended)

    def forward(self, x):
        """Run the input through both sub-layers.

        input tensor x: nxd
        output tensor out: nxd
        """
        # Named ``forward`` (not ``__forward``) so that ``layer(x)`` reaches it.
        x = self.SelfAttention(x)
        x = self.FFN(x)
        out = x
        return out

Here we have built the SelfAttention layer.

1. Linear Transformations:
   - $Q = W_q(x)$
   - $K = W_k(x)$
   - $V = W_v(x)$

2. Attention Score Calculation:
   - $AttentionScore = \frac{Q \cdot K^T}{\sqrt{dim}}$

3. Softmax Activation:
   - $AttentionWeights = Softmax(AttentionScore)$

4. Weighted Sum of Values:
   - $AttentionOutput = AttentionWeights \cdot V$

5. Residual Connection and Layer Normalization:
   - $output = lm(x + AttentionOutput)$

In [None]:
import torch 
import torch.nn as nn
import math
class NaiveTransformerLayer(nn.Module):
    """Single-head transformer layer for one unbatched sequence (n x d).

    Two sub-layers, each wrapped in a residual connection followed by its
    own LayerNorm: self-attention, then a position-wise feed-forward network.
    """

    def __init__(self):
        super().__init__()
        self.dim = 768  # hidden dimension d of every token vector
        # Bias-free projections that produce queries, keys and values.
        self.Wq = nn.Linear(self.dim, self.dim, bias=False)
        self.Wk = nn.Linear(self.dim, self.dim, bias=False)
        self.Wv = nn.Linear(self.dim, self.dim, bias=False)
        self.lm = nn.LayerNorm(self.dim)  # LayerNorm of the attention sub-layer
        # Feed-forward network: expand to 4*d, apply GELU, project back to d.
        self.ffn1 = nn.Linear(self.dim, self.dim * 4)
        self.ffn2 = nn.Linear(self.dim * 4, self.dim)
        self.act = nn.GELU()
        self.lm_ffn = nn.LayerNorm(self.dim)  # LayerNorm of the FFN sub-layer

    def SelfAttention(self, x):
        """Single-head scaled dot-product self-attention with residual + LayerNorm.

        input x: nxd
        output nxd
        """
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)
        # n x n matrix of query/key similarities, scaled by sqrt(d).
        scores = torch.mm(Q, K.transpose(0, 1)) / math.sqrt(self.dim)
        # Normalise over the keys so that every query row sums to 1.
        weights = torch.softmax(scores, dim=1)
        attended = torch.mm(weights, V)
        return self.lm(x + attended)

    def FFN(self, x):
        """Position-wise feed-forward network with residual + LayerNorm.

        input x: nxd
        output nxd
        """
        hidden = self.act(self.ffn1(x))
        projected = self.ffn2(hidden)
        return self.lm_ffn(x + projected)

    def forward(self, x):
        """Apply self-attention and then the feed-forward network.

        input tensor x: nxd
        output tensor out: nxd
        """
        # Named ``forward`` (not ``__forward``) so that ``layer(x)`` reaches it.
        x = self.SelfAttention(x)
        x = self.FFN(x)
        out = x
        return out

Here we have built the Feed Forward Layer.  
1. Linear Transformations and Activation Function:
   - $hidden = W_1(x)+b_1$   mapping from dimension dim to dim*4
   - $hidden = GELU(hidden)$ 
   - $output = W_2(hidden)+b_2$   mapping from dimension dim*4 to dim

2. Residual Connection and Layer Normalization:
   - $output = lm_{ffn}(x + output)$  a separate LayerNorm (`lm_ffn`) is used because this sub-layer has its own set of parameters 

Now we have a naive version of transformer architecture.  
Let's take a look at what is still missing compared with a real transformer architecture.  
- batch
- dropout 
- multi-head attention
- attention mask / padding mask

In [None]:
 #batch
import torch 
import torch.nn as nn
import math
class NaiveTransformerLayer(nn.Module):
    """Single-head transformer layer operating on a batch of sequences (b x n x d).

    Two sub-layers, each wrapped in a residual connection followed by its
    own LayerNorm: self-attention, then a position-wise feed-forward network.
    """

    def __init__(self):
        super().__init__()
        self.dim = 768  # hidden dimension d of every token vector
        # Bias-free projections that produce queries, keys and values.
        self.Wq = nn.Linear(self.dim, self.dim, bias=False)
        self.Wk = nn.Linear(self.dim, self.dim, bias=False)
        self.Wv = nn.Linear(self.dim, self.dim, bias=False)
        self.lm = nn.LayerNorm(self.dim)  # LayerNorm of the attention sub-layer
        # Feed-forward network: expand to 4*d, apply GELU, project back to d.
        self.ffn1 = nn.Linear(self.dim, self.dim * 4)
        self.ffn2 = nn.Linear(self.dim * 4, self.dim)
        self.act = nn.GELU()
        self.lm_ffn = nn.LayerNorm(self.dim)  # LayerNorm of the FFN sub-layer

    def SelfAttention(self, x):
        """Batched single-head self-attention with residual + LayerNorm.

        input x: bxnxd
        output bxnxd
        """
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)
        # bmm multiplies per batch element: (b, n, d) x (b, d, n) -> (b, n, n).
        scores = torch.bmm(Q, K.transpose(1, 2)) / math.sqrt(self.dim)
        # Normalise over the keys (last axis) so every query row sums to 1.
        weights = torch.softmax(scores, dim=2)
        attended = torch.bmm(weights, V)
        return self.lm(x + attended)

    def FFN(self, x):
        """Position-wise feed-forward network with residual + LayerNorm.

        input x: bxnxd
        output bxnxd
        """
        hidden = self.act(self.ffn1(x))
        projected = self.ffn2(hidden)
        return self.lm_ffn(x + projected)

    def forward(self, x):
        """Apply self-attention and then the feed-forward network.

        input x: bxnxd
        output out: bxnxd
        """
        # Named ``forward`` (not ``__forward``) so that ``layer(x)`` reaches it.
        x = self.SelfAttention(x)
        x = self.FFN(x)
        out = x
        return out

In [None]:
# drop out
import torch 
import torch.nn as nn
import math
class NaiveTransformerLayer(nn.Module):
    """Batched single-head transformer layer with dropout.

    Dropout is applied to the attention weights and to the output of each
    sub-layer before it is added back to the residual stream.
    """

    def __init__(self):
        super().__init__()
        self.dim = 768  # hidden dimension d of every token vector
        self.att_drop_prob = 0.1  # dropout probability on the attention weights
        self.state_drop_prob = 0.5  # dropout probability on sub-layer outputs
        # Bias-free projections that produce queries, keys and values.
        self.Wq = nn.Linear(self.dim, self.dim, bias=False)
        self.Wk = nn.Linear(self.dim, self.dim, bias=False)
        self.Wv = nn.Linear(self.dim, self.dim, bias=False)
        self.lm = nn.LayerNorm(self.dim)  # LayerNorm of the attention sub-layer
        # Feed-forward network: expand to 4*d, apply GELU, project back to d.
        self.ffn1 = nn.Linear(self.dim, self.dim * 4)
        self.ffn2 = nn.Linear(self.dim * 4, self.dim)
        self.act = nn.GELU()
        self.lm_ffn = nn.LayerNorm(self.dim)  # LayerNorm of the FFN sub-layer
        self.att_drop = nn.Dropout(self.att_drop_prob)
        self.state_drop = nn.Dropout(self.state_drop_prob)

    def SelfAttention(self, x):
        """Batched single-head self-attention with dropout, residual and LayerNorm.

        input x: bxnxd
        output bxnxd
        """
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)
        # bmm multiplies per batch element: (b, n, d) x (b, d, n) -> (b, n, n).
        scores = torch.bmm(Q, K.transpose(1, 2)) / math.sqrt(self.dim)
        # Normalise over the keys (last axis), then randomly drop some weights.
        weights = self.att_drop(torch.softmax(scores, dim=2))
        attended = self.state_drop(torch.bmm(weights, V))
        return self.lm(x + attended)

    def FFN(self, x):
        """Position-wise feed-forward network with dropout, residual and LayerNorm.

        input x: bxnxd
        output bxnxd
        """
        hidden = self.act(self.ffn1(x))
        projected = self.state_drop(self.ffn2(hidden))
        return self.lm_ffn(x + projected)

    def forward(self, x):
        """Apply self-attention and then the feed-forward network.

        input x: bxnxd
        output out: bxnxd
        """
        # Named ``forward`` (not ``__forward``) so that ``layer(x)`` reaches it.
        x = self.SelfAttention(x)
        x = self.FFN(x)
        out = x
        return out

Now we have added batching and dropout.  
Next, let's deal with multi-head attention.

In [None]:
import torch 
import torch.nn as nn
import math
class MultiTransformerLayer(nn.Module):
    """Batched transformer layer with multi-head self-attention and dropout.

    The hidden dimension d is split into ``num_heads`` heads of
    ``size_per_head`` features each. Every head attends independently, and
    the head outputs are concatenated and mixed by the ``map`` projection.
    """

    def __init__(self):
        super().__init__()
        self.dim = 768  # hidden dimension d of every token vector
        self.att_drop_prob = 0.1  # dropout probability on the attention weights
        self.state_drop_prob = 0.5  # dropout probability on sub-layer outputs
        self.num_heads = 12
        self.size_per_head = self.dim // self.num_heads  # 64
        # Each projection produces the features of all heads at once (h * s).
        self.Wq = nn.Linear(self.dim, self.num_heads * self.size_per_head, bias=False)
        self.Wk = nn.Linear(self.dim, self.num_heads * self.size_per_head, bias=False)
        self.Wv = nn.Linear(self.dim, self.num_heads * self.size_per_head, bias=False)
        # Output projection applied to the concatenated heads.
        self.map = nn.Linear(self.num_heads * self.size_per_head, self.dim)
        self.lm = nn.LayerNorm(self.dim)  # LayerNorm of the attention sub-layer
        # Feed-forward network: expand to 4*d, apply GELU, project back to d.
        self.ffn1 = nn.Linear(self.dim, self.dim * 4)
        self.ffn2 = nn.Linear(self.dim * 4, self.dim)
        self.act = nn.GELU()
        self.lm_ffn = nn.LayerNorm(self.dim)  # LayerNorm of the FFN sub-layer
        self.att_drop = nn.Dropout(self.att_drop_prob)
        self.state_drop = nn.Dropout(self.state_drop_prob)

    def SelfAttention(self, x):
        """Multi-head scaled dot-product self-attention with residual + LayerNorm.

        input x: bxnxd
        Q, K, V : bxnx(hxs) -> bxnxhxs -> bxhxnxs
        output bxnxd
        """
        batch, seq_len = x.size(0), x.size(1)
        per_head = (batch, seq_len, self.num_heads, self.size_per_head)  # b, n, h, s
        Q = self.Wq(x).view(per_head).permute(0, 2, 1, 3)
        K = self.Wk(x).view(per_head).permute(0, 2, 1, 3)
        V = self.Wv(x).view(per_head).permute(0, 2, 1, 3)
        # Each head's dot product sums over s features, so the scale is
        # sqrt(size_per_head), not sqrt(dim).
        scores = torch.matmul(Q, K.transpose(2, 3)) / math.sqrt(self.size_per_head)
        weights = self.att_drop(torch.softmax(scores, dim=3))
        context = torch.matmul(weights, V)  # bxhxnxs
        # Concatenate the heads: bxhxnxs -> bxnxhxs -> bxnx(hxs). ``map``
        # expects h*s input features, so the heads must be merged first.
        context = context.permute(0, 2, 1, 3).reshape(
            batch, seq_len, self.num_heads * self.size_per_head
        )
        projected = self.state_drop(self.map(context))  # bxnxd
        return self.lm(x + projected)

    def FFN(self, x):
        """Position-wise feed-forward network with dropout, residual and LayerNorm.

        input x: bxnxd
        output bxnxd
        """
        hidden = self.act(self.ffn1(x))
        projected = self.state_drop(self.ffn2(hidden))
        return self.lm_ffn(x + projected)

    def forward(self, x):
        """Apply multi-head self-attention and then the feed-forward network.

        input x: bxnxd
        output out: bxnxd
        """
        # Named ``forward`` (not ``__forward``) so that ``layer(x)`` reaches it.
        x = self.SelfAttention(x)
        x = self.FFN(x)
        out = x
        return out

Thank you for following along this far! We are very close to finishing the code. So far we have built almost every part of a transformer block. The only thing left is the attention mask / padding mask. Let's finish it!

In [5]:
import torch 
import torch.nn as nn
import math
class MultiTransformerLayer(nn.Module):
    """Batched multi-head transformer layer with dropout and padding mask.

    The hidden dimension d is split into ``num_heads`` heads of
    ``size_per_head`` features each. Positions flagged 0 in the attention
    mask receive a large negative score, so they get (numerically) zero
    attention weight.
    """

    def __init__(self):
        super().__init__()
        self.dim = 768  # hidden dimension d of every token vector
        self.att_drop_prob = 0.1  # dropout probability on the attention weights
        self.state_drop_prob = 0.5  # dropout probability on sub-layer outputs
        self.num_heads = 12
        self.size_per_head = self.dim // self.num_heads  # 64
        # Each projection produces the features of all heads at once (h * s).
        self.Wq = nn.Linear(self.dim, self.num_heads * self.size_per_head, bias=False)
        self.Wk = nn.Linear(self.dim, self.num_heads * self.size_per_head, bias=False)
        self.Wv = nn.Linear(self.dim, self.num_heads * self.size_per_head, bias=False)
        # Output projection applied to the concatenated heads.
        self.map = nn.Linear(self.num_heads * self.size_per_head, self.dim)
        self.lm = nn.LayerNorm(self.dim)  # LayerNorm of the attention sub-layer
        # Feed-forward network: expand to 4*d, apply GELU, project back to d.
        self.ffn1 = nn.Linear(self.dim, self.dim * 4)
        self.ffn2 = nn.Linear(self.dim * 4, self.dim)
        self.act = nn.GELU()
        self.lm_ffn = nn.LayerNorm(self.dim)  # LayerNorm of the FFN sub-layer
        self.att_drop = nn.Dropout(self.att_drop_prob)
        self.state_drop = nn.Dropout(self.state_drop_prob)

    def calc_mask_score(self, attention_mask):
        """Turn a 0/1 padding mask into additive attention-score penalties.

        input bxn (1 -> normal token, 0 -> masked token)
        output bxhxnxn (0 for normal keys, -10000 for masked keys)
        """
        key_mask = attention_mask[:, None, None, :]  # bx1x1xn, one flag per key
        if not key_mask.is_floating_point():
            # Integer / bool masks are converted so that ``1.0 - mask`` is defined.
            key_mask = key_mask.to(torch.get_default_dtype())
        penalty = (1.0 - key_mask) * -10000.0
        batch, seq_len = attention_mask.size(0), attention_mask.size(1)
        # ``expand`` yields the documented shape as a broadcast view: no extra
        # memory is allocated and the result stays on the mask's device.
        return penalty.expand(batch, self.num_heads, seq_len, seq_len)

    def SelfAttention(self, x, attention_mask=None):
        """Multi-head scaled dot-product self-attention with residual + LayerNorm.

        input x: bxnxd
        Q, K, V : bxnx(hxs) -> bxnxhxs -> bxhxnxs
        attention_mask: bxn, or None to attend to every position
            1 -> normal token
            0 -> masked token
        output bxnxd
        """
        batch, seq_len = x.size(0), x.size(1)
        per_head = (batch, seq_len, self.num_heads, self.size_per_head)  # b, n, h, s
        Q = self.Wq(x).view(per_head).permute(0, 2, 1, 3)
        K = self.Wk(x).view(per_head).permute(0, 2, 1, 3)
        V = self.Wv(x).view(per_head).permute(0, 2, 1, 3)
        # Each head's dot product sums over s features, so the scale is
        # sqrt(size_per_head), not sqrt(dim).
        scores = torch.matmul(Q, K.transpose(2, 3)) / math.sqrt(self.size_per_head)
        if attention_mask is not None:
            # Match the scores' dtype and device before adding the penalties.
            scores = scores + self.calc_mask_score(attention_mask).to(scores)
        weights = self.att_drop(torch.softmax(scores, dim=3))
        context = torch.matmul(weights, V)  # bxhxnxs
        # Concatenate the heads: bxhxnxs -> bxnxhxs -> bxnx(hxs). ``map``
        # expects h*s input features, so the heads must be merged first.
        context = context.permute(0, 2, 1, 3).reshape(
            batch, seq_len, self.num_heads * self.size_per_head
        )
        projected = self.state_drop(self.map(context))  # bxnxd
        return self.lm(x + projected)

    def FFN(self, x):
        """Position-wise feed-forward network with dropout, residual and LayerNorm.

        input x: bxnxd
        output bxnxd
        """
        hidden = self.act(self.ffn1(x))
        projected = self.state_drop(self.ffn2(hidden))
        return self.lm_ffn(x + projected)

    def forward(self, x, attention_mask=None):
        """Apply masked multi-head self-attention and then the feed-forward network.

        input x: bxnxd
        attention_mask: bxn (1 -> normal token, 0 -> masked token), or None
        output out: bxnxd
        """
        # Named ``forward`` (not ``__forward``) so that ``layer(x, mask)``
        # reaches it; the mask is passed on to the attention sub-layer.
        x = self.SelfAttention(x, attention_mask)
        x = self.FFN(x)
        out = x
        return out