In [1]:
import torch 
from torch import nn
from torch.nn import functional
import numpy as np

In [26]:
class SelfAttentionLayer(nn.Module):
    
    def __init__(self, emb_size, key_size, value_size):
        """
        This layer will take B x E as input and return 
        B X 1 as output.
        """
        super(SelfAttentionLayer, self).__init__()        
        self.emb_size = emb_size
        
        self.query_w = nn.Parameter(data = torch.Tensor(emb_size, key_size), requires_grad=True)
        self.key_w = nn.Parameter(data = torch.Tensor(emb_size, key_size), requires_grad=True)
        self.value_w = nn.Parameter(data = torch.Tensor(emb_size, value_size), requires_grad=True)
        
        self.query_w.data.uniform_(-1, 1)
        self.key_w.data.uniform_(-1, 1)
        self.value_w.data.uniform_(-1, 1)
        
    def forward(self, X):
        # Created Latent vectors
        query = X @ self.query_w
        key = X @ self.key_w
        value = X @ self.value_w
        
        # Step 1: Take dot product query x key
        query_key = query @ key.T
        
        # Step 2: Scale Down 
        query_key = query_key / np.sqrt(self.emb_size)
        
        # Step 3: Softmax 
        query_key = functional.softmax(query_key)
        
        # Step 4: Dot product with value
        z = query_key @ value 
        return z

In [29]:
attention = SelfAttentionLayer(100, 64, 128)

In [30]:
attention(torch.randn(1, 100))

  query_key = functional.softmax(query_key)


tensor([[ -5.0843,   9.4798,   1.0584,  11.8069,   3.5864,   4.8195,   1.7043,
          -7.1996,  -6.0194,  -0.6405,  -6.3124,  -8.5351,   3.1925,   5.4533,
          -0.8233,  -0.4762,   4.7229,  10.0604,  -1.3713,   9.5223,  -4.7797,
           4.7757,   0.1745,   4.2864,  -2.9410,  -0.5733,  -5.9982,   8.1013,
           6.7948,  -6.3240,  -0.7363,   0.2341, -10.4797,  -0.3215,   0.7258,
         -12.1739,   5.3160,   6.6976,  -0.1905,   5.4955,  -3.4306,  -5.4653,
          11.9070,  -2.7301,   9.8852,   4.4781,  -6.9414,   8.9627,   4.1580,
          -3.0651,  -2.2756,  -3.5140,   1.1860,   6.0247,  -2.8935,  12.2305,
           6.0747,  -4.7928,   4.2265,   0.7376,  -2.5694,   3.5744,   2.0789,
          -5.5691,   7.3652,   3.6363,   9.6546,  -9.6473,   3.9384,  -4.7080,
           3.1698,  -6.0063,   2.4592,   0.5230,  -0.6025,  -9.3222,  -1.8954,
           2.2409, -11.9186,   2.7639,  -1.0580,   0.5306,   7.4277,  -0.5028,
           4.7346,   7.2282,   7.7415,   0.5907,  -8

In [97]:
class FFNLayer(nn.Module):
    
    def __init__(self, n_layers, in_shape, out_shape, hidden_size=128):
        super(FFNLayer, self).__init__()
        self.l1 = nn.Linear(in_shape, hidden_size)
        self.ff_list = nn.ModuleList([])
        for x in range(1, n_layers - 1):
            self.ff_list.append(nn.Linear(hidden_size, hidden_size))
        self.out = nn.Linear(hidden_size, out_shape)
        
    def forward(self, X):
        
        x = self.l1(X)
        for layer in self.ff_list:
            x = functional.relu_(layer(x))
        return self.out(x)
        

In [98]:
ff_layer = FFNLayer(5, 32, 64)
ff_layer(torch.randn(1, 32))


tensor([[ 0.0704, -0.0370, -0.0603, -0.0845,  0.0380, -0.0399, -0.0572,  0.1146,
         -0.0387, -0.0401, -0.0482,  0.0070, -0.0465,  0.0010,  0.0293, -0.1117,
          0.0036,  0.0379, -0.0833,  0.0997,  0.0606,  0.0984,  0.0821,  0.0417,
         -0.0312, -0.0207,  0.0182,  0.0750, -0.0110, -0.0030,  0.0473,  0.0360,
         -0.0024,  0.0658, -0.0347, -0.1117, -0.0241,  0.0497,  0.0816,  0.0238,
         -0.0602,  0.0186, -0.0024, -0.0583, -0.0557, -0.0231,  0.0231,  0.0183,
          0.0404,  0.0948, -0.1180,  0.0753, -0.0825,  0.0212, -0.0092, -0.0043,
         -0.1148,  0.1018, -0.0180,  0.0516, -0.0192,  0.0816,  0.0371,  0.0541]],
       grad_fn=<AddmmBackward0>)

# Multi-Head Self Attention Layer

In [80]:
class MultiHeadAttentionLayer(nn.Module):
    
    def __init__(self, n_heads, emb_size, key_size, value_size):
        
        super(MultiHeadAttentionLayer, self).__init__()
        
        assert n_heads > 0, "Heads must be greater than or equal to 1"
        
        self.n_heads = n_heads
        self.emb_size = emb_size
        self.key_size = key_size
        self.value_size = value_size
        self.latent_shape = emb_size/n_heads
        
        self.attention_heads = []
        for i in range(n_heads):
            qw = nn.Parameter(data = torch.Tensor(emb_size, key_size), requires_grad=True)
            qw.data.uniform_(-1, 1)
            
            kw = nn.Parameter(data = torch.Tensor(emb_size, key_size), requires_grad=True)
            kw.data.uniform_(-1, 1)
            
            vw = nn.Parameter(data = torch.Tensor(emb_size, value_size), requires_grad=True)
            vw.data.uniform_(-1, 1)
            
            self.attention_heads.append({"qw": qw, "kw": kw, "vw": vw})
            
        # Output Weights will be of size (heads*value_dimension, embed_shape)
        self.output_weight = nn.Parameter(data = torch.Tensor(np.int16(self.n_heads * value_size), emb_size), requires_grad=True)
        self.output_weight.data.uniform_(-1, 1)
        
    
    def forward(self, X):

        output = []
        for head in self.attention_heads:  
            qw, kw, vw = head["qw"], head["kw"], head["vw"]
            query = X @ qw
            key = X @ kw
            value = X @ vw
            query_key = query @ key.T
            query_key = functional.softmax(query_key / np.sqrt(self.emb_size)) @ value
            output.append(query_key)
        return torch.hstack(output) @ self.output_weight
        
            

In [81]:
attention = MultiHeadAttentionLayer(3, 128, 128, 64)

In [82]:
attention(torch.randn(10, 128))

  query_key = functional.softmax(query_key / np.sqrt(self.emb_size)) @ value


tensor([[-49.1984,  56.5596, -17.9829,  ..., -47.5858,  34.1168, -39.6175],
        [  4.5766, -43.2742,   6.3936,  ...,   3.8078, -63.8746,  96.5679],
        [-52.6093, -29.1833, -42.1872,  ..., -32.5948, -64.6483, -59.6851],
        ...,
        [-19.1623, -75.5371,  -9.5427,  ...,   8.1399, -38.0727, -57.0843],
        [-17.4054, -46.4293,  -1.0944,  ...,  33.5287, -23.9791, -28.7532],
        [-40.0549, -10.8629, -19.8245,  ...,  -6.3601,  33.6071, -38.8377]],
       grad_fn=<MmBackward0>)

# Encoder Layer

In [91]:
class Encoder(nn.Module):
    
    def __init__(self, emb_shape, out_shape, key_shape, value_shape, n_attention_heads = 1, ffn_layers = 3):
        
        super(Encoder, self).__init__()
        self.emb_shape = emb_shape
        self.out_shape = out_shape
        self.fft_layers = ffn_layers
        
        self.attention_layer = MultiHeadAttentionLayer(n_attention_heads, emb_shape, key_shape, value_shape)
        self.fft = FFNLayer(ffn_layers, emb_shape, out_shape)
        self.norm_layer1 = nn.LayerNorm(emb_shape)
        self.norm_layer2 = nn.LayerNorm(out_shape)
    
    def forward(self, X):
        
        """
        X: It will be positional encoded vector
        """
        attention_resp = self.attention_layer(X)
        
        # Summing and normalizing Attention response and Position Vectors
        summed_attention = attention_resp + X
        normalized_resp = self.norm_layer1(summed_attention)
        
        # Getting FFT Response
        fft_resp = self.fft(normalized_resp)
        
        # Sum and Normalize
        fft_resp = normalized_resp + fft_resp
        return self.norm_layer2(fft_resp)

In [92]:
encoder = Encoder(128, 128, 64, 128)

encoder(torch.randn(10, 128))

  query_key = functional.softmax(query_key / np.sqrt(self.emb_size)) @ value


tensor([[ 0.3327,  0.4116, -0.9861,  ..., -2.3578,  0.1315, -0.7881],
        [-1.1328,  0.3602, -0.7221,  ...,  2.2564, -0.3207, -0.1107],
        [-0.2970, -0.2225,  0.1057,  ...,  2.0895,  0.5040, -0.1919],
        ...,
        [-0.3056, -0.2487,  0.0823,  ...,  2.0816,  0.5152, -0.1851],
        [-0.3353, -0.2736,  0.0922,  ...,  2.0629,  0.5189, -0.1954],
        [ 0.5763,  0.5753,  0.6267,  ...,  0.2336, -0.0938, -0.5007]],
       grad_fn=<NativeLayerNormBackward0>)