In [1]:
import torch 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from torch import nn
from torch.nn import functional as F

In [3]:
# self attention
class self_attention(nn.Module):
    '''
    Module to apply self attention to an input sequence of vectors
    
    parameters:
    
    emb_dim = dimension of the embedding vector
    h = number of self attention heads
    
    '''
    def __init__(self, emb_dim, h):
        super().__init__()
        self.emb_dim = emb_dim
        self.h = h
        self.red_vec_size = emb_dim//h
        
        # Querry vector
        self.WQ = nn.Linear(emb_dim, self.red_vec_size, bias = False)
        self.WK = nn.Linear(emb_dim, self.red_vec_size, bias = False)
        self.WV = nn.Linear(emb_dim, self.red_vec_size, bias = False)
        
    def forward(self, x):
        # x has shape (batch_size, seq_len, emb_dim)
        batch_size = x.shape[0]
        seq_len = x.shape[1]
        querries = self.WQ(x)
        keys = self.WK(x)
        values = self.WV(x)
        att_scores = F.softmax((querries@keys.permute(0,2,1)).permute(0,2,1)\
                               /np.sqrt(self.red_vec_size), dim = 2)
        ctx_vecs = att_scores @ values 
        assert ctx_vecs.shape == (batch_size, seq_len, self.red_vec_size ) 
        return querries, keys, values, ctx_vecs

In [4]:
batch_size = 5
seq_len = 3
emb_dim = 4
h = 1
x = torch.randn((batch_size, seq_len, emb_dim))
attn = self_attention(emb_dim, h)

In [5]:
attn

self_attention(
  (WQ): Linear(in_features=4, out_features=4, bias=False)
  (WK): Linear(in_features=4, out_features=4, bias=False)
  (WV): Linear(in_features=4, out_features=4, bias=False)
)

In [6]:
querries, keys, values, ctx_vecs = attn(x)

In [7]:
querries.shape, keys.shape, values.shape, ctx_vecs.shape

(torch.Size([5, 3, 4]),
 torch.Size([5, 3, 4]),
 torch.Size([5, 3, 4]),
 torch.Size([5, 3, 4]))

In [8]:
ctx_vecs

tensor([[[ 0.0695,  0.1027, -0.1141, -0.0260],
         [ 0.1070,  0.0183, -0.1150,  0.0011],
         [ 0.1134, -0.0138, -0.1107,  0.0092]],

        [[-0.1350,  0.8512,  0.0235, -0.4540],
         [-0.1487,  0.8075,  0.0171, -0.4436],
         [-0.1452,  0.8116,  0.0182, -0.4431]],

        [[ 0.1837, -0.4192, -0.0492,  0.3628],
         [ 0.2332, -0.3694, -0.0935,  0.3753],
         [ 0.2078, -0.3799, -0.0827,  0.3814]],

        [[ 0.2248, -0.1568,  0.0170,  0.0817],
         [ 0.2360, -0.1490,  0.0138,  0.0803],
         [ 0.2438, -0.1497,  0.0096,  0.0782]],

        [[-0.2843, -0.2287,  0.2798,  0.0842],
         [-0.2868, -0.2681,  0.2785,  0.1049],
         [-0.2441, -0.1056,  0.2685,  0.0219]]], grad_fn=<UnsafeViewBackward>)

In [9]:
attn(x)[3]

tensor([[[ 0.0695,  0.1027, -0.1141, -0.0260],
         [ 0.1070,  0.0183, -0.1150,  0.0011],
         [ 0.1134, -0.0138, -0.1107,  0.0092]],

        [[-0.1350,  0.8512,  0.0235, -0.4540],
         [-0.1487,  0.8075,  0.0171, -0.4436],
         [-0.1452,  0.8116,  0.0182, -0.4431]],

        [[ 0.1837, -0.4192, -0.0492,  0.3628],
         [ 0.2332, -0.3694, -0.0935,  0.3753],
         [ 0.2078, -0.3799, -0.0827,  0.3814]],

        [[ 0.2248, -0.1568,  0.0170,  0.0817],
         [ 0.2360, -0.1490,  0.0138,  0.0803],
         [ 0.2438, -0.1497,  0.0096,  0.0782]],

        [[-0.2843, -0.2287,  0.2798,  0.0842],
         [-0.2868, -0.2681,  0.2785,  0.1049],
         [-0.2441, -0.1056,  0.2685,  0.0219]]], grad_fn=<UnsafeViewBackward>)

In [10]:
class multi_head_attn(nn.Module):
    '''
    Module to create multiple attention heads
    
    parameters:
    
    emb_dim = dimension of the embedding vectors
    h = number of attention heads
    parallelize = parallelize the computations for differnt heads 
    
    '''
    def __init__(self, emb_dim, h, parallelize = 'False'):
        super().__init__()
        self.emb_dim = emb_dim
        self.h = h
        self.red_vec_size = emb_dim // h 
        
        self.heads = [self_attention(emb_dim, h) for i in range(h)]
        
        # transform the contatenated context vectors to have same size as emb_sim
        # this is to be able to enable implement a skip-connection between the input and output
        self.Wo = nn.Linear(self.red_vec_size*h, emb_dim, bias = False) 
        
        # layer norm
        # should we apply 
        self.LNorm = nn.LayerNorm(emb_dim)
        
    def forward(self, x):
        ctx_vecs = torch.cat([head(x)[3] for head in self.heads], dim = 2)
        transformed = self.Wo(ctx_vecs)
        
        return self.LNorm(x + transformed)

In [11]:
batch_size = 5
seq_len = 3
emb_dim = 6
h = 2
x = torch.randn((batch_size, seq_len, emb_dim))
multihead = multi_head_attn(emb_dim, h)

In [12]:
ctx = multihead(x)

In [13]:
ctx.shape

torch.Size([5, 3, 6])

In [14]:
ctx

tensor([[[-0.3970, -0.2556, -0.4025, -1.2867,  0.3908,  1.9510],
         [-1.0594,  0.7616,  1.2213, -1.5387,  0.6606, -0.0454],
         [-1.9424,  0.8693, -0.0473, -0.4243,  0.9915,  0.5532]],

        [[ 0.3326,  0.4952, -2.1718,  0.8860,  0.0920,  0.3660],
         [ 0.9578, -1.8795, -0.5415, -0.0991,  0.8968,  0.6655],
         [-0.9096, -1.4197,  0.7999,  1.3254,  0.7101, -0.5061]],

        [[-0.1232,  0.9751,  0.1593,  0.8051,  0.2561, -2.0724],
         [-0.1782, -0.4761,  0.8225,  0.6757,  1.0362, -1.8801],
         [-0.8773,  0.5034, -0.4839, -0.2464, -0.8748,  1.9791]],

        [[ 0.1125, -1.9037,  0.3175, -0.1553,  1.4896,  0.1396],
         [-1.5095,  0.8640, -1.1197, -0.0211,  0.6430,  1.1433],
         [-1.2516,  0.8050,  0.4536, -1.5385,  0.9071,  0.6243]],

        [[ 0.7306, -1.5146, -1.0940,  0.0747,  1.3162,  0.4870],
         [ 1.7558,  0.5619, -0.2374, -0.4283, -0.1200, -1.5320],
         [-0.0811,  0.6744,  1.7611, -1.0020, -0.1673, -1.1851]]],
       grad_fn=

In [15]:
multihead

multi_head_attn(
  (Wo): Linear(in_features=6, out_features=6, bias=False)
  (LNorm): LayerNorm((6,), eps=1e-05, elementwise_affine=True)
)

In [16]:
list(multihead.LNorm.parameters())

[Parameter containing:
 tensor([1., 1., 1., 1., 1., 1.], requires_grad=True), Parameter containing:
 tensor([0., 0., 0., 0., 0., 0.], requires_grad=True)]

In [17]:
ctx.mean(dim = 2)

tensor([[ 0.0000e+00,  9.3132e-09, -9.9341e-09],
        [ 0.0000e+00, -9.9341e-09,  1.9868e-08],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.4835e-09, -1.9868e-08, -2.9802e-08],
        [ 1.4901e-08, -5.9605e-08,  1.5895e-07]], grad_fn=<MeanBackward1>)

In [18]:
ctx.std(dim = 2)

tensor([[1.0954, 1.0954, 1.0954],
        [1.0954, 1.0954, 1.0954],
        [1.0954, 1.0954, 1.0954],
        [1.0954, 1.0954, 1.0954],
        [1.0954, 1.0954, 1.0954]], grad_fn=<StdBackward1>)

In [19]:
class encoder(nn.Module):
    '''
    The complete encoder module.
    
    parameters:
    
    emb_dim = dimension of the embedding vectors
    h = number of attention heads
    parallelize = parallelize the computations for differnt heads 
    ffn_l1_out_fts = number of out_features of 1st layer in feed forward NN. Default is 2048 a suggested in the original paper
    
    
    '''
    
    def __init__(self, emb_dim, h, parallelize = False, ffn_l1_out_fts = 2048 ):
        super().__init__()
        self.emb_dim = emb_dim
        self.h = h
        self.red_vec_size = emb_dim//h
        
        # multi_head_attention sub-layer
        self.mul_h_attn = multi_head_attn(emb_dim, h, parallelize)
        
        # feedforward sublayers
        self.l1 = nn.Linear(emb_dim, ffn_l1_out_fts)
        self.l2 = nn.Linear(ffn_l1_out_fts, emb_dim)
        
        # layer norm
        self.LNorm = nn.LayerNorm(emb_dim) 
        
    def forward(self, x):
        ctx_vecs = self.mul_h_attn(x)
        out = torch.relu(self.l1(ctx_vecs))
        out = self.l2(out)
        
        return self.LNorm(out + x)
            

In [20]:
batch_size = 5
seq_len = 3
emb_dim = 6
h = 2
x = torch.randn((batch_size, seq_len, emb_dim))
enc = encoder(emb_dim, h)

In [21]:
enc

encoder(
  (mul_h_attn): multi_head_attn(
    (Wo): Linear(in_features=6, out_features=6, bias=False)
    (LNorm): LayerNorm((6,), eps=1e-05, elementwise_affine=True)
  )
  (l1): Linear(in_features=6, out_features=2048, bias=True)
  (l2): Linear(in_features=2048, out_features=6, bias=True)
  (LNorm): LayerNorm((6,), eps=1e-05, elementwise_affine=True)
)

In [22]:
enc_out = enc(x)
enc_out

tensor([[[ 0.6807, -0.4590,  1.8553, -0.4325, -0.4071, -1.2374],
         [ 1.1996,  0.5036, -1.2591, -0.8528,  1.1819, -0.7732],
         [ 0.4374, -1.4284, -1.1448,  0.5949,  0.0936,  1.4473]],

        [[-0.6799, -0.3918,  0.5790, -1.6221,  1.3587,  0.7562],
         [ 0.5610, -1.5596,  0.0128, -1.0685,  1.0122,  1.0421],
         [ 0.2855, -0.9815,  1.1326,  0.3345, -1.6628,  0.8918]],

        [[-1.1680, -1.2111, -0.1682,  1.6213,  0.2591,  0.6670],
         [-1.5072, -1.1199,  0.5933,  1.0595, -0.0252,  0.9994],
         [ 1.0912, -0.6212, -1.1809, -0.9823,  0.2850,  1.4082]],

        [[ 1.0300,  0.4345, -2.1187,  0.4893,  0.1470,  0.0179],
         [-0.2882, -0.8012, -0.0468, -1.2432,  1.8592,  0.5202],
         [ 1.1637, -1.4063,  0.8195, -0.6638,  0.9243, -0.8375]],

        [[-1.2617,  0.1805,  1.5193, -0.1089, -1.1649,  0.8356],
         [ 1.7452, -0.0804, -0.6855,  0.6709, -0.2481, -1.4021],
         [-1.6552,  0.5707,  0.2026,  0.8890,  1.0219, -1.0290]]],
       grad_fn=

In [23]:
enc_out.shape

torch.Size([5, 3, 6])

In [75]:
class encoder_decoder_attention(nn.Module):
    '''
    Module to implement the encoder_decoder attention layer. 
    This is same as the self_attention layer except that it takes two input vectors: 
                 1)encoder's final output 
                 2) output from previous decoder layer
    The querries are generated from the previous decoder layer's output
    The keys and the values are generated from the encoder's output 
         
    '''
    def __init__(self, emb_dim, h):
        super().__init__()
        
        self.emb_dim = emb_dim
        self.h = h
        self.red_vec_size = emb_dim//h
        
        # Querry vector
        self.WQ = nn.Linear(emb_dim, self.red_vec_size, bias = False)
        # Key vector
        self.WK = nn.Linear(emb_dim, self.red_vec_size, bias = False)
        # Value vector
        self.WV = nn.Linear(emb_dim, self.red_vec_size, bias = False)
        
    def forward(self, enc_out, dec_out):
        # x has shape (batch_size, seq_len, emb_dim)
        batch_size = enc_out.shape[0]
        seq_len = enc_out.shape[1]
        querries = self.WQ(dec_out)
        keys = self.WK(enc_out)
        values = self.WV(enc_out)
        att_scores = F.softmax((querries@keys.permute(0,2,1))\
                               /np.sqrt(self.red_vec_size), dim = 2)
        ctx_vecs = att_scores @ values 
        assert ctx_vecs.shape == (batch_size, seq_len, self.red_vec_size ) 
        return querries, keys, values, att_scores, ctx_vecs

In [76]:
batch_size = 5
seq_len = 4
emb_dim = 6
h = 2
enc_out = torch.randn((batch_size, seq_len, emb_dim))
dec_out = torch.randn(batch_size, seq_len, emb_dim)
enc_dec_attn = encoder_decoder_attention(emb_dim, h)
enc_dec_attn

encoder_decoder_attention(
  (WQ): Linear(in_features=6, out_features=3, bias=False)
  (WK): Linear(in_features=6, out_features=3, bias=False)
  (WV): Linear(in_features=6, out_features=3, bias=False)
)

In [77]:
q, k, v, s, c = enc_dec_attn(enc_out, dec_out)

In [78]:
q.shape, k.shape, v.shape, s.shape, c.shape

(torch.Size([5, 4, 3]),
 torch.Size([5, 4, 3]),
 torch.Size([5, 4, 3]),
 torch.Size([5, 4, 4]),
 torch.Size([5, 4, 3]))

In [79]:
q1 = q[0,0]
q1

tensor([0.1055, 1.0679, 0.7241], grad_fn=<SelectBackward>)

In [80]:
keys = k[0]
keys.shape

torch.Size([4, 3])

In [81]:
q1@keys.T

tensor([-2.2760, -0.6922, -0.5533, -0.9880], grad_fn=<SqueezeBackward3>)

In [82]:
q @ k.permute(0,2,1)

tensor([[[-2.2760, -0.6922, -0.5533, -0.9880],
         [ 0.0452, -0.1738, -0.6367,  1.0138],
         [-0.2108, -0.0797, -0.5803,  0.6809],
         [ 0.8407,  0.5261,  1.1788, -1.1274]],

        [[-0.0912,  0.8676, -0.7004, -0.8413],
         [-0.0349, -0.4209,  0.2272,  0.2838],
         [-0.2665,  1.3260, -0.7065, -0.8829],
         [-0.1835, -1.0982,  0.4689,  0.6033]],

        [[-1.6119,  0.5404, -0.3440,  0.3651],
         [-0.4196,  0.3210, -0.4347,  0.3786],
         [ 0.1711, -0.1909,  0.4034, -0.3576],
         [-0.4332,  0.1890,  0.4023, -0.3989]],

        [[-0.1977, -0.1933,  0.1216, -0.4201],
         [-0.1057, -0.7515, -0.7439,  0.2262],
         [-0.0205, -0.6539, -1.1184,  0.0809],
         [-0.3543, -0.2760,  1.5729,  0.3774]],

        [[-0.2560, -0.2947, -0.5404, -0.0393],
         [-0.3528, -0.3723,  1.6815, -0.4538],
         [ 0.5769,  0.3728,  0.1997,  0.2831],
         [ 0.1172,  0.3279,  0.6438, -0.0655]]], grad_fn=<UnsafeViewBackward>)

In [83]:
(q @ k.permute(0,2,1)).permute(0,2,1)

tensor([[[-2.2760,  0.0452, -0.2108,  0.8407],
         [-0.6922, -0.1738, -0.0797,  0.5261],
         [-0.5533, -0.6367, -0.5803,  1.1788],
         [-0.9880,  1.0138,  0.6809, -1.1274]],

        [[-0.0912, -0.0349, -0.2665, -0.1835],
         [ 0.8676, -0.4209,  1.3260, -1.0982],
         [-0.7004,  0.2272, -0.7065,  0.4689],
         [-0.8413,  0.2838, -0.8829,  0.6033]],

        [[-1.6119, -0.4196,  0.1711, -0.4332],
         [ 0.5404,  0.3210, -0.1909,  0.1890],
         [-0.3440, -0.4347,  0.4034,  0.4023],
         [ 0.3651,  0.3786, -0.3576, -0.3989]],

        [[-0.1977, -0.1057, -0.0205, -0.3543],
         [-0.1933, -0.7515, -0.6539, -0.2760],
         [ 0.1216, -0.7439, -1.1184,  1.5729],
         [-0.4201,  0.2262,  0.0809,  0.3774]],

        [[-0.2560, -0.3528,  0.5769,  0.1172],
         [-0.2947, -0.3723,  0.3728,  0.3279],
         [-0.5404,  1.6815,  0.1997,  0.6438],
         [-0.0393, -0.4538,  0.2831, -0.0655]]], grad_fn=<PermuteBackward>)

In [92]:
scores1 = F.softmax((q1@keys.T/np.sqrt(3)), dim = 0)
scores1

tensor([0.1204, 0.3005, 0.3256, 0.2534], grad_fn=<SoftmaxBackward>)

In [86]:
s[0]

tensor([[0.1204, 0.3005, 0.3256, 0.2534],
        [0.2323, 0.2047, 0.1567, 0.4063],
        [0.2193, 0.2366, 0.1772, 0.3670],
        [0.2967, 0.2474, 0.3606, 0.0952]], grad_fn=<SelectBackward>)

In [93]:
scores1.sum()

tensor(1., grad_fn=<SumBackward0>)

In [89]:
v[0]

tensor([[-0.4755,  0.7773,  2.0798],
        [ 0.1360,  0.4734,  0.2271],
        [ 0.0423, -0.0171,  1.2723],
        [-0.2565,  0.5834,  1.0827]], grad_fn=<SelectBackward>)

In [94]:
scores1 @ v[0]

tensor([-0.0676,  0.3781,  1.0074], grad_fn=<SqueezeBackward3>)

In [95]:
c[0]

tensor([[-0.0676,  0.3781,  1.0074],
        [-0.1802,  0.5118,  1.1689],
        [-0.1587,  0.4935,  1.1326],
        [-0.1166,  0.3971,  1.2352]], grad_fn=<SelectBackward>)

In [96]:
q2 = q[0,1]
q2

tensor([ 0.5455, -1.0743,  0.2637], grad_fn=<SelectBackward>)

In [98]:
scores2 = F.softmax((q2@keys.T/np.sqrt(3)), dim = 0)
scores2

tensor([0.2323, 0.2047, 0.1567, 0.4063], grad_fn=<SoftmaxBackward>)

In [99]:
scores2@v[0]

tensor([-0.1802,  0.5118,  1.1689], grad_fn=<SqueezeBackward3>)

In [100]:
qq1 = q[1,0]
keys = k[1]
scores = F.softmax((qq1@keys.T/np.sqrt(3)), dim = 0)
scores@v[1]

tensor([-0.3518,  0.0418,  0.4753], grad_fn=<SqueezeBackward3>)

In [101]:
c[1]

tensor([[-0.3518,  0.0418,  0.4753],
        [-0.4657,  0.1529,  0.5513],
        [-0.3086, -0.0077,  0.4688],
        [-0.4950,  0.1789,  0.5791]], grad_fn=<SelectBackward>)