In [1]:
import torch
import numpy as np
import torch.nn as nn

torch.manual_seed(321)

<torch._C.Generator at 0x2787a7c95d0>

In [2]:
class InputEmbedding(nn.Module):

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
    
    def forward(self, x):
        return self.embedding(x) * np.sqrt(self.d_model)

class PositionalEncoding(nn.Module):

    def __init__(self, seq_len, d_model):
        super().__init__()

        ## Creating a temp (seq_len x embed_dim) matrix
        pe = torch.zeros((seq_len, d_model))
        self.seq_len = seq_len
        self.d_model = d_model
        ## Create 1D tensor by doing unsqueeze till range of sequence
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0) # (1, Seq_len, d_model)

        self.register_buffer("pe", pe)

    def forward(self, x):
        return x + (self.pe[:, :x.size(1), :]).requires_grad_(False) # (batch, seq_len, d_model)


In [3]:
st = "my name is jarvis i am tony stark bot, my daily task is to assist iron man"
vocab = sorted(set(st.split()))

print(f"This is String: {st}\nThis is vocabulary: {vocab}")

vocab_id = {j:i for i, j in enumerate(vocab)}
print(f"\nThis is token to id mapping {vocab_id}")

tokens_ids = torch.tensor([vocab_id[i] for i in st.split()])
print(f"Sentence to Token ID {tokens_ids}")

This is String: my name is jarvis i am tony stark bot, my daily task is to assist iron man
This is vocabulary: ['am', 'assist', 'bot,', 'daily', 'i', 'iron', 'is', 'jarvis', 'man', 'my', 'name', 'stark', 'task', 'to', 'tony']

This is token to id mapping {'am': 0, 'assist': 1, 'bot,': 2, 'daily': 3, 'i': 4, 'iron': 5, 'is': 6, 'jarvis': 7, 'man': 8, 'my': 9, 'name': 10, 'stark': 11, 'task': 12, 'to': 13, 'tony': 14}
Sentence to Token ID tensor([ 9, 10,  6,  7,  4,  0, 14, 11,  2,  9,  3, 12,  6, 13,  1,  5,  8])


In [4]:
vocab_size = len(vocab)
d_model = 100
seq_len = 50

embed_obj = InputEmbedding(d_model, len(vocab))
embedding = embed_obj(tokens_ids)

embedding = embedding.unsqueeze(0)

pe_obj = PositionalEncoding(seq_len, d_model)
encoded_position = pe_obj(embedding)

#### Explanation of Self attention

In [5]:
print(encoded_position.shape)

encoded_position = encoded_position.squeeze(0)
print(encoded_position.shape)
print(encoded_position)

torch.Size([1, 17, 100])
torch.Size([17, 100])
tensor([[-11.3080,  -0.4911,  14.4497,  ...,   5.8652,  -6.9199,  21.8012],
        [ -5.2923,  -7.2075,  -1.4856,  ...,  -1.8305,  -6.6310,   1.0759],
        [ 14.5444, -25.4209,  -6.8304,  ...,  10.9295,   9.0678,  12.9104],
        ...,
        [  1.0780,   7.1567,  -0.9054,  ...,  -3.5234,   8.0529,   4.1451],
        [  4.7299,  -6.9776,  -9.9911,  ...,  10.8097,   4.8701,  19.3318],
        [ -8.4247,  15.4856, -16.2390,  ..., -11.5524,  -4.5310,   2.2455]],
       grad_fn=<SqueezeBackward1>)


In [6]:
print(f"Word tony, and it Embedding is\n {encoded_position[6]}")

Word tony, and it Embedding is
 tensor([ -1.7055,  -2.0192,   0.6594, -14.1280,   2.3142,  -2.4818,   2.6023,
         -7.8622,   0.8334,  -0.2319, -17.2297, -24.1096,   4.2452,   1.8019,
         -8.3507,   8.8680, -12.4589,   5.3780,  -5.6493,  -7.6722, -10.2631,
          6.1659,  -9.4200,   7.5622,  21.1879,  -8.5192,  -4.4970,  -0.4677,
         -2.5026,   0.2160,  -8.7706,  17.6558,   8.7257,  14.5058, -17.2108,
         11.1512,  11.6407,  -7.3563,  -2.3259,  -6.3369, -28.3395,  -3.1543,
         18.6783, -12.3554,  -3.7060,   2.9567,  -1.5735,   6.0819,  -4.3824,
          2.3967,  -6.7383,  -2.0250, -11.5031,  12.1552,  -9.3809,   5.1657,
          3.2447,  -3.6813,  -2.2434,  -2.5731, -10.1957,   8.1627,  16.3385,
          2.8388,   6.8943,   3.2725,  -1.5134,  -5.2289,   2.1171,  20.7504,
         -5.7956,   5.2520,  11.2568,  16.6707,   5.1538,   2.5021,  -9.6319,
         16.2356, -16.6142, -16.8092,   0.1687,   1.6515,   4.9326, -28.6015,
         -2.5305,  -3.2146,   6.

In [7]:
dim = encoded_position.shape[1]

d_q, d_k, d_v = 12, 12, 20

W_query = nn.Parameter(torch.randn(d_q, dim))
W_key = nn.Parameter(torch.randn(d_k, dim))
W_value = nn.Parameter(torch.randn(d_v, dim))


In [14]:
## For tony its caculated
print((W_key @ encoded_position.T).T[1])
print((W_key @ encoded_position.T).T[1].shape)

tensor([  15.6946,   21.9732,   16.1444,  104.4243,  238.8973, -138.9019,
         -47.1492,  -97.1442,  -48.5859, -181.1216,   85.7165,   53.9765],
       grad_fn=<SelectBackward0>)
torch.Size([12])


In [29]:
query_tony = W_query @ encoded_position[6]
print(query_tony.shape)

keys = (W_key @ encoded_position.T).T
print(keys.shape)

values = (W_value @ encoded_position.T).T
print(values.shape)


omega_tony = query_tony @ keys.T
print(omega_tony)
print(f"Most similar word for tony is {torch.argmax(omega_tony) + 1}nd token with score of {max(omega_tony)}")

torch.Size([12])
torch.Size([17, 12])
torch.Size([17, 20])
tensor([ 21642.4121,  38417.5898,  11507.6641,   4144.8604, -23516.4512,
         21029.3652,   7124.0415, -26574.6055, -11337.7773,  21508.5801,
        -75335.9922,    805.1486,  12766.4014,  13720.6562,  -9677.3672,
        -47455.4375,  29212.1992], grad_fn=<SqueezeBackward4>)
Most similar word for tony is 2nd token with score of 38417.58984375


In [36]:
import torch.nn.functional as F

scaled_dot  = F.softmax(omega_tony / np.sqrt(d_k)) 

# multiplying with values
context_vector_tony = scaled_dot @ values
print(context_vector_tony)

tensor([  84.8025,   39.4382,  -70.7152,  119.1056,  -70.6185, -202.8909,
         -66.1431,  -12.9007,  106.1452,  161.7344,  -55.3745,  116.2459,
        -211.0940,   55.9901,  -42.4252, -112.2250,   47.3509,  -94.3177,
         -35.5337,  -47.9927], grad_fn=<SqueezeBackward4>)


  scaled_dot  = F.softmax(omega_tony / np.sqrt(d_k))


In [81]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads ==0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        self.head_dim = d_model // num_heads ## d_k
        # d_k = d_v = d_model // num_heads

        ## Linear Projection for Q, K, V
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)

        # Output Layer
        self.out = nn.Linear(d_model, d_model)

        # scaling
        self.scale = np.sqrt(self.head_dim)
    
    def forward(self, q, k, v, mask):
        
        batch = q.size(0)

        query = self.linear_q(q)
        key = self.linear_k(k)
        value = self.linear_v(v)

        # Split Q, K, V into multiple heads
        query = query.view(batch, -1, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch, -1, self.num_heads, self.head_dim).transpose(1, 2)

        scores = (query @ key.transpose(-1, -2)) / self.scale
        # Scaled dot product attention
        if mask is not None:
            scores.masked_fill(mask == 0, float('-inf'))
        attention_weights = self.softmax(scores)
        attention_output = attention_weights @ value
        
        # Concatenate heads
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch, -1, self.num_heads * self.head_dim)
 
        # Final linear transformation
        output = self.out(attention_output)
 
        return output, attention_weights

In [55]:
encoded_position.unsqueeze(0).shape

torch.Size([1, 17, 100])

In [82]:
mha = MultiHeadAttention(100, 2)
# d_model % 8

In [83]:
mha(encoded_position, encoded_position, encoded_position, mask=None)

(tensor([[[ 6.5549,  0.4533,  5.9515,  ...,  1.3006,  1.3325,  0.3603]],
 
         [[-0.1755, -0.4881, -0.5081,  ...,  3.2096, -0.6231, -2.4404]],
 
         [[-3.5673, -3.7174, -0.2607,  ..., -1.1323, -0.7412,  0.6034]],
 
         ...,
 
         [[ 1.0202,  1.7797,  1.2958,  ..., -2.8006,  2.4137, -0.3167]],
 
         [[-1.2335, -3.2440,  2.6548,  ..., -3.4210, -9.5924,  4.3435]],
 
         [[ 1.0761, -6.5702, -1.9288,  ...,  1.4405,  2.5107, -3.0740]]],
        grad_fn=<ViewBackward0>),
 tensor([[[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
          [[1.]]],
 
 
         [[[1.]],
 
        