<a href="https://colab.research.google.com/github/sochachai/Transformer_Analysis/blob/main/Embbedings_PositionalEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load packages

In [6]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


Define the Embeddings and PositionalEncoding classes.

In [None]:
class Embeddings(nn.Module):
    """Embedding lookup whose output is scaled by sqrt(d_model)."""

    def __init__(self, d_model, vocab):
        """
        :param d_model: embedding dimension
        :param vocab: size of the vocabulary (number of rows in the lookup table)
        """
        super().__init__()
        # Remember the embedding width; forward() needs it for the scaling factor
        self.d_model = d_model
        # Lookup table holding one d_model-wide vector per vocabulary index
        self.lut = nn.Embedding(vocab, d_model)

    def forward(self, x):
        """
        :param x: tensor of indices to look up (the encoded text)
        :return: the looked-up embeddings multiplied by sqrt(d_model)
        """
        scale = math.sqrt(self.d_model)
        embedded = self.lut(x)
        return embedded * scale

class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal positional encoding to the input, then apply dropout."""

    def __init__(self, d_model, dropout, max_len=5000):
        '''
        :param d_model: dimension of the encoding (even or odd)
        :param dropout: dropout rate from 0 to 1
        :param max_len: the maximum length of a sentence
        '''
        # Run the initialization of nn.Module
        super(PositionalEncoding, self).__init__()

        # Dropout layer applied to the output of forward()
        self.dropout = nn.Dropout(p=dropout)

        # Allocate the positional encoding matrix, max_len * d_model
        pe = torch.zeros(max_len, d_model)

        # Absolute positions 0 .. max_len - 1 as a column, max_len * 1
        position = torch.arange(0, max_len).unsqueeze(1)

        # One frequency per even column index (step of 2): 10000^(-i / d_model).
        # This has ceil(d_model / 2) entries.
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0)/d_model))

        # Angle of every (position, frequency) pair, max_len * ceil(d_model / 2)
        angles = position * div_term

        # Even-indexed columns hold the sine of the angle
        pe[:, 0::2] = torch.sin(angles)
        # Odd-indexed columns hold the cosine. When d_model is odd there is one
        # odd column fewer than even columns, so the last frequency is dropped
        # here; without the slice the assignment fails with a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Add a leading dimension of size 1 so pe broadcasts over the first
        # dimension of the input: 1 * max_len * d_model
        pe = pe.unsqueeze(0)

        # Register pe as a buffer: it is not a parameter, so it is not updated
        # by the optimizer, but it is saved and loaded with the module state
        self.register_buffer('pe', pe)

    def forward(self, x):
        '''
        :param x: tensor whose second dimension indexes the position in the
                  sentence and whose last dimension has size d_model
        :return: x + the positional encoding, after dropout
        '''
        # Keep only the first x.size(1) positions of pe so that it lines up with
        # the sentence length of x. The buffer does not require gradients, so no
        # autograd wrapper is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


Instantiate an example to demonstrate the use of Embeddings and Positional Encoding.

In [9]:
# Hyperparameters of the demonstration
d_model = 512
vocab = 1000
dropout = 0.1
max_len = 60

# Two sentences of four token indices each; every index must be below vocab.
# A plain integer tensor is enough: the autograd Variable wrapper is deprecated.
original_text = torch.tensor([[132, 8, 521, 308], [491, 398, 999, 223]], dtype=torch.long)
emb = Embeddings(d_model, vocab)
embr = emb(original_text)
print(f"The resulting tensor after Embeddings:\n {embr}")

x = embr
pe = PositionalEncoding(d_model, dropout, max_len)
# The module is in training mode by default, so dropout is active here:
# some entries of the result are zeroed and the others are rescaled.
pe_result = pe(x)
print(f"The resulting tensor after positional encoding:\n {pe_result}")
print(f"The shape of pe_result: {pe_result.shape}")

The resulting tensor after Embeddings:
 tensor([[[ -1.6712, -12.1986,  -7.6789,  ...,  13.3293, -16.7797, -23.3994],
         [ 30.2555, -10.3327,  14.4755,  ...,   4.6873, -26.1814,  17.1574],
         [ 26.8043,  12.9459,  14.8024,  ...,  30.4907, -27.0993,  20.5009],
         [-14.7976,  20.7444, -53.3502,  ...,  19.1950,  40.0398,  17.6165]],

        [[-36.4085,   8.8077,  17.4784,  ...,  13.9540,  33.7637, -23.9044],
         [ 32.3228,  46.4586, -23.2815,  ..., -12.5101,  -7.3369, -11.7269],
         [ 25.0436,   3.4608, -44.5584,  ...,   6.7308,  -7.6718,   3.3365],
         [ 12.8997,   4.5520,  13.7686,  ...,  -0.1834, -41.6246,   7.7329]]],
       grad_fn=<MulBackward0>)
The resulting tensor after positional encoding:
 tensor([[[ -1.8568, -12.4429,  -8.5321,  ...,  15.9214, -18.6441,  -0.0000],
         [ 34.5522,  -0.0000,  16.9971,  ...,   6.3193, -29.0903,  20.1748],
         [  0.0000,  13.9220,  17.4876,  ...,  34.9897, -30.1101,  23.8899],
         [-16.2850,   0.0000,