<a href="https://colab.research.google.com/github/sochachai/Transformer_Analysis/blob/main/Embeddings_PositionalEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


In [4]:
class Embeddings(nn.Module):
    '''
    Token-embedding layer whose output is scaled by sqrt(d_model).
    '''

    def __init__(self, d_model, vocab):
        '''
        :param d_model: embedding dimension
        :param vocab: size of vocabulary
        '''
        super().__init__()
        # Keep the embedding width around; forward() uses it as the scale factor
        self.d_model = d_model
        # Lookup table mapping each of the `vocab` token ids to a d_model vector
        self.lut = nn.Embedding(vocab, d_model)

    def forward(self, x):
        '''
        :param x: tensor of token ids representing the original text
        :return: the looked-up embeddings multiplied by sqrt(d_model)
        '''
        embedded = self.lut(x)
        scale = math.sqrt(self.d_model)
        return embedded * scale

class PositionalEncoding(nn.Module):
    '''
    Add fixed sinusoidal positional encodings to the input, then apply dropout.

    The encoding table is computed once in __init__ and stored as a buffer
    named 'pe' of shape (1, max_len, d_model).
    '''

    def __init__(self, d_model, dropout, max_len = 5000):
        '''
        :param d_model: dimension of the encoding (even or odd)
        :param dropout: dropout rate from 0 to 1
        :param max_len: the maximum length of a sentence
        '''
        # Run the initialization of nn.Module
        super(PositionalEncoding, self).__init__()

        # Dropout layer applied to the output of forward()
        self.dropout = nn.Dropout(p=dropout)

        # Positional encoding matrix, max_len * d_model, filled in below
        pe = torch.zeros(max_len, d_model)

        # Absolute position column vector, max_len * 1
        position = torch.arange(0, max_len).unsqueeze(1)

        # One frequency per even column index: exp(-i * ln(10000) / d_model)
        # for i = 0, 2, 4, ...; this has ceil(d_model / 2) entries
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0)/d_model))

        # Broadcast position (max_len * 1) against the frequencies
        angles = position * div_term

        # Even-indexed columns get the sine of every angle column
        pe[:, 0::2] = torch.sin(angles)
        # Odd-indexed columns get the cosine. There are only d_model // 2 odd
        # columns, so for odd d_model the last angle column is dropped here;
        # for even d_model the slice keeps every column.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Add a leading dimension of size 1 so pe broadcasts over dim 0 of x
        pe = pe.unsqueeze(0)

        # Register pe as a buffer: it is not a parameter, so the optimizer
        # does not update it, but it is saved and loaded with the state dict
        self.register_buffer('pe', pe)

    def forward(self, x):
        '''
        :param x: Tensor of embedded text; dim 1 is used as the sentence
                  length and the last dim must match d_model
                  (the sentence length should not exceed max_len)
        :return: dropout applied to x + the positional encoding
        '''
        # Take only the first x.size(1) positions of the table so that it
        # lines up with the sentence length of x. The buffer does not require
        # grad, so no Variable wrapper is needed to keep it out of autograd.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


In [5]:
# Demo hyperparameters
d_model, vocab = 512, 1000
dropout, max_len = 0.1, 60

# Two sentences of four token ids each
original_text = Variable(
    torch.LongTensor([
        [132, 8, 521, 308],
        [491, 398, 999, 223],
    ])
)

# Embed the token ids
emb = Embeddings(d_model, vocab)
embr = emb(original_text)
x = embr

# Add positional encodings (followed by dropout) to the embedded text
pe = PositionalEncoding(d_model, dropout, max_len)
pe_result = pe(x)

# Show the resulting tensor and then its shape, one per line
print(pe_result, pe_result.shape, sep="\n")

tensor([[[-11.6798,  -6.4811,   8.4353,  ..., -52.9447,  -0.0000,   6.1799],
         [-32.0869,  -4.5898,   0.0000,  ..., -59.1861,  11.5512,   2.9967],
         [-32.3059,  22.2263,  -0.2133,  ...,  -0.0000,  21.0923, -51.6224],
         [ 12.4760,  -0.0000, -13.0740,  ..., -19.4793, -18.7379,  -6.0210]],

        [[ 55.1934, -10.0560,  -3.2923,  ...,  -9.3783,  -5.6828, -64.7009],
         [ -3.0917,  30.0435, -51.4251,  ...,  34.1320,  84.6426,  -3.4831],
         [  1.8231,  24.3843,   0.0000,  ...,   7.6301,   3.5043, -18.7053],
         [-12.6825,  -0.0000,  34.8947,  ...,  -6.0370,  35.6415,  59.9986]]],
       grad_fn=<MulBackward0>)
torch.Size([2, 4, 512])
