Embedding + PE -> Encoder \
PE (pos, 2i) = sin(pos/ 10000^(2i/d_model)) \
PE (pos, 2i+1) = cos(pos/ 10000^(2i/d_model)) \
Here pos is the position of the word in the sequence, i indexes the sine/cosine dimension pairs (so 2i and 2i+1 are the even and odd embedding dimensions), and d_model is the dimension of the model, here 512. \
Why sine and cosine functions? Because of their periodicity, a word can attend to words at a farther distance much better, and they constrain the values to lie between -1 and 1. \
During attention, without sine and cosine, the word at position i would struggle to attend to words farther away, because their raw position values keep growing relative to the current word i; sine and cosine keep the encoded values within the range -1 to 1, eliminating this limitation. \
It is also easy to extrapolate to longer sequences, i.e. sequence lengths that do not appear in the training set.

In [None]:
# Let's walk through the code
import torch
import torch.nn as nn

max_sequence_length = 10 # number of positions (rows) in the toy positional-encoding table
d_model = 6 # dimension of the embeddings (columns of the table)

In [None]:
# Even dimension indices 0, 2, 4, ... below d_model, stored as floats
even_i = torch.arange(0, d_model, 2, dtype=torch.float)
even_i

tensor([0., 2., 4.])

In [None]:
# Scaling term 10000^(i/d_model) evaluated at every even index i
even_denominator = torch.pow(10000, torch.div(even_i, d_model))
even_denominator

tensor([  1.0000,  21.5443, 464.1590])

In [None]:
# Odd dimension indices 1, 3, 5, ... below d_model, stored as floats
odd_i = torch.arange(1, d_model, 2, dtype=torch.float)
odd_i

tensor([1., 3., 5.])

In [None]:
# Same scaling term 10000^(i/d_model), this time evaluated at the odd indices
odd_denominator = torch.pow(10000, torch.div(odd_i, d_model))
odd_denominator

tensor([   4.6416,  100.0000, 2154.4343])

In [None]:
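# For an odd dimension index j = 2i+1, the formula's exponent is 2i/d_model = (j-1)/d_model, and j-1 is the preceding even index. So each sin/cos pair shares the even-index denominator, and the odd_denominator computed above is not needed. See how the positional encoding is formulated at the top.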

In [None]:
# Use the even-index denominator as the single denominator for the sin and cos terms computed below.
denominator = even_denominator

In [None]:
# Column vector holding the position 0 .. max_sequence_length-1 of each word in the sequence
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)
position.shape

torch.Size([10, 1])

In [None]:
# Sine term: fills the even embedding dimensions
even_PE = (position / denominator).sin()
even_PE

tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0464,  0.0022],
        [ 0.9093,  0.0927,  0.0043],
        [ 0.1411,  0.1388,  0.0065],
        [-0.7568,  0.1846,  0.0086],
        [-0.9589,  0.2300,  0.0108],
        [-0.2794,  0.2749,  0.0129],
        [ 0.6570,  0.3192,  0.0151],
        [ 0.9894,  0.3629,  0.0172],
        [ 0.4121,  0.4057,  0.0194]])

In [None]:
# Cosine term: fills the odd embedding dimensions
odd_PE = (position / denominator).cos()
odd_PE

tensor([[ 1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.9989,  1.0000],
        [-0.4161,  0.9957,  1.0000],
        [-0.9900,  0.9903,  1.0000],
        [-0.6536,  0.9828,  1.0000],
        [ 0.2837,  0.9732,  0.9999],
        [ 0.9602,  0.9615,  0.9999],
        [ 0.7539,  0.9477,  0.9999],
        [-0.1455,  0.9318,  0.9999],
        [-0.9111,  0.9140,  0.9998]])

Both even_PE and odd_PE are 10x3 because the even and odd dimensions of the 10x6 positional encoding are computed separately.

In [None]:
# Pair every sine value with its cosine partner along a new trailing axis;
# this is the first step towards the interleaved 10x6 positional encoding
stacked = torch.stack((even_PE, odd_PE), dim=2)
stacked.shape

torch.Size([10, 3, 2])

In [None]:
# Merge the last two axes so that sine and cosine values alternate along each row
PE = stacked.flatten(start_dim=1, end_dim=2)
PE.shape

torch.Size([10, 6])

In [None]:
# Display the interleaved positional-encoding table
PE

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

In [None]:
# class Implementation


import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
  """Sinusoidal positional-encoding table.

  For position ``pos`` and pair index ``i``:

      PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
      PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

  Args:
    d_model: number of embedding dimensions (columns of the table).
    max_seq_length: number of positions (rows of the table).
  """

  def __init__(self, d_model, max_seq_length):
    super().__init__()
    self.max_seq_length = max_seq_length
    self.d_model = d_model

  def forward(self):
    """Build the table.

    Returns:
      A float tensor of shape ``(max_seq_length, d_model)`` whose even
      columns hold the sine terms and odd columns the cosine terms.
    """
    # One denominator per sin/cos pair: both members of a pair use the
    # exponent of the even index, so no separate odd-index term is needed.
    even_i = torch.arange(0, self.d_model, 2).float()
    denominator = torch.pow(10000, even_i / self.d_model)

    # Explicit float dtype so the division below does not rely on
    # implicit integer -> float promotion.
    position = torch.arange(self.max_seq_length, dtype=torch.float).reshape(self.max_seq_length, 1)

    even_PE = torch.sin(position / denominator)
    odd_PE = torch.cos(position / denominator)

    # (seq, pairs, 2) -> (seq, 2 * pairs): interleaves sin and cos columns.
    stacked = torch.stack([even_PE, odd_PE], dim=2)
    PE = torch.flatten(stacked, start_dim=1, end_dim=2)

    # With an odd d_model the last pair contributes one cosine column too
    # many; trim so the result always has exactly d_model columns.
    return PE[:, :self.d_model]

In [None]:
pe = PositionalEncoding(d_model=6, max_seq_length=10)
pe() # calling an nn.Module instance automatically invokes its forward()

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])