To always pass a fixed-length input, sentences are padded with dummy tokens until they reach the maximum number of words allowed in the transformer. Each word in the sentence is then one-hot encoded to get its vector representation. Each vector is then mapped to a 512-dimensional vector by passing it through a feed-forward network. To this we then add a positional vector of the same size. Finally, the output is sent to the attention block.

In [1]:
import torch

In [2]:
# Dimension of the word embedding
d_model = 6

# Maximum number of words the transformer accepts in one sentence
max_sequence_length = 10

In [3]:
# Even embedding indices (0, 2, 4, ...) as floats
even_i = torch.arange(0, d_model, 2, dtype=torch.float)
print(even_i)

tensor([0., 2., 4.])


In [4]:
# Denominator 10000^(i / d_model) for every even embedding index i
even_denominator = 10000 ** (even_i / d_model)
print(even_denominator)

tensor([  1.0000,  21.5443, 464.1590])


In [5]:
# Odd embedding indices (1, 3, 5, ...) as floats
odd_i = torch.arange(1, d_model, 2, dtype=torch.float)
print(odd_i)

tensor([1., 3., 5.])


In [6]:
# Denominator for the odd embedding indices. Subtracting 1 gives each odd
# index the same exponent as the even index just before it, so each
# (even, odd) index pair shares one denominator.
odd_denominator = torch.pow(10000, (odd_i - 1) / d_model)
# Print the tensor computed in this cell (the original printed
# even_denominator here by mistake).
print(odd_denominator)

tensor([  1.0000,  21.5443, 464.1590])


In [7]:
# Numerator: positions 0 .. max_sequence_length - 1 laid out as a column
# vector, so dividing by a 1-D denominator yields one row per position
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)
print(position)

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])


Sine and cosine functions are used in the positional encoding because they are periodic (they repeat after a fixed interval) and they constrain values to the range between -1 and 1.

In [8]:
# Sine terms use the even-index denominator, cosine terms the odd-index one
even_positional_encoding = (position / even_denominator).sin()
odd_positional_encoding = (position / odd_denominator).cos()

In [9]:
# Show the sine terms: one row per position, one column per even index
print(even_positional_encoding)

tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0464,  0.0022],
        [ 0.9093,  0.0927,  0.0043],
        [ 0.1411,  0.1388,  0.0065],
        [-0.7568,  0.1846,  0.0086],
        [-0.9589,  0.2300,  0.0108],
        [-0.2794,  0.2749,  0.0129],
        [ 0.6570,  0.3192,  0.0151],
        [ 0.9894,  0.3629,  0.0172],
        [ 0.4121,  0.4057,  0.0194]])


In [10]:
# Shape check for the sine terms: (number of positions, number of even indices)
print(even_positional_encoding.shape)

torch.Size([10, 3])


In [11]:
# Show the cosine terms: one row per position, one column per odd index
print(odd_positional_encoding)

tensor([[ 1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.9989,  1.0000],
        [-0.4161,  0.9957,  1.0000],
        [-0.9900,  0.9903,  1.0000],
        [-0.6536,  0.9828,  1.0000],
        [ 0.2837,  0.9732,  0.9999],
        [ 0.9602,  0.9615,  0.9999],
        [ 0.7539,  0.9477,  0.9999],
        [-0.1455,  0.9318,  0.9999],
        [-0.9111,  0.9140,  0.9998]])


In [12]:
# Shape check for the cosine terms: (number of positions, number of odd indices)
print(odd_positional_encoding.shape)

torch.Size([10, 3])


In [13]:
# Pair every sine value with the matching cosine value along a new last axis
stacked = torch.stack((even_positional_encoding, odd_positional_encoding), dim=-1)
stacked.shape

torch.Size([10, 3, 2])

In [14]:
# Merge the last two axes so each row interleaves sin, cos, sin, cos, ...
positional_encoding = stacked.flatten(start_dim=1)
print(positional_encoding)

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])
