## First, the Transformer needs an embedding layer
### The embedding layer converts the input token IDs into vector representations of the expected dimension

In [1]:
import torch
import torch.nn as nn
# Size of the lookup table: one learnable 512-dimensional vector per vocabulary entry.
vocab_size = 10000
embedding_dim = 512
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# One batch of five token IDs; the lookup returns one embedding vector per ID.
input_sequence = torch.tensor([[2, 5, 7, 3, 1]])
embedded_sequence = embedding(input_sequence)

print("Input Sequence:", input_sequence)
print("Embedded Sequence:", embedded_sequence)

Input Sequence: tensor([[2, 5, 7, 3, 1]])
Embedded Sequence: tensor([[[ 1.0937, -0.2105,  0.7825,  ...,  0.6322,  0.9590,  1.1196],
         [-0.0424, -1.2583,  1.5339,  ..., -2.4728,  1.6406,  1.2766],
         [-0.2880,  0.2182, -2.3618,  ...,  0.2940,  0.3400, -0.9175],
         [-0.2165, -0.8007,  2.1178,  ..., -0.0548,  0.8152,  0.2842],
         [ 0.2439, -2.5718,  0.7419,  ..., -1.9673,  0.5983, -0.0951]]],
       grad_fn=<EmbeddingBackward0>)


# Concept of Positional Encoding
## In the Transformer architecture, there is no inherent sequential structure in the model, unlike RNNs or LSTMs. To address this, positional encoding is used to inject information about the position of tokens in the sequence into the model. This allows the model to understand the order of tokens.

## Formula for Positional Encoding
## The positional encoding for each position $p$ and each dimension index $i$ is computed using the following formulas:

## Even dimensions (sine function):

$$PE(p,\, 2i) = \sin\left(\frac{p}{10000^{2i/d_{\text{model}}}}\right)$$

## Odd dimensions (cosine function):

$$PE(p,\, 2i+1) = \cos\left(\frac{p}{10000^{2i/d_{\text{model}}}}\right)$$

## Where:

- $p$ is the position of the token in the sequence.
- $i$ is the dimension index: each value of $i$ selects the pair of embedding dimensions $2i$ (sine) and $2i+1$ (cosine).
- $d_{\text{model}}$ is the dimensionality of the embeddings.

In [3]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional-encoding table.

    Column ``2i`` holds ``sin(p / 10000^(2i / d_model))`` and column ``2i + 1``
    holds ``cos(p / 10000^(2i / d_model))`` for every position ``p`` in
    ``[0, max_sequence_length)``.

    Args:
        d_model: Number of columns in the table (the embedding width).
        max_sequence_length: Number of positions (rows) to encode.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length  # Number of positions (rows)
        self.d_model = d_model  # Width of each encoding (columns)

    def forward(self):
        """Build the encoding table.

        Returns:
            A float tensor of shape ``(max_sequence_length, d_model)``.
        """
        # One frequency per sine/cosine pair: exponents 0, 2, 4, ... < d_model.
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i / self.d_model)
        # Column vector of positions so the division broadcasts to
        # (max_sequence_length, number_of_pairs).
        position = (torch.arange(self.max_sequence_length)
                          .reshape(self.max_sequence_length, 1))
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        # Interleave as sin, cos, sin, cos, ... along the feature axis.
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # With an odd d_model the interleaving yields d_model + 1 columns (the
        # last sine has a surplus cosine partner); trim so the width always
        # equals d_model. This is a no-op when d_model is even.
        return PE[:, :self.d_model]