# Coding the Real Transformer from Scratch 
*(First make sure you understand the basic transformer with one encoder and one decoder, without masking.)*

<img src="transformer images/transformer_model.PNG" alt="Sample Image" style="width:1000px;">

# Input Embeddings

<img src="transformer images/embedding.PNG" alt="Sample Image" style="width:1000px;">

In [None]:
import torch
import torch.nn as nn
import math

In [None]:
class InputEmbeddings(nn.Module):
    """Look up token embeddings and scale them by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of entries in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None: # d_model : dimensions , vocab_size : How many words are there in the vocabulary.
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Map token ids to scaled embeddings.

        The embedding lookup appends a trailing dimension of size d_model
        to the shape of ``x``.
        """
        scale = math.sqrt(self.d_model)  # scaling factor used in the paper
        embedded = self.embedding(x)
        return embedded * scale

```python
vocab = {"hello": 0, ",": 1, "there": 2}
sent = "hello , there"  # Input sentence
tokenized = [vocab[word] for word in sent.split()]  # Convert words to indices
input_tensor = torch.tensor(tokenized)
ie = InputEmbeddings(d_model=512, vocab_size=len(vocab))
output = ie.forward(input_tensor)
print(output.shape)
```
You can run this to see for yourself.

# Positional Encoding
<img src="transformer images/positional_encoding.PNG" alt="Sample Image" style="width:1000px;">

In [None]:
# d_model : size of each positional vector; it is the same size as the embedding vector so the two can be added.
# seq_len : maximum length of the sentence, because we need to create one vector for each position, as in the picture above.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is built once in the constructor and stored as the buffer
    ``pe`` of shape (1, seq_len, d_model).

    Args:
        d_model: Size of each embedding vector (odd values are supported).
        seq_len: Number of positions to precompute encodings for.
        dropout: Dropout probability applied after the encodings are added.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # One row per position, one column per embedding feature.
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000 ** (2i / d_model), computed in log space: (ceil(d_model / 2),)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # position / 10000 ** (2i / d_model): (seq_len, ceil(d_model / 2))
        angles = position * div_term
        # Even feature indices get the sine.
        pe[:, 0::2] = torch.sin(angles)
        # Odd feature indices get the cosine. When d_model is odd there is one
        # fewer odd column than even column, so drop the last frequency.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Add a leading batch dimension: (1, seq_len, d_model)
        pe = pe.unsqueeze(0)
        # A buffer is saved with the module and moved by .to(), but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions, then apply dropout."""
        # The buffer is not a parameter, so no gradient flows into it.
        x = x + self.pe[:, :x.shape[1], :] # (batch, seq_len, d_model)
        return self.dropout(x)

# Layer Normalization
<img src="transformer images/ln.PNG" alt="Sample Image" style="width:1000px;">

In [None]:
class LayerNormalization(nn.Module):
    """Normalize the last dimension, then apply a learnable scale and shift.

    Args:
        features: Size of the last dimension of the input.
        eps: Small constant added to the standard deviation in the denominator.
    """

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        # Without eps a zero (or tiny) standard deviation would make the
        # division undefined or produce huge values.
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features))  # learnable scale
        self.bias = nn.Parameter(torch.zeros(features))  # learnable shift

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dimension."""
        # keepdim=True keeps a size-1 last axis so the statistics broadcast against x.
        centered = x - x.mean(dim=-1, keepdim=True)
        # x.std is called with its default settings.
        spread = x.std(dim=-1, keepdim=True)
        scaled = self.alpha * centered
        return scaled / (spread + self.eps) + self.bias

# Feed Forward Block

In [None]:
class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)  # w1 and b1
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)  # w2 and b2

    def forward(self, x):
        """Expand the last dimension to d_ff, apply ReLU and dropout, project back to d_model."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

# Multihead Attention
<img src="transformer images/Multiatt.PNG" alt="Sample Image" style="width:1000px;">

## Points to be noted. 
- We need to divide d_model by the number of heads.
- self.d_k: See the picture above; the small matrices have width d_k. With multi-head attention, each head sees the whole sentence but only a different slice of the 512 embedding dimensions. d_k is the size of that slice.
- If we want some words not to interact with other words, we mask them.
- As you can see in the attention formula, we divide by sqrt(d_k), which means the scaling is computed per head.
- So if we want some words not to interact with others, we set their scores to a very large negative value, so that after softmax the weights are nearly zero and the model does not focus on them.

### Arguments:

- d_model: Dimensionality of the input embeddings. This is the size of the input vectors.
- h: Number of attention heads. Each head processes a subset of the input embedding.
- dropout: Probability for dropout, used to prevent overfitting.

### Importance of d_k

- Multi-head attention divides the d_model dimension into h smaller chunks (d_k).
- The embeddings are split across the heads, allowing the model to focus on multiple parts of the sequence in parallel.
- d_k is the size of each attention head's embedding (d_model / h).
- Smaller chunks of embeddings are easier to compute and allow independent attention mechanisms in each head.

### w_o importance
- w_o: This merges the attention outputs from all heads back into the original embedding space.

### Attention Scores
- query @ key.transpose(-2, -1):
  - Performs scaled dot-product attention.
  - query: (batch, h, seq_len, d_k)
  - key.transpose(-2, -1): (batch, h, d_k, seq_len)
  - Output: (batch, h, seq_len, seq_len).
- Division by sqrt(d_k):
  - Scales the scores to stabilize gradients (avoids very large/small values).

### Why Masking?
- Prevents the model from attending to padding tokens in the sequence by setting their scores to a very low value (-1e9).

###  Reshaping and Transposing

``` query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)```
- Reshaping:
  - Splits the last dimension (d_model) into h heads and d_k dimensions: (batch, seq_len, d_model) → (batch, seq_len, h, d_k).
- Transposing:
  - Brings h to the second dimension for computation: (batch, seq_len, h, d_k) → (batch, h, seq_len, d_k).
 
### Restoring Shape

```x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)```
- Transpose Back:
  - Brings the h dimension back to its original position.
- Contiguous and View:
  - Ensures the tensor is in memory order and reshaped to (batch, seq_len, d_model).
 
### Final Linear Transformation

```return self.w_o(x)```
- Purpose:
  - Combines the attention outputs of all heads back into the original embedding space.

## Understanding the Dimension Manipulation over here.
#### Linear Projection (nn.Linear)
Each of q, k, v passes through a learnable linear layer to project the input embeddings into new spaces:

```python
query = self.w_q(q)  # Shape: (Batch, Seq_Len, d_model)
key   = self.w_k(k)  # Shape: (Batch, Seq_Len, d_model)
value = self.w_v(v)  # Shape: (Batch, Seq_Len, d_model)
```


- Significance:
  - The Linear layer ensures that the embeddings can be transformed into different spaces for the Query, Key, and Value. This is necessary because their roles in attention computation differ.
 
#### Reshape Using view: Splitting Heads
- The next operation reshapes the tensor to separate the d_model dimension into multiple heads (h), each of size d_k:

```python
query = query.view(Batch, Seq_Len, h, d_k)
key   = key.view(Batch, Seq_Len, h, d_k)
value = value.view(Batch, Seq_Len, h, d_k)
```


```Input Shape: (Batch, Seq_Len, d_model)```

```Output Shape: (Batch, Seq_Len, h, d_k)```

#### Why Split Heads?

- Multi-head attention allows multiple subspaces (heads) to attend to different parts of the sequence independently.
d_k = d_model / h: Each head operates on a smaller embedding space.
Example: For Batch=2, Seq_Len=4, d_model=12, h=3:

```Before view: (2, 4, 12)```

```After view: (2, 4, 3, 4) (3 heads, each with 4 dimensions).```

#### Transpose: Rearrange Dimensions for Attention
The tensor is then transposed to make the h (heads) dimension adjacent to the Batch dimension:

```python
query = query.transpose(1, 2)  # Shape: (Batch, Seq_Len, h, d_k) → (Batch, h, Seq_Len, d_k)
key   = key.transpose(1, 2)    # Same transformation
value = value.transpose(1, 2)  # Same transformation
```

```Input Shape: (Batch, Seq_Len, h, d_k)```

```Output Shape: (Batch, h, Seq_Len, d_k)```

#### Why Transpose?

- Attention is calculated independently for each head.
- By moving h next to Batch, we can easily parallelize computations for all heads.
```Example: For Batch=2, Seq_Len=4, h=3, d_k=4:```

```Before transpose: (2, 4, 3, 4)```

```After transpose: (2, 3, 4, 4) (Batch=2, Heads=3, Seq_Len=4, d_k=4).```

#### Intuition Behind Dimensions and Their Role
- Batch and h adjacency: Each batch contains multiple sequences. Each sequence is split into h heads, so this adjacency allows us to calculate attention across all heads in one step.
- Seq_Len and d_k: After transposition, these remain the dimensions of focus for each head. Each head processes the same sequence length (Seq_Len) but focuses on its respective subspace of features (d_k).
#### Flow of Computation
- Preparation: The input embeddings are projected by a linear layer and then reshaped with view to split them into h heads, producing (Batch, Seq_Len, h, d_k).

- Transpose: Rearrange dimensions with .transpose(1, 2) to get (Batch, h, Seq_Len, d_k). Now:

  - Each head’s attention operates on its slice (Seq_Len, d_k).
  - Computation for all heads is parallelized because Batch and h are adjacent.
- Attention Calculation: Compute attention per head:

- *𝑄𝐾⊤* becomes ```(Batch, h, Seq_Len, Seq_Len).```
- Multiply with V, producing (Batch, h, Seq_Len, d_k).
#### Reconstruction: 
- Concatenate the heads back after attention computations and project the combined output back to the original embedding space.

#### Why Does Moving h Next to Batch Help?
- Placing h next to Batch ensures all heads are processed simultaneously by modern hardware (e.g., GPUs/TPUs).
- The resulting shape (Batch, h, Seq_Len, d_k) simplifies tensor operations because heads are now treated as additional batches for computation purposes.

#### Why self.w_o?
Now, the self.w_o layer is needed because:

- **Projection into the desired output space:** Even though the output tensor x has the correct shape (Batch, Seq_Len, d_model), it doesn't necessarily have the correct content yet.
  - The purpose of self.w_o is to project the concatenated and attended outputs of all heads back into the desired feature space.
  - This is important because, during the attention process, each head computes its own set of attention scores, and the resulting tensor might not represent the desired feature mapping.
  - self.w_o ensures that the output is appropriately transformed back into the space of d_model features that the model expects after the attention step.
- **Learnable transformation:** self.w_o introduces learnable parameters that allow the model to learn how to combine the results from different heads and project them into the appropriate feature space.
  - The learned weights in self.w_o allow the model to mix and combine information from all attention heads more effectively.

## Understanding the Formula: Example with Simple Values

Let’s work through a simple example to see how the attention formula works.

### Input Sentence:
Consider the sentence "The cat sleeps." (This is a simplified example with 3 words).

Let’s assume that each word has been embedded into a vector of size \( d_k = 3 \), and we'll use toy values for simplicity.

### Step 1: Construct Query, Key, and Value Matrices

For simplicity, let’s assume:

\[
Q =
\begin{bmatrix}
1 & 0 & 0 \\
0 & 1 & 0 \\
0 & 0 & 1
\end{bmatrix}, 
\quad 
K =
\begin{bmatrix}
1 & 1 & 0 \\
0 & 1 & 1 \\
1 & 0 & 1
\end{bmatrix}, 
\quad 
V =
\begin{bmatrix}
0 & 1 & 1 \\
1 & 0 & 1 \\
1 & 1 & 0
\end{bmatrix}
\]

Each row in \( Q \), \( K \), and \( V \) corresponds to a vector representation of the words "The", "cat", and "sleeps", respectively.

### Step 2: Compute the Dot Product \( QK^T \)

Now, we compute the dot product between \( Q \) and \( K^T \). Because \( Q \) is the identity matrix in this toy example, the product is simply \( K^T \):

\[
QK^T =
\begin{bmatrix}
1 & 0 & 0 \\
0 & 1 & 0 \\
0 & 0 & 1
\end{bmatrix}
\begin{bmatrix}
1 & 0 & 1 \\
1 & 1 & 0 \\
0 & 1 & 1
\end{bmatrix}
=
\begin{bmatrix}
1 & 0 & 1 \\
1 & 1 & 0 \\
0 & 1 & 1
\end{bmatrix}
\]

This gives us a matrix that indicates how much attention each word should pay to each other word. For example, the first row \( [1, 0, 1] \) means the word "The" pays attention to "The" and "sleeps", but not to "cat".

### Step 3: Scale by \( \sqrt{d_k} \)

For \( d_k = 3 \), we divide the dot product by \( \sqrt{d_k} = \sqrt{3} \):

\[
\frac{QK^T}{\sqrt{d_k}} = 
\begin{bmatrix}
\frac{1}{\sqrt{3}} & 0 & \frac{1}{\sqrt{3}} \\
\frac{1}{\sqrt{3}} & \frac{1}{\sqrt{3}} & 0 \\
0 & \frac{1}{\sqrt{3}} & \frac{1}{\sqrt{3}}
\end{bmatrix}
\]

### Step 4: Apply Softmax

Next, we apply the softmax to each row of the matrix to normalize the values. With \( e^{1/\sqrt{3}} \approx 1.78 \) and \( e^{0} = 1 \), each row sums to about \( 4.56 \) before normalization:

\[
\text{Softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) = 
\text{Softmax} \left( 
\begin{bmatrix}
\frac{1}{\sqrt{3}} & 0 & \frac{1}{\sqrt{3}} \\
\frac{1}{\sqrt{3}} & \frac{1}{\sqrt{3}} & 0 \\
0 & \frac{1}{\sqrt{3}} & \frac{1}{\sqrt{3}}
\end{bmatrix}
\right)
\approx
\begin{bmatrix}
0.39 & 0.22 & 0.39 \\
0.39 & 0.39 & 0.22 \\
0.22 & 0.39 & 0.39
\end{bmatrix}
\]

This gives the normalized attention weights for each word in the sequence, indicating how much focus each word should pay to others.

### Step 5: Weighted Sum with Value Matrix

Now, we multiply the attention scores with the Value matrix \( V \):

\[
\text{Attention Output} = 
\text{Softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) \times V \approx
\begin{bmatrix}
0.39 & 0.22 & 0.39 \\
0.39 & 0.39 & 0.22 \\
0.22 & 0.39 & 0.39
\end{bmatrix}
\begin{bmatrix}
0 & 1 & 1 \\
1 & 0 & 1 \\
1 & 1 & 0
\end{bmatrix}
\]

The resulting matrix is:

\[
\text{Attention Output} \approx
\begin{bmatrix}
0.61 & 0.78 & 0.61 \\
0.61 & 0.61 & 0.78 \\
0.78 & 0.61 & 0.61
\end{bmatrix}
\]

### Result:

The output of this step is a new set of vectors that are influenced by the attention scores. These vectors are now context-aware because each word’s representation has been adjusted based on its relationship to other words in the sequence. For example, the vector for "The" has been adjusted based on its relationship to "sleeps" and "cat".


In [None]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Size of the embedding vectors; must be divisible by ``h``.
        h: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.

    After each forward pass the attention weights of that pass are kept on
    ``self.attention_scores`` so they can be inspected or visualized.
    """

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model # Embedding vector size
        self.h = h # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value: Tensors whose last dimension is d_k; ``forward``
                passes them as (batch, h, seq_len, d_k).
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are excluded from attention. May be None.
            dropout: Optional dropout module applied to the attention weights.
                May be None.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        d_k = query.shape[-1]
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (standing in for -inf) where mask == 0.
            # -1e9 is not representable in float16, so fall back to the most
            # negative finite value of the score dtype when needed; float32
            # and float64 still use exactly -1e9.
            fill_value = max(-1e9, torch.finfo(attention_scores.dtype).min)
            attention_scores.masked_fill_(mask == 0, fill_value)
        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # The weights are returned as well so they can be used for visualization.
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        """Project q, k and v, attend per head, then merge the heads with w_o.

        Args:
            q, k, v: Tensors of shape (batch, seq_len, d_model).
            mask: Passed through to ``attention``; may be None.

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        # Each head sees the whole sentence but only a d_k-wide slice of the embedding.
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        # contiguous() is required because view cannot reshape the transposed tensor directly.
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)
        

# Residual Connection
Here the sublayer can be the multi head attention layer. Check the diagram.

In [None]:
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: normalize, run the sublayer, apply dropout, add the input.

    Args:
        features: Feature size handed to the layer normalization.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``.

        ``sublayer`` is any callable that takes the normalized tensor.
        """
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

# Encoder Block

In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside a residual connection.

    Args:
        features: Feature size passed to the residual connections.
        self_attention_block: Attention module called with (x, x, x, src_mask).
        feed_forward_block: Feed-forward module.
        dropout: Dropout probability used by both residual connections.
    """

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        connections = [ResidualConnection(features, dropout) for _ in range(2)]
        self.residual_connections = nn.ModuleList(connections)

    def forward(self, x, src_mask): # we use mask in encoding, because we want to hide the pad tokens.
        """Run self-attention and then the feed-forward block on ``x``."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(normed):
            # Query, key and value are all the same tensor in self-attention.
            return self.self_attention_block(normed, normed, normed, src_mask)

        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)
    
class Encoder(nn.Module):
    """Apply a stack of encoder layers in order, followed by a final layer normalization.

    Args:
        features: Feature size handed to the final layer normalization.
        layers: The layers to apply; each is called as ``layer(x, mask)``.
    """

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        """Feed ``x`` through every layer with the same ``mask`` and normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

# Decoder Block

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention and feed-forward, each inside a residual connection.

    Args:
        features: Feature size passed to the residual connections.
        self_attention_block: Attention module called with (x, x, x, tgt_mask).
        cross_attention_block: Attention module whose keys and values are the encoder output.
        feed_forward_block: Feed-forward module.
        dropout: Dropout probability used by all three residual connections.
    """

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        connections = [ResidualConnection(features, dropout) for _ in range(3)]
        self.residual_connections = nn.ModuleList(connections)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sublayers in order and return the updated ``x``."""
        self_residual, cross_residual, feed_forward_residual = self.residual_connections

        def self_attend(normed):
            # Self-attention over the decoder input, restricted by tgt_mask.
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # Queries come from the decoder; keys and values come from the encoder output.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        x = self_residual(x, self_attend)
        x = cross_residual(x, cross_attend)
        return feed_forward_residual(x, self.feed_forward_block)
    
class Decoder(nn.Module):
    """Apply a stack of decoder layers in order, followed by a final layer normalization.

    Args:
        features: Feature size handed to the final layer normalization.
        layers: The layers to apply; each is called as
            ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every layer with the same encoder output and masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

# Linear Layer

In [None]:
class ProjectionLayer(nn.Module):
    """Linear layer that maps decoder features to one score per vocabulary entry.

    Args:
        d_model: Size of the incoming feature vectors.
        vocab_size: Number of output scores per position.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """Project the last dimension from d_model to vocab_size.

        Returns the raw output of the linear layer; no softmax is applied here.
        """
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)

# Transformer Block

In [None]:
class Transformer(nn.Module):
    """Bundle the embeddings, positional encodings, encoder, decoder and projection layer.

    The three stages are exposed as separate methods (``encode``, ``decode``,
    ``project``) rather than as a single ``forward``.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        # Submodules are registered in this order; it determines the order of parameters().
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed the source tokens, add positional encodings and run the encoder."""
        # Result: (batch, seq_len, d_model)
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        """Embed the target tokens, add positional encodings and run the decoder against the encoder output."""
        # Result: (batch, seq_len, d_model)
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to ``x``."""
        # Result: (batch, seq_len, vocab_size)
        return self.projection_layer(x)

# Build Transformers

In [None]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=1, h: int=1, dropout: float=0.1, d_ff: int=512) -> Transformer:
    """Assemble a Transformer and Xavier-initialize its weight matrices.

    Args:
        src_vocab_size: Vocabulary size for the source embeddings.
        tgt_vocab_size: Vocabulary size for the target embeddings and the projection layer.
        src_seq_len: Number of positions precomputed for the source positional encoding.
        tgt_seq_len: Number of positions precomputed for the target positional encoding.
        d_model: Embedding / feature size used throughout the model.
        N: Number of encoder blocks and number of decoder blocks.
        h: Number of attention heads in every attention block.
        dropout: Dropout probability used in every submodule.
        d_ff: Hidden size of the feed-forward blocks.

    Returns:
        The assembled Transformer.
    """
    # Embeddings and positional encodings, source side first.
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N encoder blocks: self-attention + feed-forward.
    encoder_blocks = [
        EncoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder blocks: self-attention + cross-attention + feed-forward.
    decoder_blocks = [
        DecoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Re-initialize every parameter with more than one dimension (the weight
    # matrices); one-dimensional parameters keep their constructor values.
    for weight in (p for p in transformer.parameters() if p.dim() > 1):
        nn.init.xavier_uniform_(weight)

    return transformer

# TOKENIZER

In [None]:
import torch
import torch.nn as nn
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from pathlib import Path
from torch.utils.data import Dataset, DataLoader,random_split

## Build Tokenizer

In [None]:
def get_all_sentences(ds,lang):
    """Lazily yield ``item['translation'][lang]`` for every item in ``ds``."""
    yield from (example['translation'][lang] for example in ds)

def get_or_build_tokenizer(config,ds,lang):
    """Load the word-level tokenizer for ``lang`` from disk, or train and save it.

    Args:
        config: Mapping with a 'tokenizer_file' entry: a path template that is
            formatted with ``lang`` (e.g. a name containing ``{0}``).
        ds: Dataset whose items expose ``item['translation'][lang]``.
        lang: Language key used both in the file name and to pick sentences.

    Returns:
        The loaded or newly trained tokenizer.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    # min_frequency=2: a word must appear at least twice to enter the vocabulary.
    trainer = WordLevelTrainer(special_tokens=["[UNK]","[PAD]","[SOS]","[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(ds,lang),trainer=trainer)
    # Create the target folder first so saving does not fail on a fresh checkout.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

## Load the dataset , build the tokenizer and Split

### Overview of BilingualDataset Class:
This is a custom Dataset class used for preparing a bilingual dataset for sequence-to-sequence tasks, such as machine translation. It takes pairs of source and target texts, tokenizes them, and prepares them in the required format for feeding into a model.

```python
def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
    super().__init__()
    self.ds = ds
    self.tokenizer_src = tokenizer_src
    self.tokenizer_tgt = tokenizer_tgt
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang

    self.sos_token = torch.tensor([tokenizer_src.token_to_id('[SOS]')], dtype=torch.int64)  # SOS token for source
    self.eos_token = torch.tensor([tokenizer_src.token_to_id('[EOS]')], dtype=torch.int64)  # EOS token for source
    self.pad_token = torch.tensor([tokenizer_src.token_to_id('[PAD]')], dtype=torch.int64)  # PAD token for source
```
#### Purpose of __init__: 
- The constructor initializes the dataset and tokenizer for both the source and target languages, and sets up the special tokens (SOS, EOS, PAD) for the source language using the source tokenizer.
- sos_token, eos_token, and pad_token are tensors holding the indices of the corresponding tokens in the tokenizer's vocabulary.

```python
def __len__(self):
    return len(self.ds)
```


#### Purpose of __len__: 
- Returns the number of data samples in the dataset, which is simply the length of self.ds. This is needed for PyTorch to know the total number of items in the dataset when iterating.

```python
def __getitem__(self, index):
    src_target_pair = self.ds[index]
    src_text = src_target_pair['translation'][self.src_lang]
    tgt_text = src_target_pair['translation'][self.tgt_lang]
```

#### Purpose of getitem: 
- This function retrieves a source-target pair from the dataset self.ds at the given index.
- src_text and tgt_text are the actual source and target sentences based on the src_lang and tgt_lang keys in the dataset.

#### Tokenization and Padding Calculation:
```python
enc_input_tokens = self.tokenizer_src.encode(src_text).ids
dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2  # Adding SOS and EOS to the source
dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1  # Adding only SOS to the target
```

#### Purpose of padding calculation: 
- The sentences (src_text and tgt_text) are tokenized into numerical indices using the source and target tokenizers (tokenizer_src and tokenizer_tgt).
- Padding tokens are calculated to ensure the source and target sequences are of the desired length seq_len. We subtract 2 for source padding (SOS and EOS), and 1 for target padding (SOS only, since we stop predicting once EOS is generated).

#### Padding and Token Concatenation:
```python
if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
    raise ValueError('Sentence is too long')

# Add SOS and EOS to the source text
encoder_input = torch.cat([
    self.sos_token,
    torch.tensor(enc_input_tokens, dtype=torch.int64),
    self.eos_token,
    torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64)
])

# Add SOS to the target text
decoder_input = torch.cat([
    self.sos_token,
    torch.tensor(dec_input_tokens, dtype=torch.int64),
    torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)
])

# Add EOS to the label text
label = torch.cat([
    torch.tensor(dec_input_tokens, dtype=torch.int64),
    self.eos_token,
    torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)
])
```

#### Purpose of concatenation:
- encoder_input: The source sequence is concatenated with the SOS token at the start, the EOS token at the end, and padding tokens (if necessary).
- decoder_input: The target sequence is concatenated with the SOS token at the start and padding tokens at the end.
- label: The target sequence (without SOS) is followed by the EOS token and then padded. This represents the true target output for the decoder.

#### Assertion to Check Lengths:
```python
assert encoder_input.size(0) == self.seq_len
assert decoder_input.size(0) == self.seq_len
assert label.size(0) == self.seq_len
```

- Purpose of assertion: Ensures that the length of encoder_input, decoder_input, and label matches the desired seq_len.

#### Masks and Returning the Data:
```python
return {
    "encoder_input": encoder_input,  # (seq_len)
    "decoder_input": decoder_input,  # (seq_len)
    "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),  # (1, 1, seq_len)
    "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & casual_mask(decoder_input.size(0)),  # (1, seq_len) & (1, seq_len, seq_len)
    "label": label,
    "src_text": src_text,
    "target_text": tgt_text
}
```

#### Purpose of mask and returning data: 
- Returns a dictionary containing:
- encoder_input, decoder_input, and label: The tokenized and padded inputs and target sequences.
- encoder_mask: A mask for the encoder, where padding tokens are marked as 0 and non-padding tokens as 1.
- decoder_mask: A mask for the decoder, including the causal mask (explained below).
- src_text, target_text: The original source and target text sentences.
- Causal Mask:
```python
def casual_mask(size):
    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return mask == 0
```
#### Purpose of casual mask:
- This function creates a causal mask used in the decoder to prevent attending to future tokens during training (as per the "autoregressive" nature of the decoder). It ensures that each position in the decoder can only attend to itself and previous positions, but not to future tokens.
- The torch.triu() function creates an upper triangular matrix, and mask == 0 ensures that only the lower triangular part is True (valid positions), while the rest are False (invalid positions).

In [None]:
class BilingualDataset(Dataset):
    """Turn raw translation pairs into fixed-length token tensors and attention masks.

    Each item of ``ds`` is indexed as ``item['translation'][lang]`` to obtain the
    source and target sentences. Both tokenizers must provide ``encode(text).ids``
    and ``token_to_id(token)``.

    Args:
        ds: Indexable collection of translation pairs.
        tokenizer_src: Tokenizer used for the source sentence.
        tokenizer_tgt: Tokenizer used for the target sentence; the [SOS], [EOS]
            and [PAD] ids are looked up here and reused for the encoder input.
        src_lang: Key of the source language inside ``item['translation']``.
        tgt_lang: Key of the target language inside ``item['translation']``.
        seq_len: Fixed length every returned sequence is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Special tokens are kept as 1-element int64 tensors so they can be concatenated directly.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Return the number of translation pairs."""
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor holding ``count`` copies of the [PAD] id."""
        # torch.full builds the run in one call instead of converting a Python
        # list of 1-element tensors element by element.
        return torch.full((count,), self.pad_token.item(), dtype=torch.int64)

    def __getitem__(self, idx):
        """Build the encoder input, decoder input, label and masks for pair ``idx``.

        Returns:
            dict with ``encoder_input``, ``decoder_input`` and ``label`` (each of
            length ``seq_len``), ``encoder_mask`` of shape (1, 1, seq_len),
            ``decoder_mask`` of shape (1, seq_len, seq_len), plus the raw
            ``src_text`` and ``tgt_text`` strings.

        Raises:
            ValueError: If either sentence does not fit into ``seq_len`` once
                the special tokens are added.
        """
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Transform the text into tokens
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries both [SOS] and [EOS], hence -2.
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        # The decoder input carries only [SOS] and the label only [EOS], hence -1 for both.
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # A negative padding count means the sentence does not fit into seq_len.
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        enc_tokens = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_tokens = torch.tensor(dec_input_tokens, dtype=torch.int64)
        dec_padding = self._padding(dec_num_padding_tokens)

        # [SOS] + source tokens + [EOS] + padding
        encoder_input = torch.cat(
            [self.sos_token, enc_tokens, self.eos_token, self._padding(enc_num_padding_tokens)],
            dim=0,
        )

        # [SOS] + target tokens + padding
        decoder_input = torch.cat([self.sos_token, dec_tokens, dec_padding], dim=0)

        # target tokens + [EOS] + padding
        label = torch.cat([dec_tokens, self.eos_token, dec_padding], dim=0)

        # Double check the size of the tensors to make sure they are all seq_len long
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            # Padding positions are 0, real tokens are 1.
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),  # (1, 1, seq_len)
            # Padding mask combined with the causal mask: (1, seq_len) & (1, seq_len, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def causal_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Position i may attend to positions 0..i; every position after i is False.
    """
    # Keeping the lower triangle (diagonal included) is the complement of the
    # strictly upper triangle, i.e. of the future positions.
    allowed = torch.ones(1, size, size).tril()
    return allowed.bool()



def get_ds(config):
    """Load the raw dataset, build tokenizers and return the train/validation loaders.

    Args:
        config: Configuration dict. Reads 'lang_src', 'lang_tgt', 'seq_len',
            'batch_size' and, when present, 'datasource' (defaults to
            'opus_books').

    Returns:
        Tuple ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
    """
    lang_src = config['lang_src']
    lang_tgt = config['lang_tgt']

    # The dataset name comes from the config; the default keeps configs without the key working.
    ds_raw = load_dataset(config.get('datasource', 'opus_books'), f"{lang_src}-{lang_tgt}", split='train')

    # build the tokenizer
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, lang_tgt)

    # keep 90% for training and 10% for validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    # train_ds and val_ds
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])

    # Report the longest tokenized sentence on each side to help choose seq_len.
    max_len_src = 0
    max_len_tgt = 0
    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][lang_src]).ids
        # Target sentences must be measured with the target tokenizer.
        tgt_ids = tokenizer_tgt.encode(item['translation'][lang_tgt]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))
    print(f"Max length of Soure sentence : {max_len_src}")
    print(f"Max length of target sentence : {max_len_tgt}")

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Validation uses batch size 1.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt
    

In [None]:
%%writefile config.py
from pathlib import Path
def get_config():
    """Return a fresh dictionary with the default training and model settings."""
    settings = dict(
        # optimisation
        batch_size=8,
        num_epochs=20,
        lr=10**-4,
        # model dimensions
        seq_len=350,
        d_model=512,
        # data
        datasource='opus_books',
        lang_src="en",
        lang_tgt="it",
        # checkpoints, tokenizers and logging
        model_folder="weights",
        model_basename="tmodel_",
        preload=None,
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/tmodel",
    )
    return settings
def get_weights_file_path(config,epoch:str):
    """Return the checkpoint path for ``epoch`` as a string.

    The file is named ``<model_basename><epoch>.pt`` and lives in
    ``config['model_folder']``, relative to the current directory.
    """
    checkpoint_name = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.').joinpath(config['model_folder'], checkpoint_name))
def latest_weights_file_path(config):
    """Return the path of the newest checkpoint in ``config['model_folder']``.

    Checkpoints are the files whose names start with
    ``config['model_basename']``. "Newest" means last in sorted path order.

    Returns:
        The checkpoint path as a string, or None when no checkpoint exists.
    """
    # Search the folder that get_weights_file_path writes to; looking in a
    # "<datasource>_<model_folder>" directory never matches a saved checkpoint.
    model_folder = Path(config['model_folder'])
    weights_files = sorted(model_folder.glob(f"{config['model_basename']}*"))
    if not weights_files:
        return None
    return str(weights_files[-1])

# Training loop

In [None]:
def get_model(config,vocab_src_len,vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used as both the source and the target sequence
    length, and ``config['d_model']`` as the model dimension.
    """
    seq_len = config['seq_len']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, config['d_model'])


In [None]:
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from config import get_config,get_weights_file_path

def train_model(config):
    """Train the transformer and save a checkpoint after every epoch.

    Args:
        config: Configuration dict. Reads 'model_folder', 'experiment_name',
            'lr', 'preload', 'num_epochs' and whatever get_ds, get_model and
            get_weights_file_path read.

    When ``config['preload']`` is set, the matching checkpoint is loaded and
    training resumes from the epoch after the stored one.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using Device {device}")

    # Create the model Folder
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    # dataset (the validation loader is returned by get_ds but not used in this loop)
    train_dataloader, _val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    # model
    tgt_vocab_size = tokenizer_tgt.get_vocab_size()
    model = get_model(config, tokenizer_src.get_vocab_size(), tgt_vocab_size).to(device)

    # TensorBoard
    writer = SummaryWriter(config['experiment_name'])

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # Premodel loading
    initial_epoch = 0
    global_step = 0
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on GPU be resumed on CPU and vice versa.
        state = torch.load(model_filename, map_location=device)
        # Restore the weights too; otherwise training resumes from a freshly
        # initialised model with a stale optimizer state.
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    # loss function: labels are target-vocabulary ids, so the ignored padding
    # id must come from the target tokenizer.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    # Training loop
    for epoch in range(initial_epoch, config['num_epochs']):
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (B, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tgt_vocab_size), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # Save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)

    # Release the TensorBoard event file once training is finished.
    writer.close()

# ## TRAIN THE MODEL
# Build the default configuration dictionary returned by get_config().
config = get_config()
# The training call is left commented out; uncomment it to start a training run.
# train_model(config)

In [None]:
from config import *
def translate(sentence: str):
    """Translate ``sentence`` with the trained model using greedy decoding.

    Args:
        sentence: Text to translate. If it is an int or a string of digits it
            is used as an index into the dataset instead, and the stored
            source/target pair is printed alongside the prediction.

    Returns:
        The decoded text of every id in the decoder sequence, which starts
        with the [SOS] id.

    The source, the optional reference target and each predicted token are
    printed while decoding.
    """
    # Define the device, tokenizers, and model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    config = get_config()
    tokenizer_src = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_src']))))
    tokenizer_tgt = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_tgt']))))
    model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(), config["seq_len"], config['seq_len'], d_model=config['d_model']).to(device)

    # Load the pretrained weights. Prefer the newest checkpoint; when none is
    # found, fall back to the checkpoint of the last configured epoch. Building
    # the path with get_weights_file_path avoids a hard-coded Windows separator.
    model_filename = latest_weights_file_path(config)
    if model_filename is None:
        model_filename = get_weights_file_path(config, f"{config['num_epochs'] - 1:02d}")
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    state = torch.load(model_filename, map_location=device)
    model.load_state_dict(state['model_state_dict'])

    # if the sentence is a number use it as an index to the test set
    label = ""
    sample_idx = None
    if isinstance(sentence, int) or sentence.isdigit():
        sample_idx = int(sentence)
        ds = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='all')
        ds = BilingualDataset(ds, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
        sample = ds[sample_idx]
        sentence = sample['src_text']
        label = sample["tgt_text"]
    seq_len = config['seq_len']

    # translate the sentence
    model.eval()
    with torch.no_grad():
        # Precompute the encoder output and reuse it for every generation step
        source = tokenizer_src.encode(sentence)
        source = torch.cat([
            torch.tensor([tokenizer_src.token_to_id('[SOS]')], dtype=torch.int64),
            torch.tensor(source.ids, dtype=torch.int64),
            torch.tensor([tokenizer_src.token_to_id('[EOS]')], dtype=torch.int64),
            torch.tensor([tokenizer_src.token_to_id('[PAD]')] * (seq_len - len(source.ids) - 2), dtype=torch.int64)
        ], dim=0).to(device)
        source_mask = (source != tokenizer_src.token_to_id('[PAD]')).unsqueeze(0).unsqueeze(0).int().to(device)
        encoder_output = model.encode(source, source_mask)

        # Initialize the decoder input with the sos token
        decoder_input = torch.empty(1, 1).fill_(tokenizer_tgt.token_to_id('[SOS]')).type_as(source).to(device)

        # Print the source sentence and target start prompt
        if label != "": print(f"{f'ID: ':>12}{sample_idx}")
        print(f"{f'SOURCE: ':>12}{sentence}")
        if label != "": print(f"{f'TARGET: ':>12}{label}")
        print(f"{f'PREDICTED: ':>12}", end='')

        eos_id = tokenizer_tgt.token_to_id('[EOS]')

        # Generate the translation word by word
        while decoder_input.size(1) < seq_len:
            # Use the same causal mask as training (1 = may attend, lower
            # triangle including the diagonal). The raw upper-triangular matrix
            # is its inverse and would hide the tokens generated so far.
            decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)
            out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

            # project next token and pick the highest-scoring id (greedy)
            prob = model.project(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_id = next_word.item()
            decoder_input = torch.cat([decoder_input, torch.empty(1, 1).type_as(source).fill_(next_id).to(device)], dim=1)

            # print the translated word
            print(f"{tokenizer_tgt.decode([next_id])}", end=' ')

            # break if we predict the end of sentence token
            if next_id == eos_id:
                break

    # convert ids to tokens
    return tokenizer_tgt.decode(decoder_input[0].tolist())

In [None]:

# Example: translate one sentence; translate() prints the prediction while decoding.
translate("Hello how are you?")