# Coding the Real Transformer from Scratch 
*(First understand the basic transformer with one encoder and one decoder and no masking.)*

<img src="transformer images/transformer_model.PNG" alt="Sample Image" style="width:1000px;">

# Input Embeddings

<img src="transformer images/embedding.PNG" alt="Sample Image" style="width:1000px;">

In [4]:
import torch
import torch.nn as nn
import math

In [11]:
class InputEmbeddings(nn.Module):
    """Token-id to vector lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Number of features in each embedding vector.
        vocab_size: Number of distinct tokens in the vocabulary.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        # Learnable table with one d_model-sized row per vocabulary entry.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embeddings of the token ids in ``x`` times sqrt(d_model)."""
        scale = math.sqrt(self.d_model)  # scaling factor used in the paper
        token_vectors = self.embedding(x)
        return token_vectors * scale

```python
# Toy vocabulary mapping each token to its integer id.
vocab = {"hello": 0, ",": 1, "there": 2}
sent = "hello , there"  # Input sentence (tokens separated by spaces)
tokenized = [vocab[word] for word in sent.split()]  # Convert tokens to ids
input_tensor = torch.tensor(tokenized)  # shape: (3,)
ie = InputEmbeddings(d_model=512, vocab_size=len(vocab))
# Call the module itself rather than .forward() so that nn.Module.__call__
# (and any registered hooks) is used, as PyTorch recommends.
output = ie(input_tensor)
print(output.shape)  # torch.Size([3, 512])
```
You can run this to see for yourself.

# Positional Encoding
<img src="transformer images/positional_encoding.PNG" alt="Sample Image" style="width:1000px;">

In [19]:
# d_model : Size of each positional-encoding vector; it matches the embedding size so the two can be added.
# seq_len : Maximum length of the sentence, because we need to create one vector for each position, as in the picture above.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    Args:
        d_model: Number of features in each embedding vector.
        seq_len: Maximum sequence length; one encoding row is built per position.
        dropout: Dropout probability applied after the encodings are added.
    """

    def __init__(self, d_model, seq_len, dropout):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # One row per position, one column per embedding feature.
        pe = torch.zeros(seq_len, d_model)
        # Position index of every token in the sentence --> (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 10000^(-2i / d_model), computed in log space.
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (seq_len, ceil(d_model / 2))
        # Sine goes to the even feature columns, cosine to the odd ones.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add the batch dimension BEFORE registering, so that the stored buffer
        # is the 3-D tensor that forward() slices.
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)

        # register_buffer saves pe with the model without making it a parameter.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions, then dropout.

        ``x`` is indexed as (batch, sequence, d_model).
        """
        # requires_grad_ (trailing underscore) is the in-place method; the
        # encodings are constants and must not receive gradients.
        x = x + self.pe[:, : x.shape[1], :].requires_grad_(False)
        return self.dropout(x)

# Layer Normalization
<img src="transformer images/ln.PNG" alt="Sample Image" style="width:1000px;">

In [20]:
class LayerNormalization(nn.Module):
    """Normalize the last dimension, then apply a learnable scale and shift.

    Each vector along the last dimension is shifted to zero mean and divided
    by its standard deviation (plus ``eps``), then multiplied by ``alpha`` and
    offset by ``bias``.

    Args:
        eps: Small constant added to the standard deviation.
    """

    def __init__(self, eps=10**-6):
        super().__init__()
        # Without eps, a zero standard deviation would make the division
        # undefined (or the result extremely large).
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))  # Multiplied
        # Must hold one element: an empty tensor (torch.zeros(0)) cannot be
        # broadcast against the normalized activations.
        self.bias = nn.Parameter(torch.zeros(1))  # Added

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dim."""
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

# Feed Forward Block

In [21]:
class FeedForwardBlock(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        ff_dim: Feature size of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, ff_dim, dropout):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, ff_dim)  # expand: d_model -> ff_dim
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(ff_dim, d_model)  # project back: ff_dim -> d_model
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the feed-forward network to the last dimension of ``x``."""
        hidden = self.linear_1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

# Multihead Attention
<img src="transformer images/Multiatt.PNG" alt="Sample Image" style="width:1000px;">

## Points to be noted. 
- We divide d_model by the number of heads, so d_model must be divisible by the number of heads.
- self.d_k: in the picture above, the small matrices have width d_k. With multi-head attention, each head sees the whole sentence but only a different slice (of size d_k) of the 512 embedding dimensions.
- If we do not want some words to interact with other words, we mask them.
- The attention formula divides by sqrt(d_k), which means attention is computed separately for each head.
- To stop a word from interacting with others, we set its attention score to a very large negative value before the softmax, so that after the softmax its weight is nearly zero and the model does not focus on it.


In [None]:
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self,d_model,h,dropout): # h : It is the number of heads we want.
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h ==0 , "d_model is not divisible by h"
    
        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model,d_model)
        self.w_k = nn.Linear(d_model,d_model)
        self.w_v = nn.Linear(d_model,d_model)
    
        self.w_o = nn.Linear(d_model,d_model)
        self.dropuout = nn.Dropout(dropout)

    def forward(self,q,k,v,mask):
        query = self.w_q(q) # (Batch , seq_len , d_model) --> (Batch , seq_len , d_model)
        key = self.w_k(k)
        value = self.w_v(v)

        # batch,seq_len,d_model --> batch,seq_len,h,d_k --> batch , h , seq_len, d_k , This means each head will see the whole sentence but a smaller part of the embedding.
        query = query.view(query.shape[0],query.shape[1],self.h,self.d_k).transpose(1,2)
        key = key.view(key.shape[0],key.shape[1],self.h,self.d_k).transpose(1,2)
        value = value.view(value.shape[0],value.shape[1],self.h,self.d_k).transpose(1,2)