## Layer Normalization

## 1. Simple Example

![image.png](attachment:3f2b6225-202f-4cab-8ea5-d864d0a809c6.png)

## A. Getting the raw layer output 

In [13]:
import torch

# Seed the global RNG first: the random input and the Linear layer's initial
# weights are both drawn from it, so the statement order below matters for
# reproducing the printed values.
torch.manual_seed(42)
# Toy batch: 2 samples with 5 features each
batch_example = torch.rand(2, 5)
print(batch_example.shape)
# Small network: Linear maps 5 input features to 6 outputs, then ReLU clamps
# negatives to zero. Shapes: (2, 5) x (5, 6) = (2, 6)
layers = torch.nn.Sequential(torch.nn.Linear(5, 6), torch.nn.ReLU())
# Forward pass of the batch through the network
output = layers(batch_example)
print(output, output.shape)

torch.Size([2, 5])
tensor([[0.8085, 0.0080, 0.0357, 0.0000, 0.4141, 0.2423],
        [0.7231, 0.0000, 0.1389, 0.0105, 0.7284, 0.3024]],
       grad_fn=<ReluBackward0>) torch.Size([2, 6])


## B. Output Normalization 

In [14]:
# Per-sample statistics: reduce over the last (feature) dimension so every row
# of the batch gets its own mean and variance. keepdim=True keeps a trailing
# size-1 dimension so the results broadcast against `output` later.

mean = torch.mean(output, dim=-1, keepdim=True)
variance = torch.var(output, dim=-1, keepdim=True)
print("Mean", mean)
print("Variance", variance)

Mean tensor([[0.2514],
        [0.3172]], grad_fn=<MeanBackward1>)
Variance tensor([[0.1012],
        [0.1121]], grad_fn=<VarBackward0>)


In [15]:
# Standardize each row: subtract its mean, then divide by its standard deviation

output_norm = (output - mean) / variance.sqrt()
# Sanity check: each row should now have mean ~0 (up to float rounding) and variance 1
mean_norm = torch.mean(output_norm, dim=-1, keepdim=True)
var_norm = torch.var(output_norm, dim=-1, keepdim=True)
print("mean", mean_norm)
print("variance", var_norm)

mean tensor([[-9.9341e-09],
        [-7.9473e-08]], grad_fn=<MeanBackward1>)
variance tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


## 2. Creating the Full Normalization layer class

In [None]:
# Implementing the normalization layer - Normalization does not change the dimension of the input
from torch import nn

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Each vector along the last dimension is normalized to zero mean and unit
    variance, then passed through a learnable element-wise affine transform
    (``scale * x_hat + shift``). The output has the same shape as the input.

    Args:
        emb_dim: Size of the last (feature/embedding) dimension of the input.
        eps: Small constant added to the variance before the square root so
            the division stays finite when a row has zero variance.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Trainable affine parameters; initialised to the identity transform
        # (shift = 0, scale = 1) so the layer starts as a plain normalization.
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.scale = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension and apply scale and shift.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Statistics are taken per row (last dim); keepdim=True lets them
        # broadcast back against x.
        mean = x.mean(dim=-1, keepdim=True)
        # Biased (divide-by-n) variance, as used by torch.nn.LayerNorm. It is
        # also well defined when the last dimension has a single element,
        # where the unbiased estimator would yield NaN.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(variance + self.eps)
        return self.scale * norm_x + self.shift

In [5]:
import torch
# Re-seeding with the same value restarts the global random stream, so the
# two draws below come out identical.
torch.manual_seed(12)
x = torch.rand((5,))
torch.manual_seed(12)
y = torch.rand((5,))
(x, y)

(tensor([0.4657, 0.2328, 0.4527, 0.5871, 0.4086]),
 tensor([0.4657, 0.2328, 0.4527, 0.5871, 0.4086]))