## **Layer Normalization**

In [1]:
import torch

from torch import nn

In [2]:
# Seed the global RNG first so that both the sample batch and the layer's
# initial weights are reproducible after a kernel restart.
torch.manual_seed(123)

# Sample batch: 2 rows of 5 features drawn from a standard normal.
batch_example = torch.randn(2, 5)

# Toy layer: linear projection from 5 to 6 features followed by ReLU.
layer = nn.Sequential(
    nn.Linear(5, 6),
    nn.ReLU(),
)

In [3]:
# Run the sample batch through the toy layer and show the activations.
out = layer(batch_example)
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [4]:
# Per-row statistics over the last dimension; keepdim=True keeps a trailing
# size-1 dimension so the results broadcast against `out`.
# No `unbiased`/`correction` argument is passed, so torch's default
# (Bessel-corrected) estimator is used here, unlike the `unbiased=False`
# variance inside the LayerNorm class further down.
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)

print(f"Mean: \n {mean}\n\n Variance: \n {var}")

Mean: 
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)

 Variance: 
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [5]:
# Standardize each row of `out` with the statistics computed for it earlier.
out_norm = (out - mean) / torch.sqrt(var)

# The statistics of the normalized activations get their own names.
# Rebinding `mean`/`var` here would make this cell non-idempotent: a second
# run would normalize `out` with mean ~0 and variance ~1 instead of the
# statistics of `out` itself.
norm_mean = out_norm.mean(dim=-1, keepdim=True)
norm_var = out_norm.var(dim=-1, keepdim=True)

print(f"Mean: \n {norm_mean}\n\n Variance: \n {norm_var}")

Mean: 
 tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)

 Variance: 
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [6]:
# Turn off scientific notation so the near-zero means print as 0.0000
# instead of values like 9.9341e-09.
torch.set_printoptions(sci_mode=False)

# NOTE(review): this rebinds `mean`/`var`, which the normalization cell above
# reads as its inputs -- re-running that cell after this one would use the
# wrong statistics.
mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)

print(f"Mean: \n {mean}\n\n Variance: \n {var}")

Mean: 
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)

 Variance: 
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [None]:
## Initial implementation (kept commented out for reference): divides by torch.sqrt(var + eps)
# class LayerNorm(nn.Module):
#     def __init__(self,emb_dim):
#         super().__init__()
#         self.eps=1e-5
#         self.scale=nn.Parameter(torch.ones(emb_dim))
#         self.shift=nn.Parameter(torch.zeros(emb_dim))
    
#     def forward(self,x):
#         mean=x.mean(dim=-1,keepdim=True)
#         var=x.var(dim=-1,keepdim=True,unbiased=False)
#         norm_x=(x-mean)/torch.sqrt(var+self.eps)
#         return norm_x * self.scale+ self.shift

## Optimized variant: multiplies by torch.rsqrt(var + eps) instead of dividing by torch.sqrt(var + eps)

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine transform.

    Each slice along the last dimension is shifted to zero mean and scaled to
    approximately unit (population) variance, then multiplied element-wise by
    ``scale`` and offset by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the length of
            the ``scale`` and ``shift`` parameters.
        eps: Constant added to the variance before taking the reciprocal
            square root, so the result stays finite when a slice has zero
            variance. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        # Initialized to the identity transform: scale of ones, shift of zeros.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` along its last dimension and apply scale and shift.

        Args:
            x: Input tensor; its last dimension must broadcast against the
                ``emb_dim``-sized ``scale`` and ``shift`` parameters.

        Returns:
            A tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # Population variance (divide by N, not N - 1).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        # Multiply by the reciprocal square root rather than dividing by sqrt.
        norm_x = (x - mean) * torch.rsqrt(var + self.eps)
        return norm_x * self.scale + self.shift

In [8]:
# Layer norm over 5 features, matching the last dimension of `batch_example`.
ln = LayerNorm(emb_dim=5)

In [9]:
# Apply the layer norm module to the raw sample batch.
out_ln = ln(batch_example)

In [11]:
# Population variance of each normalized row (unbiased=False, the same
# estimator LayerNorm.forward uses).
out_ln.var(dim=-1, keepdim=True, unbiased=False)

tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)