# Batch Normalization

![LN_BN](images/BN_LN.jpeg)
![BN](images/BatchNorm.png)
![BN_Eqn](images/BatchNorm_Eqn.jpeg)
![Limitation](images/Limitation_BN.jpeg)


# Layer Normalization

![LN](images/LN.png)
![LN_Eqn](images/LN_Eqn.jpeg)
![BN_vs_LN](images/BN_vs_LN.jpeg)
![Layer Norm](images/LayerNorm.jpeg)

# Code for Layer Normalization

In [3]:
import torch
from torch import nn

In [21]:
# B = batch size, S = sequence length, E = embedding dimension.
# The literal below is laid out as (B, S, E) = (1, 3, 4).
inputs = torch.tensor(
    [[[1, 3, 5, 7], [3, 4, 6, 2], [8, 3, 2, 1]]],
    dtype=torch.float32,
)
B, S, E = inputs.size()
print(f"{B=}, {S=}, {E=}")
# Swap the batch and sequence axes to get an (S, B, E) layout.
# transpose() moves whole axes; reshape() would only reinterpret the flat
# memory order and gives the right answer solely when B == 1.
inputs = inputs.transpose(0, 1)
print(f"inputs: {inputs}")
inputs.size()

B=1, S=3, E=4
inputs: tensor([[[1., 3., 5., 7.]],

        [[3., 4., 6., 2.]],

        [[8., 3., 2., 1.]]])


torch.Size([3, 1, 4])

In [22]:
# The learnable scale (gamma) and shift (beta) cover the last two axes of
# `inputs`, i.e. the batch and embedding axes of the (S, B, E) layout.
parameter_shape = inputs.shape[-2:]
print(f"parameter_shape: {parameter_shape}")  # parameter_shape: torch.Size([1, 4])
# gamma starts at one and beta at zero, so the affine step is initially an identity.
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))

parameter_shape: torch.Size([1, 4])


In [23]:
gamma.size(), beta.size()

(torch.Size([1, 4]), torch.Size([1, 4]))

tensor([[7.],
        [2.],
        [1.]])

In [14]:
# One negative axis index per parameter dimension, counted from the last axis: [-1, -2, ...].
dims = [-axis for axis in range(1, len(parameter_shape) + 1)]

In [8]:
dims

[-1, -2]

In [25]:
# Average over every axis listed in `dims` (the trailing parameter axes);
# keepdim=True keeps those axes as size 1 so the result broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean.shape

torch.Size([3, 1, 1])

In [26]:
mean

tensor([[[4.0000]],

        [[3.7500]],

        [[3.5000]]])

In [27]:
# Small constant added under the square root so the later division never hits zero.
epsilon = 1e-5
# Biased variance: the plain mean of squared deviations over the axes in `dims`.
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[2.2361]],

        [[1.4790]],

        [[2.6926]]])

In [28]:
# Normalize: subtract the mean and divide by the standard deviation (both broadcast).
y = torch.div(inputs - mean, std)
y

tensor([[[-1.3416, -0.4472,  0.4472,  1.3416]],

        [[-0.5071,  0.1690,  1.5213, -1.1832]],

        [[ 1.6713, -0.1857, -0.5571, -0.9285]]])

In [29]:
# Affine step: scale the normalized values by gamma, then shift by beta.
out = y * gamma + beta

In [30]:
out

tensor([[[-1.3416, -0.4472,  0.4472,  1.3416]],

        [[-0.5071,  0.1690,  1.5213, -1.1832]],

        [[ 1.6713, -0.1857, -0.5571, -0.9285]]], grad_fn=<AddBackward0>)

## Class

In [35]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of a tensor.

    The input is normalized to zero mean and unit (biased) variance across
    its last ``len(parameters_shape)`` dimensions, then scaled by ``gamma``
    and shifted by ``beta``.

    Args:
        parameters_shape: Shape of the trailing dimensions to normalize over
            (for example ``inputs.size()[-1:]``). A single int is treated as
            a one-dimensional shape.
        eps: Constant added to the variance before the square root so the
            division stays finite.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        # nn.Module must be initialized before any nn.Parameter is assigned,
        # otherwise gamma/beta cannot be registered as module parameters.
        super().__init__()
        if isinstance(parameters_shape, int):
            parameters_shape = (parameters_shape,)
        self.parameters_shape = parameters_shape
        self.eps = eps
        # gamma = 1 and beta = 0 make the affine step an identity at start.
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and print each intermediate tensor.

        Args:
            input: Tensor whose trailing dimensions match ``parameters_shape``.

        Returns:
            Tensor with the same shape as ``input``.
        """
        # Negative indices of the trailing axes to reduce over: [-1, -2, ...].
        dims = [-(axis + 1) for axis in range(len(self.parameters_shape))]
        # Use the argument itself, not a global tensor of a similar name.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        # Biased variance: the plain mean of squared deviations.
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

In [32]:
# Demo sizes for a random tensor laid out as (sentence_length, batch_size, embedding_dim).
batch_size, sentence_length, embedding_dim = 5, 3, 4
demo_shape = (sentence_length, batch_size, embedding_dim)
inputs = torch.randn(demo_shape)

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([3, 5, 4])) = 
 tensor([[[-0.6704,  1.7031,  1.3378,  0.5833],
         [ 0.1546,  0.2288, -0.3751,  0.2744],
         [-0.0678,  1.2969, -1.3091, -0.4520],
         [ 0.7685, -0.6087, -0.0037, -0.1917],
         [-0.9480,  0.7051,  0.9688,  0.0346]],

        [[ 0.2190,  0.6910, -0.5335, -1.0923],
         [-1.4141,  0.4817, -0.4755, -0.7524],
         [-0.8872, -0.9566, -1.0666, -0.7134],
         [-1.1805, -0.4164,  0.3994, -0.4730],
         [ 0.7336,  1.0893,  0.9216, -1.6269]],

        [[-0.3296,  0.8377, -0.9043, -0.5067],
         [-0.3818,  0.4713,  0.8439,  0.4572],
         [ 0.6249, -0.2641,  0.1295, -0.8046],
         [-0.5721,  0.5586, -1.5924, -0.3381],
         [ 1.2902,  1.5171, -1.0928,  0.0590]]])


In [33]:
inputs.size()[-1:]

torch.Size([4])

In [36]:
# Normalize over the last axis only: the parameter shape is the final dimension of `inputs`.
layer_norm = LayerNormalization(parameters_shape=inputs.shape[-1:])

In [37]:
# Run the forward pass; forward() also prints the mean, standard deviation,
# normalized tensor and final output along the way.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([3, 5, 1])): 
 tensor([[[ 0.7384],
         [ 0.0707],
         [-0.1330],
         [-0.0089],
         [ 0.1901]],

        [[-0.1790],
         [-0.5401],
         [-0.9059],
         [-0.4176],
         [ 0.2794]],

        [[-0.2257],
         [ 0.3476],
         [-0.0786],
         [-0.4860],
         [ 0.4434]]])
Standard Deviation 
 (torch.Size([3, 5, 1])): 
 tensor([[[0.9081],
         [0.2609],
         [0.9399],
         [0.4994],
         [0.7401]],

        [[0.6847],
         [0.6814],
         [0.1283],
         [0.5596],
         [1.1078]],

        [[0.6483],
         [0.4488],
         [0.5243],
         [0.7656],
         [1.0461]]])
y 
 (torch.Size([3, 5, 4])) = 
 tensor([[[-1.5514,  1.0622,  0.6600, -0.1709],
         [ 0.3217,  0.6059, -1.7085,  0.7809],
         [ 0.0694,  1.5213, -1.2513, -0.3394],
         [ 1.5567, -1.2011,  0.0104, -0.3660],
         [-1.5378,  0.6958,  1.0521, -0.2101]],

        [[ 0.5812,  1.2706, -0.5179, -1.3340],
     

In [38]:
out[0].mean(), out[0].std()

(tensor(7.4506e-09, grad_fn=<MeanBackward0>),
 tensor(1.0260, grad_fn=<StdBackward0>))