# Layer Normalization

We want to apply layer normalization so that, in each layer of the neural network, the activation vectors are not sparse and have mean ~ 0 and sd ~ 1.

Suppose we have 2 words, each represented by a 3-dimensional vector:

X = [[0.2, 0.1, 0.3] , [0.5, 0.1, 0.1]]


If we calculate mu11 = 1/3 sum(0.2, 0.1, 0.3) = 0.2

and similarly mu21 = 0.233

and then,

we compute the standard deviation of each word in the same way: sd11 ≈ 0.082 and sd21 ≈ 0.189

μ = [mu11, mu21]
σ = [sd11, sd21]

Then layer normalization output will be γ * [X - μ] / σ + β

Here gamma (γ) and beta (β) are learnable parameters. Before they are applied, the normalized activations of every layer have mean ~ 0 and sd ~ 1. These values are more tractable and stable during training.



In [None]:
import torch
from torch import nn

# Toy batch: 1 sentence (B) of 2 words (S), each a 3-dimensional embedding (E).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()
# Swap the batch and sequence axes to get a (S, B, E) layout. permute moves
# whole axes; reshape would only reinterpret the flat buffer, which matches a
# transpose solely when B == 1 and mixes words across sentences otherwise.
inputs = inputs.permute(1, 0, 2)
inputs.size()

torch.Size([2, 1, 3])

In [None]:
# gamma and beta cover the last two axes of `inputs`, so take the trailing
# two entries of its shape.
parameter_shape = inputs.shape[-2:]
parameter_shape

torch.Size([1, 3])

In [None]:
# Learnable scale (gamma) and shift (beta), one entry per normalized element.
# Starting at ones and zeros makes the initial affine transform an identity.
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))
print(gamma.size(), beta.size(), sep="\n")

torch.Size([1, 3])
torch.Size([1, 3])


The 1 comes from the batch size and the 3 is the embedding dimension.

In [None]:
import torch
import torch.nn as nn

In [None]:
class LayerNormalization(nn.Module):
  """Layer normalization over the trailing dimensions of a tensor.

  The last ``len(parameters_shape)`` dimensions of the input are normalized
  to zero mean and unit variance, then scaled by the learnable ``gamma`` and
  shifted by the learnable ``beta``. Intermediate tensors are printed so the
  computation can be followed step by step.

  Args:
    parameters_shape: Shape of the trailing dimensions to normalize over;
      ``gamma`` and ``beta`` are created with this shape.
    eps: Small constant added to the variance before the square root so the
      division never hits a zero standard deviation.
  """

  def __init__(self, parameters_shape, eps = 1e-5):
    super().__init__()
    # Determines which trailing dimensions are normalized (their count) and
    # the shape of the learnable parameters.
    self.parameters_shape = parameters_shape
    self.eps = eps
    self.gamma = nn.Parameter(torch.ones(parameters_shape))  # scale, starts at 1
    self.beta = nn.Parameter(torch.zeros(parameters_shape))  # shift, starts at 0

  def forward(self, input):
    """Normalize ``input`` over its trailing dimensions and apply gamma/beta.

    Args:
      input: Tensor whose trailing dimensions broadcast against
        ``parameters_shape``.

    Returns:
      Tensor with the same shape as ``input``.
    """
    # Negative indices of the trailing dimensions to reduce over,
    # e.g. [-1, -2] for a two-dimensional parameters_shape.
    dims = [-(i + 1) for i in range(len(self.parameters_shape))]
    # Use the argument, not a same-named global, so the module works on any tensor.
    mean = input.mean(dim=dims, keepdim=True)
    print(f'Mean \n ({mean.size()}) : \n {mean}')
    # Biased (population) variance: mean of squared deviations.
    var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
    std = (var + self.eps).sqrt()
    print(f'Standard Deviation \n ({std.size()}) : \n {std}')
    y = (input - mean) / std  # normalization
    # size must be called; printing the bound method shows "<built-in method ...>".
    print(f"y \n ({y.size()}) = \n {y}")
    out = self.gamma * y + self.beta
    print(f"out \n ({out.size()}) =. \n {out}")
    return out

In [None]:
# Random demo batch laid out as (sentence_length, batch_size, embedding_dim).
batch_size, sentence_length, embedding_dim = 3, 5, 8
inputs = torch.randn((sentence_length, batch_size, embedding_dim))

In [None]:
# Normalize jointly over the last two axes of `inputs`; the trailing bare
# call lets the notebook display the returned tensor.
layer_norm = LayerNormalization(inputs.shape[-2:])
layer_norm(inputs)

Mean 
 (torch.Size([5, 1, 1])) : 
 tensor([[[-0.1311]],

        [[ 0.0214]],

        [[ 0.1828]],

        [[ 0.3760]],

        [[-0.2984]]])
Standard Deviation 
 (torch.Size([5, 1, 1])) : 
 tensor([[[1.1732]],

        [[0.9153]],

        [[0.5976]],

        [[0.7279]],

        [[1.0790]]])
y 
 (<built-in method size of Tensor object at 0x7f831db370b0>) = 
 tensor([[[ 1.5343, -1.7197,  0.2753, -1.0941,  1.0343,  0.0885,  0.9377,
          -0.2599],
         [-0.3968, -0.9933,  1.1634, -1.6930, -1.9114,  0.9607, -1.4323,
           0.5926],
         [ 1.0643, -0.6483,  0.5650,  0.1134,  0.1231,  0.3545,  0.9177,
           0.4241]],

        [[-0.2804,  1.5267, -2.7503,  0.1455, -0.6535, -0.2610, -0.1430,
          -0.5587],
         [-0.3279,  0.4266,  0.7142,  0.4357, -0.5728, -1.1849,  0.3435,
           0.4275],
         [ 0.1490, -0.9237, -1.1703,  0.8467,  1.5381,  0.9582, -0.5764,
           1.8913]],

        [[ 0.4463, -0.1295, -0.6628, -2.6863,  0.5283, -1.6540, -0.0169

tensor([[[ 1.5343, -1.7197,  0.2753, -1.0941,  1.0343,  0.0885,  0.9377,
          -0.2599],
         [-0.3968, -0.9933,  1.1634, -1.6930, -1.9114,  0.9607, -1.4323,
           0.5926],
         [ 1.0643, -0.6483,  0.5650,  0.1134,  0.1231,  0.3545,  0.9177,
           0.4241]],

        [[-0.2804,  1.5267, -2.7503,  0.1455, -0.6535, -0.2610, -0.1430,
          -0.5587],
         [-0.3279,  0.4266,  0.7142,  0.4357, -0.5728, -1.1849,  0.3435,
           0.4275],
         [ 0.1490, -0.9237, -1.1703,  0.8467,  1.5381,  0.9582, -0.5764,
           1.8913]],

        [[ 0.4463, -0.1295, -0.6628, -2.6863,  0.5283, -1.6540, -0.0169,
           0.4133],
         [ 1.3373, -0.6678,  0.9884,  0.4277,  0.4927,  1.6403,  0.1338,
          -0.4098],
         [-0.8659, -1.5626,  0.2583,  1.1116,  0.7670,  0.9787, -0.5128,
          -0.3551]],

        [[-1.3153,  0.4873, -0.1138, -0.7095, -1.6862,  0.1491,  0.6494,
           0.1034],
         [ 1.6532,  0.5182,  0.3398,  0.7805,  0.8452,  1.1121, 