<a href="https://colab.research.google.com/github/sobit-nep/Transformer-Neural-Network-from-scratch/blob/main/Layer_Normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn

In [2]:
# Toy batch laid out as (batch, sequence, embedding): B=1, S=2, E=3.
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()
# Swap the batch and sequence axes -> (sequence, batch, embedding).
# transpose() moves the axes themselves; reshape(S, B, E) would merely re-chunk
# the flat data and pair values with the wrong (s, b) index whenever B > 1.
inputs = inputs.transpose(0, 1)
inputs.size()

torch.Size([2, 1, 3])

In [3]:
# The learnable scale (gamma) and shift (beta) take the shape of the last two
# axes of `inputs`; they start as the identity transform (ones / zeros).
parameter_shape = inputs.shape[-2:]
gamma = nn.Parameter(torch.ones(*parameter_shape))
beta = nn.Parameter(torch.zeros(*parameter_shape))

In [4]:
# gamma and beta were both built from parameter_shape, so their sizes match.
gamma.size(), beta.size()

(torch.Size([1, 3]), torch.Size([1, 3]))

In [5]:
# Negative indices of the trailing axes to reduce over, counted from the end:
# one index per entry of parameter_shape (-1, -2, ...).
dims = list(range(-1, -len(parameter_shape) - 1, -1))

In [6]:
# Negative axis indices, one per entry of parameter_shape.
dims

[-1, -2]

In [7]:
# Average over the axes in `dims`; keepdim leaves them as size-1 axes so the
# result still broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean.shape

torch.Size([2, 1, 1])

In [8]:
# One mean per leading index; the reduced axes are kept with size 1.
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [9]:
# Small constant added to the variance so the square root / later division
# never sees an exact zero.
epsilon = 1e-5
# Mean squared deviation from the mean over the same axes as `mean`.
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [10]:
# Standardize: subtract the mean, then divide by the (epsilon-stabilized) std.
y = torch.div(inputs - mean, std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [11]:
# Apply the learnable affine transform: scale by gamma, then shift by beta.
out = torch.add(gamma * y, beta)

In [12]:
# gamma is all ones and beta all zeros here, so `out` has the same values as `y`.
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

## Class for reusability

In [13]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of a tensor.

    The last ``len(parameters_shape)`` axes of the input are standardized
    (zero mean, unit population variance) and then scaled by ``gamma`` and
    shifted by ``beta``, both learnable and of shape ``parameters_shape``.

    Args:
        parameters_shape: Shape of the trailing axes to normalize over, e.g.
            ``x.size()[-1:]``. A bare ``int`` is treated as a 1-tuple.
        eps: Constant added to the variance before the square root so the
            division never sees an exact zero.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        # Subclassing nn.Module registers gamma/beta, so .parameters(),
        # .to(device) and optimizers can see them.
        super().__init__()
        if isinstance(parameters_shape, int):
            parameters_shape = (parameters_shape,)
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and print every intermediate tensor.

        Args:
            input: Tensor whose trailing axes match ``parameters_shape``.

        Returns:
            Tensor of the same shape as ``input``: ``gamma * y + beta`` where
            ``y`` is the standardized input.
        """
        # Negative indices of the trailing axes to reduce over: -1, -2, ...
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the argument, not a notebook-level global, so the layer works on
        # whatever tensor the caller passes in.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

In [14]:
# Seed the generator so the random batch (and every tensor printed from it)
# is the same on each Restart & Run All.
torch.manual_seed(0)

batch_size = 3
sentence_length = 5
embedding_dim = 8
# Layout is (sentence_length, batch_size, embedding_dim).
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 1.6582, -0.8341, -1.3846,  1.5867,  0.2581, -1.9033, -1.8475,
          -0.2854],
         [ 0.6123, -0.9360, -1.4004,  0.2883,  0.9917,  0.0688,  0.0280,
          -0.5269],
         [ 0.3260,  0.5820,  0.2460,  0.2549,  0.5195, -1.1284, -0.1948,
          -0.7692]],

        [[ 0.3893, -1.0838, -0.8419,  2.5523, -1.7229,  0.4919, -1.1041,
           0.5163],
         [-0.9059,  0.0350, -0.4745,  0.8367,  1.3299,  0.1969,  1.1188,
          -1.8110],
         [-1.1036,  0.6325,  0.0871,  0.2015,  0.4920,  0.4249, -0.0884,
           0.7800]],

        [[-0.7446, -1.2254, -0.3047,  0.4283,  0.0642,  0.3386, -1.1208,
           0.0894],
         [-1.2498,  1.2458,  0.0080, -0.0237,  0.0533, -0.1207, -0.3037,
           0.8869],
         [-0.8955, -0.4221, -1.6601, -1.0594,  1.0127, -0.4657,  1.1418,
           0.0758]],

        [[-1.7145, -0.1119, -0.3125,  1.1601,  0.6609, -0.7731,  0.1862,
          -0.2007],
         [ 0.7766, -0.2800, 

In [15]:
# Normalize over the last axis only, so gamma/beta have one entry per
# element of that axis.
layer_norm = LayerNormalization(parameters_shape=inputs.shape[-1:])

In [16]:
# Run the layer on the random batch; forward() prints the mean, standard
# deviation, y and out tensors as it goes. Note this rebinds `out` from the
# step-by-step walkthrough above.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[-0.3440],
         [-0.1093],
         [-0.0205]],

        [[-0.1004],
         [ 0.0407],
         [ 0.1783]],

        [[-0.3094],
         [ 0.0620],
         [-0.2841]],

        [[-0.1382],
         [-0.3631],
         [-0.3359]],

        [[-0.1228],
         [-0.3199],
         [-0.4023]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[1.3284],
         [0.7472],
         [0.5852]],

        [[1.2828],
         [1.0046],
         [0.5535]],

        [[0.6072],
         [0.7054],
         [0.9196]],

        [[0.8179],
         [0.6352],
         [1.0986]],

        [[0.5861],
         [0.9538],
         [1.0048]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 1.5072, -0.3689, -0.7833,  1.4534,  0.4532, -1.1738, -1.1318,
           0.0441],
         [ 0.9657, -1.1064, -1.7280,  0.5321,  1.4735,  0.2384,  0.1837,
          -0.5589],
         [ 0.5921,  1.0295,  0.4554,  0.4706,  0.9227, -1.8931, -0.2978,
          -1.2793]],



In [None]:
# Sanity check on the first sequence position (3 x 8 = 24 values).
# The mean is ~0. The std prints as ~1.0215 rather than 1 because Tensor.std()
# applies Bessel's correction by default, while the layer divides by the
# population std: sqrt(24 / 23) ~= 1.0215.
out[0].mean(), out[0].std()

(tensor(9.9341e-09, grad_fn=<MeanBackward0>),
 tensor(1.0215, grad_fn=<StdBackward0>))