## 0. Load inputs from last step

In [1]:
import torch

In [2]:
# Load the embedding tensor saved by the previous step of the walkthrough.
# NOTE(review): the name `input` shadows Python's builtin input() for the rest of the notebook.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects; that is only
# acceptable for a trusted file (presumably written locally by the previous step — confirm).
input = torch.load("intermediate_values/final_embedding.pt", weights_only=False)

In [3]:
# Preview only the first element of the batch (all tokens, all embedding features).
print(input[0,:,:])
# Last expression of the cell, so the notebook displays the full tensor shape.
input.shape  # batch_size * in_seq_len * emd_dim

tensor([[-1.5668, -0.1368, -2.1955,  0.1668, -1.2070, -0.3723, -0.8302, -2.8953,
          1.8269, -0.3231],
        [ 1.3479,  0.5237, -0.7732, -0.4390, -1.2601, -0.9128,  1.7754,  1.2303,
         -4.0808,  0.9466],
        [ 1.5389, -1.7982,  2.1728,  0.5161,  1.5061,  0.9178, -0.7344, -0.9022,
         -0.5057,  1.5436],
        [-0.2512, -0.5905, -1.9205, -1.4082,  2.2990,  0.5391, -0.9468, -1.6840,
          0.3417,  1.7622],
        [-0.6245,  0.1875,  2.7057,  0.8237, -0.5156, -2.3120, -0.3651,  1.9557,
          0.2960, -0.3758],
        [ 0.3872,  2.9064,  2.4349, -0.0070, -1.0729, -2.1992,  2.6361,  0.7379,
          2.5921, -0.6817],
        [ 1.0945, -0.7756,  1.9551, -1.8756, -0.6402,  0.8235,  1.5774,  1.5828,
         -0.5951, -1.5694]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

## 1. Layer Norm 1
- Normalise features within a layer (not across batch)
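- Mean and variance are calculated over all activations of a layer (here: over the embedding dimension of each token)
- The activations are then shifted and scaled to have zero mean and unit variance (mean=0; variance=1); this standardises the values but does not by itself make them normally distributed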
- Handles problems of Internal covariate shift

#### Terminologies
1. Activations = Outputs of the neurons
2. Internal covariate shift = During training, each layer takes its inputs from the previous layers, and the distribution of those inputs keeps changing as the earlier layers learn. This leads to slow convergence

#### Formula
### output = [scale * (input - mean)/ sqrt(variance + epsilon)] + shift
* mean = mean across activations
* variance = (Std_dev)^2 = Variance across activations
* epsilon = small constant to avoid division by zero
* scale, shift = learnable parameters

In [4]:
import torch.nn as nn

In [5]:
# Per-token mean over the last axis (the embedding features).
# keepdim=True keeps a trailing size-1 axis so the result broadcasts against `input`.
mean = torch.mean(input, dim=-1, keepdim=True)
print(mean)
mean.shape  # batch_size * in_seq_len * 1

tensor([[[-0.7533],
         [-0.1642],
         [ 0.4255],
         [-0.1859],
         [ 0.1776],
         [ 0.7734],
         [ 0.1578]],

        [[-0.3153],
         [ 0.2092],
         [ 0.5860],
         [-0.3911],
         [ 0.9981],
         [ 0.5299],
         [ 0.0224]],

        [[-0.6828],
         [ 1.1236],
         [ 1.0298],
         [ 0.0039],
         [ 0.1424],
         [ 0.1010],
         [ 1.1030]]], grad_fn=<MeanBackward1>)


torch.Size([3, 7, 1])

In [6]:
# Per-token variance over the last axis (the embedding features).
# unbiased=False divides by N rather than N-1 (no Bessel's correction); this is the
# biased estimator that torch.nn.LayerNorm is documented to use.
var = torch.var(input, dim=-1, keepdim=True, unbiased=False)
print(var)
var.shape  # batch_size * in_seq_len * 1

tensor([[[1.5756],
         [2.7248],
         [1.5881],
         [1.8268],
         [1.7907],
         [2.9210],
         [1.7790]],

        [[0.6766],
         [2.1250],
         [2.2524],
         [2.0671],
         [0.5906],
         [1.5930],
         [4.0861]],

        [[1.5324],
         [2.5206],
         [2.1610],
         [1.6786],
         [1.5481],
         [3.2254],
         [2.8710]]], grad_fn=<VarBackward0>)


torch.Size([3, 7, 1])

In [7]:
# Small constant added to the variance so the division below never divides by zero.
epsilon = 1e-5
# Standardise each token: subtract its mean, then divide by its standard deviation.
# `mean` and `var` carry a trailing size-1 axis, so they broadcast over the features.
norm_input = (input - mean) / (var + epsilon).sqrt()
print(norm_input[0,:,:])
norm_input.shape  # batch_size * in_seq_len * emd_dim

tensor([[-0.6481,  0.4912, -1.1489,  0.7330, -0.3614,  0.3036, -0.0613, -1.7064,
          2.0556,  0.3427],
        [ 0.9161,  0.4168, -0.3689, -0.1665, -0.6639, -0.4535,  1.1750,  0.8448,
         -2.3727,  0.6729],
        [ 0.8835, -1.7645,  1.3866,  0.0719,  0.8575,  0.3907, -0.9204, -1.0536,
         -0.7389,  0.8872],
        [-0.0483, -0.2993, -1.2834, -0.9043,  1.8385,  0.5364, -0.5630, -1.1084,
          0.3904,  1.4414],
        [-0.5994,  0.0074,  1.8893,  0.4828, -0.5180, -1.8604, -0.4055,  1.3288,
          0.0885, -0.4135],
        [-0.2260,  1.2481,  0.9722, -0.4566, -1.0803, -1.7393,  1.0899, -0.0208,
          1.0641, -0.8514],
        [ 0.7024, -0.6998,  1.3476, -1.5245, -0.5982,  0.4991,  1.0644,  1.0684,
         -0.5645, -1.2949]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

In [8]:
# Embedding width, read from the last axis of the loaded tensor instead of being
# hard-coded, so `scale` and `shift` always match the data (10 for this input).
emd_dim = input.shape[-1]

In [9]:
# Learnable per-feature gain; starting at all ones means it initially leaves
# the normalised values unchanged.
scale = nn.Parameter(data=torch.ones(emd_dim), requires_grad=True)
print(scale)
scale.shape  # emd_dim

Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True)


torch.Size([10])

In [10]:
# Learnable per-feature offset; starting at all zeros means it initially adds nothing.
shift = nn.Parameter(data=torch.zeros(emd_dim), requires_grad=True)
print(shift)
shift.shape  # emd_dim

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)


torch.Size([10])

In [11]:
# Apply the learnable affine step of layer norm: multiply by `scale`, then add `shift`.
# Both parameters have shape (emd_dim,) and broadcast over batch and sequence axes.
final_norm_input = torch.add(torch.mul(scale, norm_input), shift)
print(final_norm_input[0,:,:])
final_norm_input.shape  # batch_size * in_seq_len * emd_dim

tensor([[-0.6481,  0.4912, -1.1489,  0.7330, -0.3614,  0.3036, -0.0613, -1.7064,
          2.0556,  0.3427],
        [ 0.9161,  0.4168, -0.3689, -0.1665, -0.6639, -0.4535,  1.1750,  0.8448,
         -2.3727,  0.6729],
        [ 0.8835, -1.7645,  1.3866,  0.0719,  0.8575,  0.3907, -0.9204, -1.0536,
         -0.7389,  0.8872],
        [-0.0483, -0.2993, -1.2834, -0.9043,  1.8385,  0.5364, -0.5630, -1.1084,
          0.3904,  1.4414],
        [-0.5994,  0.0074,  1.8893,  0.4828, -0.5180, -1.8604, -0.4055,  1.3288,
          0.0885, -0.4135],
        [-0.2260,  1.2481,  0.9722, -0.4566, -1.0803, -1.7393,  1.0899, -0.0208,
          1.0641, -0.8514],
        [ 0.7024, -0.6998,  1.3476, -1.5245, -0.5982,  0.4991,  1.0644,  1.0684,
         -0.5645, -1.2949]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

## 2. Save to carry forward

In [12]:
# Persist the layer-norm result so it can be carried forward to a later step.
torch.save(obj=final_norm_input, f="intermediate_values/layer_norm_1_output.pt")