## 0. Load inputs from last step

In [1]:
import torch

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
# Load the embedding tensor written by the previous step.
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered .pt file cannot run arbitrary code while being loaded.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# later cells in this notebook refer to it.
input = torch.load("intermediate_values/final_embedding.pt", weights_only=True)

  input = torch.load("intermediate_values/final_embedding.pt")


In [28]:
# Print only the first sequence of the batch to keep the output short,
# then report the shape of the whole tensor.
print(input[0, :, :])
input.shape  # (batch_size, in_seq_len, emd_dim)

tensor([[ 2.3231, -0.3746,  2.6221,  2.2130, -1.3411,  0.1152, -0.6356, -1.3310,
         -0.4394,  2.4190],
        [ 0.5539, -5.1864, -0.5007,  1.0695, -0.8989, -1.4327, -0.4214,  1.9148,
          1.2416,  0.7608],
        [-2.2256, -0.2482,  0.4517, -1.5689,  0.4506, -0.1861,  1.5037, -1.0158,
         -0.2976, -2.1102],
        [-0.0212, -0.4272,  2.1284, -0.9909, -0.2254, -0.0528,  0.5645,  0.5093,
          0.2662,  2.1440],
        [ 0.3954, -0.0420, -0.1249, -2.8000,  0.8817,  2.2713,  1.8423, -0.6637,
         -2.0547,  1.2039],
        [-1.6732, -0.2235, -0.5283,  1.7224, -1.0452,  0.9322, -1.4510,  0.5735,
         -0.7763,  0.3509],
        [-0.8276,  1.5135,  0.4042,  0.4883, -2.3238, -0.6278,  1.0873,  2.3286,
         -0.1649,  0.0505]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

## 1. Layer Norm 1
- Normalise features within a layer (not across batch)
- Mean and variance are calculated for all activations in a layer
- The activations are normalised to zero mean and unit variance (mean=0; variance=1), then scaled and shifted by learnable parameters
- Mitigates the problem of internal covariate shift

#### Terminologies
1. Activations = Outputs of the neurons
2. Internal covariate shift = During training, each layer takes its inputs from the previous layers, and the distribution of those inputs keeps changing because the previous layers are still learning. This leads to slow convergence

#### Formula
### output = [scale * (input - mean)/ sqrt(variance + epsilon)] + shift
* mean = mean across activations
* variance = (Std_dev)^2 = Variance across activations
* epsilon = small constant to avoid division by zero
* scale, shift = learnable parameters

In [5]:
import torch.nn as nn

In [15]:
# Average over the last axis (dim=-1), i.e. across the embedding values of
# each token. keepdim=True keeps that axis with size 1 so the result
# broadcasts against `input` later on.
mean = torch.mean(input, dim=-1, keepdim=True)
print(mean)
mean.shape  # (batch_size, in_seq_len, 1)

tensor([[[ 0.5571],
         [-0.2900],
         [-0.5246],
         [ 0.3895],
         [ 0.0909],
         [-0.2119],
         [ 0.1928]],

        [[ 0.5571],
         [ 0.3854],
         [-1.1656],
         [ 0.8640],
         [ 0.3570],
         [ 0.6311],
         [ 0.7219]],

        [[-0.0954],
         [ 0.3295],
         [-0.7187],
         [ 0.4752],
         [ 0.3832],
         [ 0.2767],
         [ 0.5931]]], grad_fn=<MeanBackward1>)


torch.Size([3, 7, 1])

In [16]:
# Variance over the embedding axis. unbiased=False divides by N (population
# variance) instead of N-1; this is the estimator layer normalisation uses,
# and it also stays finite when the axis holds a single element.
var = torch.var(input, dim=-1, unbiased=False, keepdim=True)
print(var)
var.shape  # (batch_size, in_seq_len, 1)

tensor([[[2.4238],
         [3.6585],
         [1.3000],
         [0.9477],
         [2.3373],
         [1.0768],
         [1.5434]],

        [[2.4238],
         [1.7528],
         [2.7279],
         [1.8420],
         [1.6635],
         [2.9621],
         [3.1118]],

        [[2.2060],
         [1.2612],
         [1.4636],
         [1.6757],
         [0.5687],
         [1.3383],
         [1.9449]]], grad_fn=<VarBackward0>)


torch.Size([3, 7, 1])

In [26]:
# Small constant added to the variance so the denominator is never zero.
epsilon = 1e-5
# Centre each token's embedding and divide by its standard deviation;
# `mean` and `var` have a trailing size-1 axis and broadcast over emd_dim.
norm_input = (input - mean) / (var + epsilon).sqrt()
print(norm_input[0, :, :])
norm_input.shape  # (batch_size, in_seq_len, emd_dim)

tensor([[ 1.1343, -0.5984,  1.3264,  1.0636, -1.2192, -0.2838, -0.7661, -1.2127,
         -0.6400,  1.1960],
        [ 0.4412, -2.5600, -0.1102,  0.7108, -0.3184, -0.5975, -0.0687,  1.1527,
          0.8007,  0.5493],
        [-1.4918,  0.2425,  0.8563, -0.9159,  0.8554,  0.2970,  1.7789, -0.4308,
          0.1991, -1.3906],
        [-0.4218, -0.8389,  1.7862, -1.4180, -0.6316, -0.4544,  0.1798,  0.1230,
         -0.1266,  1.8022],
        [ 0.1991, -0.0869, -0.1412, -1.8910,  0.5172,  1.4262,  1.1456, -0.4936,
         -1.4034,  0.7280],
        [-1.4083, -0.0112, -0.3050,  1.8640, -0.8031,  1.1025, -1.1941,  0.7568,
         -0.5440,  0.5423],
        [-0.8214,  1.0631,  0.1701,  0.2378, -2.0257, -0.6605,  0.7200,  1.7191,
         -0.2879, -0.1146]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

In [21]:
# Embedding width, read from the last axis of the loaded tensor instead of
# being hard-coded, so `scale` and `shift` always match the input.
emd_dim = input.shape[-1]

In [23]:
# Learnable per-feature gain, initialised to ones so it starts as a no-op.
scale = nn.Parameter(data=torch.ones(emd_dim), requires_grad=True)
print(scale)
scale.shape  # (emd_dim,)

Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True)


torch.Size([10])

In [22]:
# Learnable per-feature offset, initialised to zeros so it starts as a no-op.
shift = nn.Parameter(data=torch.zeros(emd_dim), requires_grad=True)
print(shift)
shift.shape  # (emd_dim,)

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)


torch.Size([10])

In [27]:
# Apply the learnable gain and offset; both have shape (emd_dim,) and
# broadcast across the batch and sequence axes.
final_norm_input = norm_input * scale + shift
print(final_norm_input[0, :, :])
final_norm_input.shape  # (batch_size, in_seq_len, emd_dim)

tensor([[ 1.1343, -0.5984,  1.3264,  1.0636, -1.2192, -0.2838, -0.7661, -1.2127,
         -0.6400,  1.1960],
        [ 0.4412, -2.5600, -0.1102,  0.7108, -0.3184, -0.5975, -0.0687,  1.1527,
          0.8007,  0.5493],
        [-1.4918,  0.2425,  0.8563, -0.9159,  0.8554,  0.2970,  1.7789, -0.4308,
          0.1991, -1.3906],
        [-0.4218, -0.8389,  1.7862, -1.4180, -0.6316, -0.4544,  0.1798,  0.1230,
         -0.1266,  1.8022],
        [ 0.1991, -0.0869, -0.1412, -1.8910,  0.5172,  1.4262,  1.1456, -0.4936,
         -1.4034,  0.7280],
        [-1.4083, -0.0112, -0.3050,  1.8640, -0.8031,  1.1025, -1.1941,  0.7568,
         -0.5440,  0.5423],
        [-0.8214,  1.0631,  0.1701,  0.2378, -2.0257, -0.6605,  0.7200,  1.7191,
         -0.2879, -0.1146]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

## 2. Save to carry forward

In [29]:
# Write the layer-norm result to disk so the next step can load it.
torch.save(obj=final_norm_input, f="intermediate_values/layer_norm_1_output.pt")