In [1]:
import torch
import torch.nn as nn

Defining LayerNorm architecture

In [2]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Each slice along the last dimension is shifted to zero mean and scaled to
    unit variance (using the biased, population variance), then passed through
    a learnable element-wise affine transform ``scale * x_hat + shift``.

    Args:
        emb_dim: Length of the learnable ``scale`` and ``shift`` vectors;
            expected to equal the size of the input's last dimension so the
            affine parameters broadcast against it.
        eps: Small constant added to the variance before the square root so
            that zero-variance inputs do not cause a division by zero.
            Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialised to the identity transform: scale = 1, shift = 0.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply scale/shift.

        Args:
            x: Input tensor; its last dimension is expected to have size
                ``emb_dim``.

        Returns:
            The normalized tensor after the affine transform (same shape as
            ``x`` when the last dimension matches ``emb_dim``).
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False -> divide by N rather than N - 1 (population variance).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [3]:
# Seed the global RNG so the random example batch is reproducible.
torch.manual_seed(123)

# Example input: 2 rows with 5 features each, drawn from a standard normal.
batch_example = torch.randn((2, 5))
print(batch_example)

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])


In [4]:
# Build the custom layer sized to the batch's feature (last) dimension, which
# is 5 for the example batch, and normalize the batch with it.
ln = LayerNorm(emb_dim=batch_example.shape[-1])
out_ln = ln(batch_example)
print(out_ln)

tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]], grad_fn=<AddBackward0>)


In [5]:
# Sanity check on the custom layer: with the initial scale=1 / shift=0, each
# row of the output should have mean ~0 and (biased) variance ~1.
mean = torch.mean(out_ln, dim=-1, keepdim=True)
var = torch.var(out_ln, dim=-1, keepdim=True, unbiased=False)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[-2.9802e-08],
        [ 0.0000e+00]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


Using PyTorch's built-in LayerNorm

In [6]:
# Reference implementation: PyTorch's built-in layer normalization over a
# last dimension of size 5 (the feature size of batch_example).
ln_torch = nn.LayerNorm(5)
out_ln_torch = ln_torch(batch_example)
print(out_ln_torch)

# Make the comparison explicit rather than relying on eyeballing the printed
# tensors: the custom LayerNorm output must agree with the built-in one
# (within assert_close's default floating-point tolerances).
torch.testing.assert_close(out_ln.detach(), out_ln_torch.detach())

tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]],
       grad_fn=<NativeLayerNormBackward0>)


In [7]:
# Same sanity check for the built-in layer: per-row mean and (biased) variance
# of its output.
mean_torch = torch.mean(out_ln_torch, dim=-1, keepdim=True)
var_torch = torch.var(out_ln_torch, dim=-1, keepdim=True, unbiased=False)
print("Mean_torch:\n", mean_torch)
print("Variance_torch:\n", var_torch)

Mean_torch:
 tensor([[-1.7881e-08],
        [ 0.0000e+00]], grad_fn=<MeanBackward1>)
Variance_torch:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
