In [2]:
import torch
import torch.nn as nn
import math

In [13]:
class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Every vector along the last axis is centred on its own mean, divided by
    its own standard deviation (plus ``eps``), then multiplied element-wise by
    ``gamma`` and offset by ``beta``.

    Note:
        The standard deviation comes from ``Tensor.std`` with its default
        settings, i.e. the sample estimator with Bessel's correction, and
        ``eps`` is added to the standard deviation itself. ``torch.nn.LayerNorm``
        instead uses the biased variance with ``eps`` inside the square root,
        so the two produce close but not identical values.

    Args:
        features: Length of the last dimension of the inputs; sets the size
            of ``gamma`` and ``beta``.
        eps: Small constant added to the standard deviation to avoid
            division by zero.
        verbose: When ``True`` (the default, which keeps the original
            behaviour) ``forward`` prints the per-vector mean and standard
            deviation. Pass ``False`` to silence the output, e.g. inside a
            training loop.
    """

    def __init__(self, features: int, eps: float = 1e-6, verbose: bool = True) -> None:
        super().__init__()
        self.eps = eps
        self.verbose = verbose
        self.gamma = nn.Parameter(torch.ones(features))  # Learnable scaling factor
        self.beta = nn.Parameter(torch.zeros(features))  # Learnable shift factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` along its last dimension.

        Args:
            x: Input tensor; statistics are taken over ``dim=-1``.

        Returns:
            ``gamma * (x - mean) / (std + eps) + beta``, same shape as ``x``.
        """
        # keepdim=True keeps a trailing axis of size 1 so the statistics
        # broadcast against x in the arithmetic below.
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)

        # Diagnostic output is opt-out so the module can be used without
        # writing to stdout on every forward pass.
        if self.verbose:
            print("mean Tensor:\n", mean)
            print("std Tensor:\n", std)

        # Normalize and apply learnable parameters
        return self.gamma * (x - mean) / (std + self.eps) + self.beta



In [15]:
# Simulate a batch of sequence embeddings (e.g., from a Transformer)
batch_size = 2
seq_len = 4
d_model = 3
input_tensor = torch.tensor([
    [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]],
    [[1.0, 3.0, 5.0], [2.0, 4.0, 6.0], [3.0, 5.0, 7.0], [4.0, 6.0, 8.0]],
], dtype=torch.float32)  # Shape: (batch_size, seq_len, d_model)

# Guard against the hand-written literal drifting out of sync with the
# declared dimensions above.
assert input_tensor.shape == (batch_size, seq_len, d_model)

print("Input Tensor:\n", input_tensor)

# Initialize Layer Normalization with the hidden size
layer_norm = LayerNormalization(features=d_model)

# Apply layer normalization
normalized_output = layer_norm(input_tensor)

# Sanity checks: normalization keeps the shape, and with the freshly
# initialised gamma=1 / beta=0 every token vector is centred on zero.
assert normalized_output.shape == input_tensor.shape
assert torch.allclose(
    normalized_output.mean(dim=-1),
    torch.zeros(batch_size, seq_len),
    atol=1e-5,
)

print("\nNormalized Output:\n", normalized_output)
# NOTE: the "Alpha" label below refers to the module's `gamma` parameter and
# "Bias" to `beta`; the label text is kept as-is so it matches the saved output.
print("\nLearnable Alpha (Scaling):\n", layer_norm.gamma)
print("\nLearnable Bias (Shift):\n", layer_norm.beta)
     


Input Tensor:
 tensor([[[ 1.,  2.,  3.],
         [ 4.,  5.,  6.],
         [ 7.,  8.,  9.],
         [10., 11., 12.]],

        [[ 1.,  3.,  5.],
         [ 2.,  4.,  6.],
         [ 3.,  5.,  7.],
         [ 4.,  6.,  8.]]])
mean Tensor:
 tensor([[[ 2.],
         [ 5.],
         [ 8.],
         [11.]],

        [[ 3.],
         [ 4.],
         [ 5.],
         [ 6.]]])
std Tensor:
 tensor([[[1.],
         [1.],
         [1.],
         [1.]],

        [[2.],
         [2.],
         [2.],
         [2.]]])

Normalized Output:
 tensor([[[-1.0000,  0.0000,  1.0000],
         [-1.0000,  0.0000,  1.0000],
         [-1.0000,  0.0000,  1.0000],
         [-1.0000,  0.0000,  1.0000]],

        [[-1.0000,  0.0000,  1.0000],
         [-1.0000,  0.0000,  1.0000],
         [-1.0000,  0.0000,  1.0000],
         [-1.0000,  0.0000,  1.0000]]], grad_fn=<AddBackward0>)

Learnable Alpha (Scaling):
 Parameter containing:
tensor([1., 1., 1.], requires_grad=True)

Learnable Bias (Shift):
 Parameter containin