In [2]:
import numpy as np
import tensorflow as tf

In [5]:
# One batch (B=1) holding two positions (S=2) of 3-dimensional vectors (E=3).
inputs = tf.constant([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.shape
# Bring the S axis to the front by permuting axes. tf.reshape would merely
# reinterpret the flat buffer: it matches a transpose only while B == 1 and
# would mix positions across batch entries for any larger batch.
inputs = tf.transpose(inputs, perm=[1, 0, 2])
inputs.shape

TensorShape([2, 1, 3])

In [6]:
# Display the rearranged tensor to confirm its (S, B, E) layout and values.
inputs

<tf.Tensor: shape=(2, 1, 3), dtype=float32, numpy=
array([[[0.2, 0.1, 0.3]],

       [[0.5, 0.1, 0.1]]], dtype=float32)>

In [12]:
# Learnable scale (gamma, initialised to ones) and shift (beta, initialised to
# zeros), one value per element of the trailing two axes of `inputs`.
# NOTE(review): with the (S, B, E) layout those two axes are (B, E), i.e. the
# batch axis is included -- confirm this is intended; the later example in this
# notebook passes only the last axis.
parameter_shape = inputs.shape[-2:]
gamma = tf.Variable(initial_value=tf.ones(shape=parameter_shape))
beta = tf.Variable(initial_value=tf.zeros(shape=parameter_shape))

In [13]:
# Both parameters should report the same shape as `parameter_shape`.
tuple(param.shape for param in (gamma, beta))

(TensorShape([1, 3]), TensorShape([1, 3]))

In [16]:
# Average over the last two axes; keepdims leaves them as size-1 axes so the
# result broadcasts against `inputs` in the following cells.
dims = [-1, -2]
mean = tf.reduce_mean(input_tensor=inputs, axis=dims, keepdims=True)
mean

<tf.Tensor: shape=(2, 1, 1), dtype=float32, numpy=
array([[[0.2       ]],

       [[0.23333335]]], dtype=float32)>

In [18]:
# Small constant added to the variance before the square root, so that a later
# division by `std` stays finite when the variance is zero.
eps = 1e-5
# Biased variance: the mean squared deviation over the same axes as `mean`.
var = tf.reduce_mean(
    input_tensor=(inputs - mean) ** 2,
    axis=dims,
    keepdims=True,
)
std = tf.sqrt(var + eps)
std

<tf.Tensor: shape=(2, 1, 1), dtype=float32, numpy=
array([[[0.08171088]],

       [[0.18858834]]], dtype=float32)>

In [20]:
# Standardise: subtract the mean and divide by the (eps-stabilised) standard
# deviation; `mean` and `std` broadcast over the reduced axes.
y = (inputs - mean) / std
y.numpy()

array([[[ 0.       , -1.2238272,  1.2238274]],

       [[ 1.4140146, -0.7070074, -0.7070074]]], dtype=float32)

In [22]:
# Apply the learnable scale and shift. With gamma all ones and beta all zeros
# (their initial values) the result equals `y`.
out = gamma * y + beta
out

<tf.Tensor: shape=(2, 1, 3), dtype=float32, numpy=
array([[[ 0.       , -1.2238272,  1.2238274]],

       [[ 1.4140146, -0.7070074, -0.7070074]]], dtype=float32)>

## Layer normalization as a reusable class

In [28]:
class LayerNormalization:
    """Layer normalization with a learnable element-wise scale and shift.

    The trailing ``len(parameter_shape)`` axes of the input are standardised
    (mean subtracted, divided by the eps-stabilised standard deviation) and
    the result is multiplied by ``gamma`` and offset by ``beta``.

    Args:
        parameter_shape: Shape of the trailing axes to normalize over; also
            used as the shape of ``gamma`` and ``beta``.
        eps: Constant added to the variance before taking the square root so
            the division stays finite when the variance is zero.
    """

    def __init__(self, parameter_shape, eps=1e-5):
        self.parameter_shape = parameter_shape
        self.eps = eps
        # gamma starts at ones and beta at zeros, so a freshly built layer
        # returns the plain standardised values.
        self.gamma = tf.Variable(tf.ones(parameter_shape))
        self.beta = tf.Variable(tf.zeros(parameter_shape))

    def forward(self, input):
        """Normalize ``input`` over its trailing ``parameter_shape`` axes.

        Args:
            input: Tensor whose trailing axes are compatible with
                ``parameter_shape``.

        Returns:
            The standardised tensor scaled by ``gamma`` and shifted by
            ``beta``; it has the shape of ``input`` when the trailing axes
            match ``parameter_shape``.
        """
        # One negative axis index per parameter dimension, e.g. [-2, -1].
        dims = list(range(-len(self.parameter_shape), 0))
        mean = tf.reduce_mean(input, axis=dims, keepdims=True)
        # The statistics must come from the argument itself, never from a
        # notebook-level variable that happens to be called `inputs`.
        var = tf.reduce_mean((input - mean) ** 2, axis=dims, keepdims=True)
        std = tf.sqrt(var + self.eps)
        y = (input - mean) / std
        return self.gamma * y + self.beta

In [29]:
# Build the layer with the same parameter shape used in the step-by-step cells.
LN = LayerNormalization(parameter_shape=parameter_shape)

In [30]:
# The class should reproduce the `out` tensor computed manually above.
LN.forward(input=inputs)

<tf.Tensor: shape=(2, 1, 3), dtype=float32, numpy=
array([[[ 0.       , -1.2238272,  1.2238274]],

       [[ 1.4140146, -0.7070074, -0.7070074]]], dtype=float32)>

In [32]:
batch_size = 3
sentence_length = 5
embedding_dim = 8

# Fix the global seed so a fresh Restart & Run All draws the same sample.
# Outputs recorded from an earlier, unseeded run will show different values
# until the notebook is re-executed.
tf.random.set_seed(0)
inputs = tf.random.normal((sentence_length, batch_size, embedding_dim))
inputs.shape

TensorShape([5, 3, 8])

In [36]:
# Normalize over the last axis only: one gamma/beta value per element of it.
layer_norm = LayerNormalization(parameter_shape=inputs.shape[-1:])

In [37]:
# Run the random batch through the layer.
out = layer_norm.forward(input=inputs)

In [38]:
# Display the normalized tensor; it keeps the shape of `inputs`.
out

<tf.Tensor: shape=(5, 3, 8), dtype=float32, numpy=
array([[[ 0.24959044, -1.1048054 , -1.1708173 ,  0.08870648,
          1.0341773 ,  1.8087295 ,  0.08912182, -0.9947031 ],
        [-1.0594995 ,  0.20832284, -1.3972174 ,  0.55269593,
          1.7174428 ,  0.82128227,  0.1247665 , -0.96779376],
        [ 0.9477291 ,  0.49527913, -0.21491294, -0.47630686,
          1.7647134 , -1.5092286 , -1.0884578 ,  0.08118449]],

       [[ 0.82603425, -1.5432388 ,  0.56175447, -0.63570523,
         -0.6499303 ,  1.0715604 , -0.9503917 ,  1.319917  ],
        [ 1.6752467 , -1.4900435 , -0.49186012, -0.59815747,
         -0.8664233 ,  0.08232222,  1.1525109 ,  0.5364051 ],
        [ 1.0797075 , -0.09241336, -1.1581281 , -1.017531  ,
         -1.2740653 ,  0.8651607 ,  1.4316876 ,  0.16558167]],

       [[ 0.41860726,  1.8527715 , -0.7397673 , -0.00230139,
         -1.384277  , -0.31997353, -0.86404264,  1.0389833 ],
        [-0.6523569 , -1.2956734 ,  1.0990187 , -0.11400246,
          1.8878645 , -

In [42]:
# Sanity check: the mean over the first slice along axis 0 should be close to 0.
tf.reduce_mean(input_tensor=out[0])

<tf.Tensor: shape=(), dtype=float32, numpy=-4.967054e-09>

In [43]:
# Sanity check: the standard deviation of the same slice should be close to 1.
tf.math.reduce_std(input_tensor=out[0])

<tf.Tensor: shape=(), dtype=float32, numpy=0.99999523>