In [15]:
import tensorflow as tf
import numpy as np

class BatchNormLayer:
    """Batch normalisation with a learnable per-feature scale and shift.

    Statistics are reduced over axis 0 of the input. In training mode the
    batch mean/variance are used for normalisation and folded into running
    averages; otherwise the stored running averages are used instead.

    Args:
        dim: Length of the per-feature parameter vectors (gamma, beta and
            the running statistics).
        epsilon: Constant added to the variance before the square root.
        momentum: Weight kept on the previous running statistic at each
            update; the fresh batch statistic gets ``1 - momentum``.
    """

    def __init__(self, dim, epsilon=1e-5, momentum=0.9):
        self.epsilon = epsilon
        self.momentum = momentum

        param_shape = [dim]
        # Learnable scale (starts at 1) and shift (starts at 0).
        self.gamma = tf.Variable(tf.ones(param_shape), trainable=True)
        self.beta = tf.Variable(tf.zeros(param_shape), trainable=True)
        # Running statistics are updated by assignment, never by the optimizer.
        self.running_mean = tf.Variable(tf.zeros(param_shape), trainable=False)
        self.running_var = tf.Variable(tf.ones(param_shape), trainable=False)

    def _blend(self, running, fresh):
        """Return the exponential moving average of ``running`` and ``fresh``."""
        return self.momentum * running + (1 - self.momentum) * fresh

    def __call__(self, X, training=True):
        """Normalise ``X`` and apply the learned scale and shift.

        Args:
            X: Input tensor; presumably ``(batch, dim)`` -- TODO confirm
                against callers. Statistics are taken over axis 0.
            training: When truthy, use batch statistics and update the
                running averages as a side effect; otherwise use the
                running averages and leave them untouched.

        Returns:
            ``gamma * X_norm + beta``.
        """
        if not training:
            # Inference path: rely solely on the accumulated statistics.
            denom = tf.sqrt(self.running_var + self.epsilon)
            return self.gamma * ((X - self.running_mean) / denom) + self.beta

        mu = tf.reduce_mean(X, axis=0)
        sigma_sq = tf.math.reduce_variance(X, axis=0)

        # Update the running statistics before normalising with the batch ones.
        self.running_mean.assign(self._blend(self.running_mean, mu))
        self.running_var.assign(self._blend(self.running_var, sigma_sq))

        centered = X - mu
        normalized = centered / tf.sqrt(sigma_sq + self.epsilon)
        return self.gamma * normalized + self.beta

In [17]:
class SequentialTFModel:
    """Minimal fully-connected network built from raw ``tf.Variable`` weights.

    Each layer is a dict stored in ``self.layers`` with the keys ``'W'``,
    ``'b'``, ``'activation'``, ``'batchnorm'`` and, when batch norm is
    enabled, ``'bn'`` (a ``BatchNormLayer``).

    Args:
        input_dim: Number of input features; used as the first layer's fan-in.
    """

    # Activation names accepted by ``add``. ``'linear'`` / ``None`` pass the
    # pre-activation through unchanged (e.g. for a regression output).
    _SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid', 'linear', None)
    _SUPPORTED_LOSSES = ('mse', 'bce')

    def __init__(self, input_dim):
        self.layers = []
        self.input_dim = input_dim
        self.prev_dim = input_dim  # fan-in of the next layer to be added

    def add(self, units, activation='relu', batchnorm=False):
        """Append a dense layer.

        Args:
            units: Output width of the layer.
            activation: ``'relu'``, ``'sigmoid'``, ``'linear'`` or ``None``.
            batchnorm: If true, a ``BatchNormLayer`` is applied to the
                pre-activation before the activation function.

        Raises:
            ValueError: If ``activation`` is not supported. Raised here so the
                mistake surfaces at construction time rather than at the
                first forward pass.
        """
        if activation not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError("Unsupported activation")

        layer = {
            # Weights drawn from N(0, 0.1^2); biases start at zero.
            'W': tf.Variable(tf.random.normal([self.prev_dim, units], stddev=0.1), trainable=True),
            'b': tf.Variable(tf.zeros([units]), trainable=True),
            'activation': activation,
            'batchnorm': batchnorm,
        }
        if batchnorm:
            layer['bn'] = BatchNormLayer(units)
        self.layers.append(layer)
        self.prev_dim = units

    @staticmethod
    def _activate(Z, activation):
        """Apply the named activation to ``Z``; raise ValueError if unknown."""
        if activation == 'relu':
            return tf.nn.relu(Z)
        if activation == 'sigmoid':
            return tf.nn.sigmoid(Z)
        if activation is None or activation == 'linear':
            return Z
        # Still reachable if ``self.layers`` was edited directly.
        raise ValueError("Unsupported activation")

    @staticmethod
    def _compute_loss(Y, predictions, loss_fn):
        """Return the scalar loss for ``loss_fn`` (``'mse'`` or ``'bce'``)."""
        if loss_fn == 'mse':
            return tf.reduce_mean((predictions - Y) ** 2)
        if loss_fn == 'bce':
            return tf.reduce_mean(tf.keras.losses.binary_crossentropy(Y, predictions))
        raise ValueError("Unsupported loss")

    def _trainable_variables(self):
        """Collect W, b and (if present) batch-norm gamma/beta of every layer."""
        variables = []
        for layer in self.layers:
            variables.extend([layer['W'], layer['b']])
            if layer['batchnorm']:
                variables.extend([layer['bn'].gamma, layer['bn'].beta])
        return variables

    def forward(self, X, training=True):
        """Run ``X`` through every layer in insertion order.

        Args:
            X: Input tensor whose last dimension matches ``input_dim``.
            training: Forwarded to each ``BatchNormLayer`` to select batch
                versus running statistics.

        Returns:
            Output of the last layer, or ``X`` unchanged if no layer was added.

        Raises:
            ValueError: If a layer carries an unsupported activation.
        """
        out = X
        for layer in self.layers:
            Z = tf.matmul(out, layer['W']) + layer['b']
            if layer['batchnorm']:
                Z = layer['bn'](Z, training)
            out = self._activate(Z, layer['activation'])
        return out

    def train(self, X, Y, epochs=100, lr=0.01, loss_fn='mse'):
        """Fit the model with Adam, taking one full-batch step per epoch.

        The loss is printed every 10 epochs.

        Args:
            X: Training inputs.
            Y: Training targets.
                NOTE(review): Y should have the same shape as the model
                output; a rank-1 Y against an (N, 1) output would broadcast
                in the MSE expression -- confirm at call sites.
            epochs: Number of gradient steps.
            lr: Adam learning rate.
            loss_fn: ``'mse'`` or ``'bce'``.

        Raises:
            ValueError: If ``loss_fn`` is not supported (checked up front,
                before any computation).
        """
        if loss_fn not in self._SUPPORTED_LOSSES:
            raise ValueError("Unsupported loss")

        optimizer = tf.optimizers.Adam(lr)
        # The variable set does not change during training, so gather it once
        # instead of rebuilding the list on every epoch.
        variables = self._trainable_variables()

        for epoch in range(epochs):
            with tf.GradientTape() as tape:
                predictions = self.forward(X, training=True)
                loss = self._compute_loss(Y, predictions, loss_fn)

            grads = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(grads, variables))

            if epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {loss.numpy():.4f}")

    def predict(self, X):
        """Return the forward pass in inference mode (``training=False``)."""
        return self.forward(X, training=False)

In [23]:
# Fix the global TensorFlow seed so the synthetic data and the initial layer
# weights (both drawn with tf.random.normal) are the same on every run.
# Re-run this cell and the following one to refresh the recorded output.
SEED = 42
tf.random.set_seed(SEED)

# Toy binary classification task: the label is 1 when the two features sum
# to a positive number. keepdims=True keeps Y at shape (100, 1).
X = tf.random.normal([100, 2])
Y = tf.cast(tf.reduce_sum(X, axis=1, keepdims=True) > 0, tf.float32)

# Model: 2 -> 5 (ReLU + batch norm) -> 1 (sigmoid), trained with BCE.
model = SequentialTFModel(input_dim=2)
model.add(units=5, activation='relu', batchnorm=True)
model.add(units=1, activation='sigmoid')
model.train(X, Y, epochs=100, lr=0.01, loss_fn='bce')

Epoch 0, Loss: 0.6941
Epoch 10, Loss: 0.6024
Epoch 20, Loss: 0.4980
Epoch 30, Loss: 0.3852
Epoch 40, Loss: 0.2881
Epoch 50, Loss: 0.2179
Epoch 60, Loss: 0.1699
Epoch 70, Loss: 0.1382
Epoch 80, Loss: 0.1175
Epoch 90, Loss: 0.1030


In [27]:
# Compare the ground-truth labels with the rounded inference-mode outputs.
actual_labels = Y.numpy().squeeze()
predicted_labels = tf.round(model.predict(X)).numpy().squeeze()
print("Actual:\n", actual_labels)
print("Predictions:\n", predicted_labels)

Actual:
 [1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1.
 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 0.]
Predictions:
 [0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1.
 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 0.]
