In [1]:
import tensorflow as tf
import numpy as np

class SequentialTFModel:
    """Minimal fully-connected network built from raw TensorFlow variables.

    Layers are appended with :meth:`add`. Each layer is stored in
    ``self.layers`` as a dict with a weight matrix ``'W'``, a bias vector
    ``'b'`` and an ``'activation'`` name. :meth:`train` runs full-batch
    gradient descent with Adam and optional gradient clipping.

    Attributes:
        layers: List of layer dicts, in the order they were added.
        input_dim: Number of input features passed to the constructor.
        prev_dim: Output width of the most recently added layer (equal to
            ``input_dim`` while no layer has been added); used as the input
            width of the next layer.
    """

    # Activation names accepted by `add` and understood by `forward`.
    # Kept as plain strings so validation does not depend on TensorFlow.
    _SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid', 'tanh', 'linear')

    # Loss names accepted by `train`.
    _SUPPORTED_LOSSES = ('mse', 'bce')

    def __init__(self, input_dim):
        """Create an empty model that expects ``input_dim`` input features."""
        self.layers = []
        self.input_dim = input_dim
        self.prev_dim = input_dim

    def add(self, units, activation='relu'):
        """Append a dense layer with ``units`` outputs.

        Weights are drawn from a normal distribution with ``stddev=0.1`` and
        biases start at zero.

        Args:
            units: Number of output units of the new layer.
            activation: One of ``'relu'``, ``'sigmoid'``, ``'tanh'`` or
                ``'linear'`` (identity).

        Raises:
            ValueError: If ``activation`` is not a supported name. The model
                is left unchanged in that case.
        """
        # Validate before allocating variables so a typo fails here rather
        # than on the first forward pass, and never leaves a broken layer.
        if activation not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation: {activation!r}; "
                f"expected one of {self._SUPPORTED_ACTIVATIONS}"
            )

        W = tf.Variable(tf.random.normal([self.prev_dim, units], stddev=0.1), trainable=True)
        b = tf.Variable(tf.zeros([units]), trainable=True)
        self.layers.append({'W': W, 'b': b, 'activation': activation})
        self.prev_dim = units

    def forward(self, X):
        """Run ``X`` through every layer in order and return the final output.

        Each layer computes ``activation(out @ W + b)``. With no layers the
        input is returned unchanged.

        Raises:
            ValueError: If a stored layer carries an unsupported activation
                name (possible only if ``self.layers`` was edited directly).
        """
        out = X
        for layer in self.layers:
            Z = tf.matmul(out, layer['W']) + layer['b']
            activation = layer['activation']
            if activation == 'relu':
                out = tf.nn.relu(Z)
            elif activation == 'sigmoid':
                out = tf.nn.sigmoid(Z)
            elif activation == 'tanh':
                out = tf.nn.tanh(Z)
            elif activation == 'linear':
                out = Z
            else:
                raise ValueError("Unsupported activation")
        return out

    def clip(self, grads, clip_value):
        """Clip every gradient element-wise to ``[-clip_value, clip_value]``.

        Args:
            grads: Iterable of gradient tensors; ``None`` entries are passed
                through unchanged.
            clip_value: Non-negative clipping threshold.

        Returns:
            A new list with the clipped gradients, in the same order.

        Raises:
            ValueError: If ``clip_value`` is negative.
        """
        if clip_value < 0:
            raise ValueError("clip_value must be non-negative")

        # tf.clip_by_value takes the bounds in the gradient's dtype, so an
        # integer threshold (e.g. clip=1) works with float gradients.
        return [
            None if g is None else tf.clip_by_value(g, -clip_value, clip_value)
            for g in grads
        ]

    def clip_norm(self, grads, max_norm):
        """Rescale gradients so their joint L2 norm is at most ``max_norm``.

        All non-``None`` gradients are multiplied by the same factor
        ``min(1, max_norm / (global_norm + 1e-6))``.

        Args:
            grads: Iterable of gradient tensors; ``None`` entries are passed
                through unchanged.
            max_norm: Upper bound for the global norm.

        Returns:
            A new list with the rescaled gradients, in the same order.
        """
        grads = list(grads)
        present = [g for g in grads if g is not None]
        if not present:
            # Nothing to measure or scale; avoids taking a square root of
            # the integer 0 that an empty sum would produce.
            return grads

        total_norm = tf.linalg.global_norm(present)
        # The factor is identical for every gradient, so compute it once.
        scale = tf.minimum(1.0, max_norm / (total_norm + 1e-6))
        return [None if g is None else g * scale for g in grads]

    def train(self, X, Y, epochs=100, lr=0.01, clip=None, norm=None, loss_fn='mse'):
        """Fit the model on ``(X, Y)`` with full-batch Adam updates.

        Every epoch runs one forward pass over all of ``X``, computes the
        loss, optionally clips the gradients (by value first, then by global
        norm) and applies one optimizer step. The loss is printed every 10
        epochs.

        Note: ``Y`` should have the same shape as the model output. In the
        ``'mse'`` branch the difference ``predictions - Y`` follows
        broadcasting rules, so a mismatched shape may not raise.

        Args:
            X: Input batch fed to :meth:`forward`.
            Y: Targets compared against the predictions.
            epochs: Number of optimizer steps.
            lr: Learning rate passed to Adam.
            clip: If not ``None``, element-wise clipping threshold
                (see :meth:`clip`).
            norm: If not ``None``, maximum global gradient norm
                (see :meth:`clip_norm`).
            loss_fn: ``'mse'`` (mean squared error) or ``'bce'``
                (binary cross-entropy).

        Raises:
            ValueError: If ``loss_fn`` is unsupported or the model has no
                layers.
        """
        # Fail fast on configuration errors instead of inside the first
        # recorded forward pass or inside the optimizer.
        if loss_fn not in self._SUPPORTED_LOSSES:
            raise ValueError("Unsupported loss")
        if not self.layers:
            raise ValueError("Model has no layers; call add() before train()")

        optimizer = tf.optimizers.Adam(lr)
        # The variable list does not change during training; build it once.
        variables = [layer['W'] for layer in self.layers] + [layer['b'] for layer in self.layers]

        for epoch in range(epochs):
            with tf.GradientTape() as tape:
                predictions = self.forward(X)
                if loss_fn == 'mse':
                    loss = tf.reduce_mean((predictions - Y) ** 2)
                else:  # 'bce', guaranteed by the validation above
                    loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(Y, predictions))

            grads = tape.gradient(loss, variables)

            if clip is not None:
                grads = self.clip(grads, clip)
            if norm is not None:
                grads = self.clip_norm(grads, norm)

            optimizer.apply_gradients(zip(grads, variables))

            if epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {loss.numpy():.4f}")

    def predict(self, X):
        """Return the model output for ``X`` (same as :meth:`forward`)."""
        return self.forward(X)

In [2]:
# Seed TensorFlow's global RNG so the synthetic data and the initial weights
# are identical on every Restart & Run All.
SEED = 42
tf.random.set_seed(SEED)

# Toy binary task: the label is 1 when the two features sum to a positive value.
X = tf.random.normal([100, 2])
Y = tf.cast(tf.reduce_sum(X, axis=1, keepdims=True) > 0, tf.float32)

# 2 -> 5 (ReLU) -> 1 (sigmoid), trained with binary cross-entropy and both
# element-wise and global-norm gradient clipping enabled.
model = SequentialTFModel(input_dim=2)
model.add(5, 'relu')
model.add(1, 'sigmoid')
model.train(X, Y, lr=0.01, epochs=100, clip=1.0, norm=5.0, loss_fn='bce')

# Round the sigmoid outputs to hard 0/1 class predictions.
print("Predictions:\n", tf.round(model.predict(X)).numpy().squeeze())

Epoch 0, Loss: 0.6928
Epoch 10, Loss: 0.6640
Epoch 20, Loss: 0.5956
Epoch 30, Loss: 0.4891
Epoch 40, Loss: 0.3718
Epoch 50, Loss: 0.2770
Epoch 60, Loss: 0.2134
Epoch 70, Loss: 0.1733
Epoch 80, Loss: 0.1475
Epoch 90, Loss: 0.1298
Predictions:
 [1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1.
 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0.
 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.
 0. 1. 1. 1.]
