In [1]:
import numpy as np
from typing import Union

In [2]:
Array = Union[float, np.ndarray]

class ReLU:

    """
    ReLU Activation Function

    Forward:
        f(x) = max(0, x)

    Backward:
        f'(x) = 1 if x > 0 else 0
    """
    def forward(self, x: Array) -> Array:
        """Applies ReLU Activation function to input X."""

        self.input = x
        return np.maximum(0, x)

    def backward(self, grad_output: Array) -> Array:
        """Computes the gradient of ReLU with respect to input x.
        grad_output is the gradient flowing from the next layer."""

        return grad_output * (self.input > 0)

In [3]:
class Sigmoid:
    """
    Sigmoid Activation Function

    Forward:
        f(x) = 1 / (1 + e^(-x))

    Backward:
        f'(x) = f(x) * (1 - f(x))
    """

    def forward(self, x: Array) -> Array:
        """Applies Sigmoid Activation function to input X."""
        
        self.output = 1 / (1 + np.exp(-x))
        return self.output
    
    def backward(self, grad_output: Array) -> Array:
        """Computes the gradient of Sigmoid with respect to input x.
        grad_output is the gradient flowing from the next layer."""

        return grad_output * (self.output * (1 - self.output))

In [4]:
class Tanh:
    """
    Tanh Activation Function

    Forward:
        tanh(x) = (e^x - e^-x) / (e^x + e^-x)

    Backward:
        d/dx tanh(x) = 1 - tanh(x)^2
    """

    def forward(self, x: Array) -> Array:
        """Applies Tanh Activation function to input X."""
        
        self.output = np.tanh(x)
        return self.output
        
    def backward(self, grad_output: Array) -> Array:
        """Computes the gradient of Tanh with respect to input x.
        grad_output is the gradient flowing from the next layer."""

        return grad_output * (1 - (self.output ** 2))

In [5]:
class LeakyReLU:
    """
    LeakyReLU Activation Function

    Forward:
        f(x) = max(0, αx)

    Backward:
        f'(x) = 1 if x >= 0 else α
    """

    def __init__(self, alpha: float = 0.01):
        self.alpha = alpha
        
    def forward(self, x: Array) -> Array:
        """Applies LeakyReLU Activation function to input X."""
        self.input = x
        return np.where(x >= 0, x, self.alpha * x)

    def backward(self, grad_output: Array) -> Array:
        """Computes the gradient of LeakyReLU with respect to input x.
        grad_output is the gradient flowing from the next layer."""

        return grad_output * (np.where(self.input >= 0, 1, self.alpha))

In [6]:
class Softmax:
    """
    Softmax Activation Function

    Forward:
        softmax(z_i) = e^(z_i) / sum(e^(z_j))

    Backward:
        Returns gradient of softmax output w.r.t input z
        Usually combined with CrossEntropy for efficient backprop
    """

    def forward(self, x: Array) -> Array:
        
        """Applies Softmax Activation function to input X."""

        x = np.atleast_2d(x)
        x_shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(x_shifted)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)
        return self.output

    def backward(self, grad_output: Array) -> Array:
        """
        Compute softmax derivative using the Jacobian matrix.
        Rarely used alone — often fused with cross-entropy for speed.
        """
        batch_size, classes = grad_output.shape
        grad_input = np.ones_like(grad_output)

        for b in range(batch_size):
            y = self.output[b].reshape(-1, 1)
            jacobian = np.diagflat(y) - np.dot(y, y.T)
            grad_input[b] = np.dot(jacobian, grad_output[b])

        return grad_input

In [7]:
class CrossEntropywithSoftmax:
    
    def __init__(self):
        self.logit: Union[np.ndarray, None] = None
        self.probs: Union[np.ndarray, None] = None
        self.target: Union[np.ndarray, None] = None


    def forward(self, x: Array, target: Array) -> float:

        self.logit = x
        self.target = target

        x = np.atleast_2d(x)
        shifted_x = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted_x)
        self.probs = exps / np.sum(exps, axis=1, keepdims=True)

        if target.ndim == 1:
            one_hot = np.zeros_like(x)
            one_hot[np.arange(len(target)), target] = 1
            self.target = one_hot

        eps = 1e-10
        loss_prob = np.log(np.clip(self.probs, eps, 1 - eps))
        loss = -np.sum(loss_prob * self.target) / x.shape[0]
        return loss

    def backward(self):

        return (self.probs - self.target) / self.logit.shape[0]

In [8]:
class Dense:

    def __init__(self, input_dim: int, output_dim: int):
        """
        Initialize weights and bias.
        Weights are small random values.
        Bias is initialized to zero.
        """

        self.input_dim = input_dim
        self.output_dim = output_dim

        limit = np.sqrt(6 / (input_dim + output_dim))
        self.w = np.random.uniform(-limit, limit, (output_dim, input_dim))
        self.b = np.zeros((1, output_dim))

        self.input: Union[np.ndarray, None] = None
        self.dw: Union[np.ndarray, None] = None
        self.db: Union[np.ndarray, None] = None

    def forward(self, x: Array) -> Array:
        """
        Forward pass: output = xW^T + b
        """
        self.input = np.atleast_2d(x)
        return np.dot(self.input, self.w.T) + self.b

    def backward(self, grad_output: Array) -> Array:
        """
        Backward pass:
        - Compute gradients for weights and bias
        - Return gradient w.r.t input for previous layer
        """
        grad_output = np.atleast_2d(grad_output)

        self.dw = np.dot(grad_output.T, self.input)
        self.db = np.sum(grad_output, axis=0, keepdims=True)

        grad_input = np.dot(grad_output, self.w)

        return grad_input

    def update(self, lr: float = 0.01):
        """
        Update weights and biases using gradient descent
        """
        self.w -= lr * self.dw
        self.b -= lr * self.db

In [9]:
class Model:

    def __init__(self):
        self.layers = []

    def add(self, layer):
        """Add a new layer (Dense, Activation) to the model."""
        self.layers.append(layer)

    def forward(self, x):
        """Forward pass through all layers"""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, grad_output):
        """Backward pass through all layers"""
        for layer in reversed(self.layers):
            grad_output = layer.backward(grad_output)
        return grad_output

    def fit(self, x, y, epochs, loss_fn, lr = 0.01):
        for epoch in range(epochs):
            output = self.forward(x)

            loss = loss_fn.forward(output, y)

            loss_grad = loss_fn.backward()
            self.backward(loss_grad)

            for layer in self.layers:
                if hasattr(layer, "update"):
                    layer.update(lr)
            print(f"Epoch {epoch+1}/{epochs} | Loss: {loss:.6f}")

In [44]:
class MSELoss:
    
    def forward(self, predicted, True_lables):
        predicted = np.atleast_2d(predicted)
        self.predicted = predicted
        self.True_lables = True_lables
        return 0.5 * np.mean((predicted - True_lables) ** 2)
        
    def backward(self):
        return (self.True_lables - self.predicted) / self.True_lables.shape[0]

In [11]:
# class SGD:

#     def __init__(self, lr: float = 0.01):
#         self.lr = lr

#     def step(self, layers):
#         for layer in layers:
#             if hasattr(layer, "w"):
#                 layer.w -= self.lr * layer.dw
#                 layer.b -= self.lr * layer.db

In [12]:
x = np.random.uniform(-1, 1, (3, 5))
x

array([[-0.13471852,  0.45952206, -0.32474822, -0.92289803,  0.03506049],
       [ 0.59406466, -0.16697921, -0.1507749 ,  0.21418862,  0.20444496],
       [-0.41766161,  0.72988176,  0.53904327, -0.83876719, -0.91689403]])

In [17]:
y_true = np.array([
    [0, 1],
    [1, 0],
    [1, 0]
])

In [32]:
from tensorflow import keras

In [33]:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

In [35]:
x_train.shape, y_train.shape

((60000, 28, 28), (60000,))

In [36]:
x_train = x_train.reshape((60000, 784))
x_train.shape

(60000, 784)

In [48]:
def one_hot_encode(y, num_classes=10):
    one_hot = np.zeros((y.shape[0], num_classes))
    one_hot[np.arange(y.shape[0]), y] = 1
    return one_hot

In [51]:
y_train = one_hot_encode(y_train)

In [57]:
model = Model()

In [58]:
model.add(Dense(784, 16))
model.add(ReLU())
model.add(Dense(16, 16))
model.add(ReLU())
model.add(Dense(16, 10))
loss = CrossEntropywithSoftmax()

In [61]:
model.fit(x_train, y_train, epochs=10, loss_fn=loss, lr = 0.1)

Epoch 1/10 | Loss: 2.301857
Epoch 2/10 | Loss: 2.301842
Epoch 3/10 | Loss: 2.301828
Epoch 4/10 | Loss: 2.301813
Epoch 5/10 | Loss: 2.301799
Epoch 6/10 | Loss: 2.301786
Epoch 7/10 | Loss: 2.301772
Epoch 8/10 | Loss: 2.301759
Epoch 9/10 | Loss: 2.301746
Epoch 10/10 | Loss: 2.301734


In [52]:
x = np.array([[1.0, 2.0]])
y_true = np.array([[1.0]])

In [53]:
w1 = np.array([[-0.25091976,  0.90142861,  0.46398788,  0.19731697],
       [-0.68796272, -0.68801096, -0.88383278,  0.73235229]])
b1 = np.array([[0, 0, 0, 0]])

In [54]:
z1 = np.dot(x, w1) + b1
z1

array([[-1.6268452 , -0.47459331, -1.30367768,  1.66202155]])

In [55]:
a1 = np.maximum(0, z1)
a1

array([[0.        , 0.        , 0.        , 1.66202155]])

In [56]:
w2 = np.array([[ 0.18722862,  0.38527555, -0.88770502],
       [ 0.87010397,  0.61556416, -0.53264447],
       [-0.58914568, -0.58622094, -0.36247293],
       [ 0.04584   , -0.12601334, -0.38656852]])
b2 = np.array([[0, 0, 0]])

In [57]:
z2 = np.dot(a1, w2) + b2
z2

array([[ 0.07618707, -0.20943689, -0.64248521]])

In [58]:
a2 = np.maximum(0, z2)
a2

array([[0.07618707, 0.        , 0.        ]])

In [59]:
w3 = np.array([[ 0.27398252],
       [-0.88305609],
       [-0.50913955]])
b3 = np.array([[0]])

In [61]:
z3 = np.dot(a2, w3) + b3
z3

array([[0.02087392]])

In [70]:
a3 = 1 / (1 + np.exp(-z3))
a3

array([[0.50521829]])

In [71]:
loss = 0.5 * np.mean((a3 - y_true) ** 2)
loss

0.12240446941668841

In [93]:
loss_wrt_a3 = a3 - y_true
loss_wrt_a3

array([[-0.49478171]])

In [94]:
a3_wrt_z3 = loss_wrt_a3 * a3 * (1 - a3)
a3_wrt_z3

array([[-0.12368195]])

In [97]:
z3_wrt_a2 = np.dot(a2.T, a3_wrt_z3)
z3_wrt_a2

array([[-0.00942297],
       [ 0.        ],
       [ 0.        ]])

In [100]:
z3_wrt_a2_we = np.dot(a3_wrt_z3, w3.T)
z3_wrt_a2_we

array([[-0.03388669,  0.1092181 ,  0.06297137]])

In [103]:
a2_wrt_z2 = np.where(z2>0, 1, 0) * z3_wrt_a2_we
a2_wrt_z2

array([[-0.03388669,  0.        ,  0.        ]])

In [106]:
z2_wrt_a1 = np.dot(a1.T, a2_wrt_z2)
z2_wrt_a1

array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [-0.05632041,  0.        ,  0.        ]])

In [108]:
z2_wrt_a1_we = np.dot(a2_wrt_z2, w2.T)
z2_wrt_a1_we

array([[-0.00634456, -0.02948495,  0.0199642 , -0.00155337]])

In [110]:
a1_wrt_z1 = np.where(z1>0, 1, 0) * z2_wrt_a1_we
a1_wrt_z1

array([[-0.        , -0.        ,  0.        , -0.00155337]])

In [112]:
z1_wrt_w1 = np.dot(x.T, a1_wrt_z1)
z1_wrt_w1

array([[ 0.        ,  0.        ,  0.        , -0.00155337],
       [ 0.        ,  0.        ,  0.        , -0.00310673]])