In [1]:
import numpy as np
from typing import Union

In [2]:
# Type alias used in the layer signatures below: a value is either a plain
# Python float or a NumPy array.
Array = Union[float, np.ndarray]

In [None]:

class ReLU:
    """
    Rectified Linear Unit activation.

    Forward:
        f(x) = max(0, x)

    Backward:
        f'(x) = 1 where x > 0, otherwise 0
    """

    def forward(self, x: Array) -> Array:
        """Cache ``x`` for the backward pass and return it with negatives set to 0."""
        self.input = x
        rectified = np.maximum(0, x)
        return rectified

    def backward(self, grad_output: Array) -> Array:
        """Propagate ``grad_output`` (gradient from the next layer) to the input.

        The gradient passes through unchanged where the cached input was
        strictly positive and is zeroed everywhere else.
        """
        positive_mask = self.input > 0
        return grad_output * positive_mask

In [25]:
class Sigmoid:
    """
    Sigmoid Activation Function

    Forward:
        f(x) = 1 / (1 + e^(-x))

    Backward:
        f'(x) = f(x) * (1 - f(x))
    """

    def forward(self, x: Array) -> Array:
        """Applies the sigmoid to ``x`` and caches the result for ``backward``.

        The textbook form ``1 / (1 + exp(-x))`` overflows in ``exp`` for large
        negative ``x``. Here the exponential is only ever taken of a
        non-positive number, so it stays within (0, 1]:

            x >= 0:  1 / (1 + e^(-|x|))
            x <  0:  e^(-|x|) / (1 + e^(-|x|))

        Both branches are algebraically equal to the textbook formula.
        Scalars in give scalars out; arrays keep their shape.
        """
        x = np.asarray(x)
        decay = np.exp(-np.abs(x))
        result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
        # np.where wraps scalar input in a 0-d array; unwrap it again.
        self.output = result[()] if result.ndim == 0 else result
        return self.output

    def backward(self, grad_output: Array) -> Array:
        """Computes the gradient of Sigmoid with respect to input x.

        ``grad_output`` is the gradient flowing from the next layer; it is
        scaled element-wise by f(x) * (1 - f(x)) using the cached output.
        """
        return grad_output * (self.output * (1 - self.output))

In [26]:
class Tanh:
    """
    Hyperbolic tangent activation.

    Forward:
        tanh(x) = (e^x - e^-x) / (e^x + e^-x)

    Backward:
        d/dx tanh(x) = 1 - tanh(x)^2
    """

    def forward(self, x: Array) -> Array:
        """Return tanh(x) and cache it so ``backward`` can reuse it."""
        activated = np.tanh(x)
        self.output = activated
        return activated

    def backward(self, grad_output: Array) -> Array:
        """Scale ``grad_output`` (gradient from the next layer) by 1 - tanh(x)^2.

        The derivative is computed from the cached forward output.
        """
        local_slope = 1 - (self.output ** 2)
        return grad_output * local_slope

In [3]:
class LeakyReLU:
    """
    Leaky ReLU activation.

    Forward:
        f(x) = x if x >= 0 else αx

    Backward:
        f'(x) = 1 if x >= 0 else α
    """

    def __init__(self, alpha: float = 0.01):
        # Slope applied to negative inputs.
        self.alpha = alpha

    def forward(self, x: Array) -> Array:
        """Cache ``x`` and return it with negative entries scaled by ``alpha``."""
        self.input = x
        non_negative = x >= 0
        leaked = self.alpha * x
        return np.where(non_negative, x, leaked)

    def backward(self, grad_output: Array) -> Array:
        """Scale ``grad_output`` (gradient from the next layer) by the local slope.

        The slope is 1 where the cached input was >= 0 and ``alpha`` elsewhere.
        """
        slope = np.where(self.input >= 0, 1, self.alpha)
        return grad_output * slope

In [28]:
class Softmax:
    """
    Softmax Activation Function

    Forward:
        softmax(z_i) = e^(z_i) / sum(e^(z_j))

    Backward:
        Returns gradient of the loss w.r.t. the softmax input z.
        Usually combined with CrossEntropy for efficient backprop.
    """

    def forward(self, x: Array) -> Array:
        """Applies softmax row-wise to ``x``.

        ``x`` is promoted to 2-D (one row per sample), so a 1-D vector is
        treated as a single sample and the result always has two dimensions.
        The row maximum is subtracted before exponentiating so that the
        largest exponent is 0 and ``exp`` cannot overflow.
        """
        logits = np.atleast_2d(x)
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)
        return self.output

    def backward(self, grad_output: Array) -> Array:
        """Multiplies ``grad_output`` by the softmax Jacobian of each sample.

        For one sample with output y the Jacobian is diag(y) - y y^T, so

            J @ g = y * (g - sum(g * y))

        which is evaluated for the whole batch at once instead of building a
        (classes x classes) matrix per sample. ``grad_output`` is promoted to
        2-D to mirror ``forward``, and the result dtype follows NumPy
        promotion, so integer-valued gradients are no longer truncated.
        """
        grad_output = np.atleast_2d(grad_output)
        # Per-row inner product <g, y>, kept as a column for broadcasting.
        weighted = np.sum(grad_output * self.output, axis=1, keepdims=True)
        return self.output * (grad_output - weighted)

In [4]:
class CrossEntropywithSoftmax:
    """Softmax followed by mean cross-entropy, fused into one loss object.

    ``forward`` takes raw logits (no softmax applied beforehand) and
    ``backward`` returns the gradient of the mean loss w.r.t. those logits,
    (probs - target) / batch_size.
    """

    def __init__(self):
        # All three are filled in by forward(); logit/probs/target are 2-D
        # with one row per sample.
        self.logit: Union[np.ndarray, None] = None
        self.probs: Union[np.ndarray, None] = None
        self.target: Union[np.ndarray, None] = None

    def forward(self, x: Array, target: Array) -> float:
        """Returns the mean cross-entropy of ``softmax(x)`` against ``target``.

        Args:
            x: logits; promoted to 2-D, so a 1-D vector is one sample.
            target: either integer class indices (scalar or 1-D, one per
                sample) or a 2-D array of per-class targets such as one-hot
                rows. Lists are accepted.

        Raises:
            ValueError: if the number of class indices differs from the
                number of logit rows.
        """
        logits = np.atleast_2d(x)
        labels = np.asarray(target)
        # Store the 2-D view so backward() divides by the true batch size even
        # when a single 1-D logit vector was passed in.
        self.logit = logits
        batch_size = logits.shape[0]

        # Subtract the row max so the largest exponent is 0 (no overflow).
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.probs = exps / np.sum(exps, axis=1, keepdims=True)

        if labels.ndim <= 1:
            # Class indices -> one-hot rows matching probs.
            labels = np.atleast_1d(labels)
            if labels.shape[0] != batch_size:
                raise ValueError(
                    f"got {labels.shape[0]} labels for {batch_size} samples"
                )
            one_hot = np.zeros_like(self.probs)
            one_hot[np.arange(batch_size), labels] = 1
            labels = one_hot
        self.target = labels

        # Clip before the log so a probability of exactly 0 cannot yield -inf.
        eps = 1e-10
        log_probs = np.log(np.clip(self.probs, eps, 1 - eps))
        return -np.sum(log_probs * self.target) / batch_size

    def backward(self):
        """Returns d(mean loss)/d(logits) for the most recent ``forward`` call.

        Raises:
            RuntimeError: if ``forward`` has not been called yet.
        """
        if self.probs is None or self.target is None:
            raise RuntimeError("forward() must be called before backward()")
        return (self.probs - self.target) / self.probs.shape[0]

In [5]:
class Dense:
    """Fully connected layer computing ``x @ W.T + b``.

    ``w`` has shape (output_dim, input_dim) and ``b`` has shape
    (1, output_dim). ``backward`` stores the parameter gradients in ``dw`` and
    ``db``; ``update`` applies them.
    """

    def __init__(self, input_dim: int, output_dim: int):
        """
        Create the parameters.

        Weights are drawn uniformly from [-limit, limit) with
        limit = sqrt(6 / (input_dim + output_dim)) (Glorot/Xavier uniform);
        the bias starts at zero.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        limit = np.sqrt(6 / (input_dim + output_dim))
        weight_shape = (output_dim, input_dim)
        self.w = np.random.uniform(-limit, limit, weight_shape)
        self.b = np.zeros((1, output_dim))

        # Filled in by forward() / backward().
        self.input: Union[np.ndarray, None] = None
        self.dw: Union[np.ndarray, None] = None
        self.db: Union[np.ndarray, None] = None

    def forward(self, x: Array) -> Array:
        """
        Forward pass: output = xW^T + b.

        ``x`` is promoted to 2-D (one row per sample) and cached for backward.
        """
        self.input = np.atleast_2d(x)
        projected = np.dot(self.input, self.w.T)
        return projected + self.b

    def backward(self, grad_output: Array) -> Array:
        """
        Backward pass.

        Stores dL/dW in ``self.dw`` and dL/db in ``self.db``, then returns
        dL/dx for the previous layer.
        """
        upstream = np.atleast_2d(grad_output)

        self.dw = np.dot(upstream.T, self.input)
        # The bias is shared by every sample, so its gradient sums over rows.
        self.db = np.sum(upstream, axis=0, keepdims=True)

        return np.dot(upstream, self.w)

    def update(self, lr: float = 0.01):
        """
        Apply one in-place gradient-descent step to the weights and the bias.
        """
        for param, grad in ((self.w, self.dw), (self.b, self.db)):
            param -= lr * grad

In [6]:
class Model:
    """Sequential container: layers run in insertion order on the forward
    pass and in reverse order on the backward pass."""

    def __init__(self):
        self.layers = []

    def add(self, layer):
        """Append a layer (Dense, activation, ...) to the end of the stack."""
        self.layers.append(layer)

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

    def backward(self, grad_output):
        """Push ``grad_output`` back through the layers, last layer first."""
        grad = grad_output
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def fit(self, x, y, epochs, loss_fn, lr = 0.01, batch_size = 64):
        """Train with shuffled mini-batch gradient descent.

        Prints the sample count once, then one line per epoch with the
        sample-weighted mean loss and the argmax accuracy, both measured on
        the training batches before each parameter update. Returns None.
        """
        n_samples = x.shape[0]
        print(n_samples)

        for epoch in range(epochs):
            # Fresh random order every epoch.
            order = np.random.permutation(x.shape[0])
            x_shuffled, y_shuffled = x[order], y[order]

            total_loss = 0
            correct = 0

            for start in range(0, n_samples, batch_size):
                stop = start + batch_size
                xb = x_shuffled[start:stop]
                yb = y_shuffled[start:stop]

                output = self.forward(xb)
                loss = loss_fn.forward(output, yb)
                self.backward(loss_fn.backward())

                # Only layers that expose update() hold trainable parameters.
                for layer in self.layers:
                    if hasattr(layer, "update"):
                        layer.update(lr)

                # Weight by batch length: the last batch may be shorter.
                total_loss += loss * xb.shape[0]
                preds = np.argmax(output, axis=1)
                if yb.ndim == 1:
                    labels = yb
                else:
                    labels = np.argmax(yb, axis=1)
                correct += np.sum(preds == labels)

            avg_loss = total_loss / n_samples
            acc = correct / n_samples

            print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.6f}  | acc: {acc:.4f}")

In [32]:
class MSELoss:
    """Half mean-squared error: L = 0.5 * mean((predicted - target)^2).

    The mean runs over every element (samples and output columns), and
    ``backward`` returns the exact derivative of that value.
    """

    def forward(self, predicted, True_labels):
        """Returns the loss and caches both arrays for ``backward``.

        ``predicted`` is promoted to 2-D. ``True_labels`` may be a list or an
        array; a scalar or 1-D label array with as many elements as
        ``predicted`` is reshaped to match it, so (N,) labels line up with
        (N, 1) predictions instead of broadcasting to an (N, N) matrix.
        """
        predicted = np.atleast_2d(predicted)
        True_labels = np.asarray(True_labels)
        if True_labels.ndim < 2 and True_labels.size == predicted.size:
            True_labels = True_labels.reshape(predicted.shape)
        self.predicted = predicted
        self.True_labels = True_labels
        return 0.5 * np.mean((predicted - True_labels) ** 2)

    def backward(self):
        """Returns dL/d(predicted) for the most recent ``forward`` call.

        forward() averages over every element, so the derivative of
        0.5 * mean(diff^2) is diff / diff.size. Dividing by the row count
        alone is only correct when there is a single output column.
        """
        diff = self.predicted - self.True_labels
        return diff / diff.size

In [33]:
# class SGD:

#     def __init__(self, lr: float = 0.01):
#         self.lr = lr

#     def step(self, layers):
#         for layer in layers:
#             if hasattr(layer, "w"):
#                 layer.w -= self.lr * layer.dw
#                 layer.b -= self.lr * layer.db

In [34]:
# Scratch input: a 3x5 array drawn uniformly from [-1, 1). No seed is set, so
# the values change on every run.
x = np.random.uniform(-1, 1, (3, 5))
x

array([[ 0.71084982,  0.58914118,  0.57905041, -0.30811098,  0.4095885 ],
       [ 0.35947451,  0.66453414,  0.63998865, -0.79460136,  0.02620583],
       [ 0.83368385, -0.40343996,  0.52606912,  0.37074497, -0.46333073]])

In [35]:
# Scratch targets: three rows over two columns with exactly one 1 per row
# (one-hot style).
y_true = np.array([
    [0, 1],
    [1, 0],
    [1, 0]
])

In [16]:
import tensorflow as tf

# Load the MNIST train/test split through Keras.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten every image to one row and scale to float32 by dividing by 255
# (pixel values are presumably 0-255 - confirm against the Keras docs).
# The sample count is read from the data instead of being hard-coded, and the
# test images get the same treatment so both splits stay comparable.
x_train = x_train.reshape(x_train.shape[0], -1).astype('float32') / 255.0
x_test = x_test.reshape(x_test.shape[0], -1).astype('float32') / 255.0


In [17]:
# Start from an empty layer stack; layers are appended with model.add().
model = Model()

In [18]:
# Network: 784 -> 128 -> 64 -> 10 fully connected layers with LeakyReLU
# (default alpha=0.01) in between. There is no Softmax layer at the end
# because CrossEntropywithSoftmax applies softmax to the logits itself.
# NOTE: model.add() appends, so re-running this cell without re-creating
# `model` stacks the same layers a second time.
model.add(Dense(784, 128))
model.add(LeakyReLU())
model.add(Dense(128, 64))
model.add(LeakyReLU())
model.add(Dense(64, 10))
# NOTE(review): the name `loss` is rebound to a float further down
# (`loss = 0.5 * np.mean(...)`); running the fit cell after that cell would
# pass the float as loss_fn.
loss = CrossEntropywithSoftmax()

In [19]:
# Sanity check: number of training samples (rows of x_train).
x_train.shape[0]

60000

In [20]:
# Mini-batch gradient descent with lr=0.1 and fit()'s default batch_size of 64.
# The printed loss/accuracy are measured on the training batches, not on
# held-out data.
# NOTE(review): the captured output stops partway through epoch 23 of 100, so
# this run was presumably interrupted - re-run before quoting final numbers.
model.fit(x_train, y_train, epochs=100, loss_fn=loss, lr = 0.1)

60000
Epoch 1/100 | Loss: 0.346091  | acc: 0.8990
Epoch 2/100 | Loss: 0.155837  | acc: 0.9536
Epoch 3/100 | Loss: 0.110974  | acc: 0.9671
Epoch 4/100 | Loss: 0.085938  | acc: 0.9738
Epoch 5/100 | Loss: 0.069203  | acc: 0.9790
Epoch 6/100 | Loss: 0.056692  | acc: 0.9825
Epoch 7/100 | Loss: 0.047229  | acc: 0.9865
Epoch 8/100 | Loss: 0.040984  | acc: 0.9878
Epoch 9/100 | Loss: 0.034174  | acc: 0.9897
Epoch 10/100 | Loss: 0.029773  | acc: 0.9911
Epoch 11/100 | Loss: 0.024627  | acc: 0.9928
Epoch 12/100 | Loss: 0.020266  | acc: 0.9947
Epoch 13/100 | Loss: 0.017755  | acc: 0.9951
Epoch 14/100 | Loss: 0.015015  | acc: 0.9959
Epoch 15/100 | Loss: 0.012250  | acc: 0.9971
Epoch 16/100 | Loss: 0.010086  | acc: 0.9978
Epoch 17/100 | Loss: 0.008503  | acc: 0.9985
Epoch 18/100 | Loss: 0.006740  | acc: 0.9988
Epoch 19/100 | Loss: 0.005673  | acc: 0.9991
Epoch 20/100 | Loss: 0.004558  | acc: 0.9994
Epoch 21/100 | Loss: 0.003633  | acc: 0.9997
Epoch 22/100 | Loss: 0.003016  | acc: 0.9999
Epoch 23/100 

In [None]:
import tensorflow as tf
# The Keras layers are imported under aliases: bringing them in as plain
# Dense / ReLU / Softmax would rebind those names and silently replace the
# NumPy classes defined earlier in this notebook for every cell run afterwards.
from tensorflow.keras.layers import (
    Dense as KerasDense,
    ReLU as KerasReLU,
    Softmax as KerasSoftmax,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Load MNIST data
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocess data: flatten each 28x28 image to 784 values and scale by 1/255
x_train = x_train.reshape(-1, 28*28).astype('float32') / 255.0
x_test = x_test.reshape(-1, 28*28).astype('float32') / 255.0

# Reference Keras model, kept commented out.
# NOTE: if re-enabled as written it rebinds `model` and `loss`, which the
# NumPy training cells above also use.

# # Model definition
# model = Sequential([
#     KerasDense(16, input_shape=(784,)),
#     KerasReLU(),
#     KerasDense(16),
#     KerasReLU(),
#     KerasDense(10),
#     KerasSoftmax()
# ])

# # Compile model
# model.compile(
#     optimizer='adam',
#     loss='sparse_categorical_crossentropy',  # integer labels
#     metrics=['accuracy']
# )

# # Train model
# model.fit(
#     x_train, y_train,
#     validation_data=(x_test, y_test),
#     epochs=10,
#     batch_size=32
# )

# # Evaluate model
# loss, accuracy = model.evaluate(x_test, y_test)
# print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")


In [None]:
# Scratch check of the shuffling call used in Model.fit: a random ordering of
# the integers 0..59999.
np.random.permutation(60000)

In [52]:
# Hand-worked example: one sample with two features and one scalar target.
# NOTE: this rebinds `x` and `y_true` from the earlier scratch cells.
x = np.array([[1.0, 2.0]])
y_true = np.array([[1.0]])

In [53]:
# Layer-1 parameters. w1 is laid out as (inputs=2, units=4), so the forward
# pass is x @ w1 - the transpose of the (output_dim, input_dim) layout used by
# Dense.w. The bias starts at zero.
w1 = np.array([[-0.25091976,  0.90142861,  0.46398788,  0.19731697],
       [-0.68796272, -0.68801096, -0.88383278,  0.73235229]])
b1 = np.array([[0, 0, 0, 0]])

In [54]:
# Layer-1 pre-activation: (1, 2) @ (2, 4) + (1, 4) -> (1, 4).
z1 = np.dot(x, w1) + b1
z1

array([[-1.6268452 , -0.47459331, -1.30367768,  1.66202155]])

In [55]:
# ReLU on layer 1: negative pre-activations become 0.
a1 = np.maximum(0, z1)
a1

array([[0.        , 0.        , 0.        , 1.66202155]])

In [56]:
# Layer-2 parameters: (4, 3) weights in the same (inputs, units) layout as w1,
# and a zero bias.
w2 = np.array([[ 0.18722862,  0.38527555, -0.88770502],
       [ 0.87010397,  0.61556416, -0.53264447],
       [-0.58914568, -0.58622094, -0.36247293],
       [ 0.04584   , -0.12601334, -0.38656852]])
b2 = np.array([[0, 0, 0]])

In [57]:
# Layer-2 pre-activation: (1, 4) @ (4, 3) + (1, 3) -> (1, 3).
z2 = np.dot(a1, w2) + b2
z2

array([[ 0.07618707, -0.20943689, -0.64248521]])

In [58]:
# ReLU on layer 2.
a2 = np.maximum(0, z2)
a2

array([[0.07618707, 0.        , 0.        ]])

In [59]:
# Output-layer parameters: (3, 1) weights and a zero bias.
w3 = np.array([[ 0.27398252],
       [-0.88305609],
       [-0.50913955]])
b3 = np.array([[0]])

In [61]:
# Output pre-activation: (1, 3) @ (3, 1) + (1, 1) -> a single logit.
z3 = np.dot(a2, w3) + b3
z3

array([[0.02087392]])

In [70]:
# Sigmoid of the logit - the network's prediction.
a3 = 1 / (1 + np.exp(-z3))
a3

array([[0.50521829]])

In [71]:
# Half mean-squared error, the same formula as MSELoss.forward.
# NOTE: this rebinds `loss`, which earlier held the CrossEntropywithSoftmax
# instance passed to model.fit().
loss = 0.5 * np.mean((a3 - y_true) ** 2)
loss

0.12240446941668841

In [93]:
# dL/da3 for L = 0.5 * mean((a3 - y)^2). There is only one output element
# here, so the 1/N factor from the mean is 1.
loss_wrt_a3 = a3 - y_true
loss_wrt_a3

array([[-0.49478171]])

In [94]:
# Chain rule through the sigmoid: dL/dz3 = dL/da3 * a3 * (1 - a3).
# Despite its name, this variable holds dL/dz3 (the output-layer delta), not
# the local derivative da3/dz3.
a3_wrt_z3 = loss_wrt_a3 * a3 * (1 - a3)
a3_wrt_z3

array([[-0.12368195]])

In [97]:
# Gradient for the output weights: dL/dw3 = a2^T @ dL/dz3, shape (3, 1) like
# w3. Despite its name, this is the weight gradient, not dz3/da2.
z3_wrt_a2 = np.dot(a2.T, a3_wrt_z3)
z3_wrt_a2

array([[-0.00942297],
       [ 0.        ],
       [ 0.        ]])

In [100]:
# Gradient flowing back to layer 2's activations:
# dL/da2 = dL/dz3 @ w3^T, shape (1, 3).
z3_wrt_a2_we = np.dot(a3_wrt_z3, w3.T)
z3_wrt_a2_we

array([[-0.03388669,  0.1092181 ,  0.06297137]])

In [103]:
# Chain rule through layer 2's ReLU: dL/dz2 = dL/da2 where z2 > 0, else 0.
# Despite its name, this variable holds dL/dz2 (the layer-2 delta).
a2_wrt_z2 = np.where(z2>0, 1, 0) * z3_wrt_a2_we
a2_wrt_z2

array([[-0.03388669,  0.        ,  0.        ]])

In [106]:
# Gradient for the layer-2 weights: dL/dw2 = a1^T @ dL/dz2, shape (4, 3) like
# w2. Despite its name, this is the weight gradient, not dz2/da1.
z2_wrt_a1 = np.dot(a1.T, a2_wrt_z2)
z2_wrt_a1

array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [-0.05632041,  0.        ,  0.        ]])

In [108]:
# Gradient flowing back to layer 1's activations:
# dL/da1 = dL/dz2 @ w2^T, shape (1, 4).
z2_wrt_a1_we = np.dot(a2_wrt_z2, w2.T)
z2_wrt_a1_we

array([[-0.00634456, -0.02948495,  0.0199642 , -0.00155337]])

In [110]:
# Chain rule through layer 1's ReLU: dL/dz1 = dL/da1 where z1 > 0, else 0.
# Despite its name, this variable holds dL/dz1 (the layer-1 delta).
a1_wrt_z1 = np.where(z1>0, 1, 0) * z2_wrt_a1_we
a1_wrt_z1

array([[-0.        , -0.        ,  0.        , -0.00155337]])

In [112]:
# Gradient for the layer-1 weights: dL/dw1 = x^T @ dL/dz1, shape (2, 4) like w1.
z1_wrt_w1 = np.dot(x.T, a1_wrt_z1)
z1_wrt_w1

array([[ 0.        ,  0.        ,  0.        , -0.00155337],
       [ 0.        ,  0.        ,  0.        , -0.00310673]])