## Understanding the Neural Network
A neural network consists of:

* Layers: Each layer has weights, biases, and an activation function.
* Forward Propagation: Computes the output by passing input through the layers.
* Loss Function: Measures the error between predicted and true outputs (e.g., cross-entropy for classification, MSE for regression).
* Backpropagation: Computes gradients of the loss with respect to weights and biases.
* Gradient Descent: Updates weights and biases to minimize the loss.

We’ll implement a fully connected (dense) neural network with customizable layers and activation functions.

---
## Implementing the Code
File 1: `layers.py`
This file defines a Layer class to represent a single layer in the network.

### Explanation:

The Layer class initializes weights with small random values and biases with zeros.
* forward: Computes the output of the layer (linear transformation + activation).
* backward: Computes gradients using the chain rule, updates weights/biases with a gradient-descent step, and returns the gradient with respect to the layer input.
* Activation functions are temporarily here; we’ll move them to activations.py.

In [None]:
import numpy as np

class Layer:
    """A fully connected (dense) layer with an optional activation.

    The layer computes ``a = activation(X @ weights + biases)`` in
    ``forward`` and caches the input, pre-activation and post-activation
    values so that ``backward`` can compute gradients and apply a
    gradient-descent update in place.

    Args:
        input_size: Number of input features.
        output_size: Number of units in the layer.
        activation: ``'relu'``, ``'sigmoid'``, ``'softmax'`` or ``None``.
            Any other value is treated as "no activation" (identity).
    """

    def __init__(self, input_size, output_size, activation=None):
        # Weights: standard-normal samples scaled by 0.01; biases start at zero.
        self.weights = np.random.randn(input_size, output_size) * 0.01
        self.biases = np.zeros((1, output_size))
        self.activation = activation  # Activation name (e.g., 'relu', 'sigmoid', 'softmax')
        # Intermediate values cached by forward() for use in backward()
        self.input = None
        self.z = None  # Pre-activation output
        self.a = None  # Post-activation output

    def forward(self, X):
        """Run the forward pass and cache the intermediates.

        Args:
            X: Input of shape (batch_size, input_size).

        Returns:
            The post-activation output ``self.a``.
        """
        self.input = X
        self.z = np.dot(X, self.weights) + self.biases  # Linear transformation
        if self.activation == 'relu':
            self.a = relu(self.z)
        elif self.activation == 'sigmoid':
            self.a = sigmoid(self.z)
        elif self.activation == 'softmax':
            self.a = softmax(self.z)
        else:
            self.a = self.z  # No activation
        return self.a

    def backward(self, delta, learning_rate):
        """Back-propagate ``delta`` through the layer and update its parameters.

        Args:
            delta: Gradient of the loss with respect to this layer's output,
                as passed back from the next layer.
            learning_rate: Step size for the gradient-descent update.

        Returns:
            Gradient of the loss with respect to this layer's input, to be
            passed to the previous layer.

        Raises:
            RuntimeError: If called before ``forward`` has cached an input.
        """
        if self.input is None:
            # Without this guard the failure is an opaque AttributeError on None.T
            raise RuntimeError(
                "Layer.backward() called before forward(); no cached input available"
            )

        # Turn the gradient w.r.t. the activation into the gradient w.r.t. z
        if self.activation == 'relu':
            delta = delta * relu_derivative(self.z)
        elif self.activation == 'sigmoid':
            delta = delta * sigmoid_derivative(self.z)
        elif self.activation == 'softmax':
            # Softmax derivative is handled in the loss function (cross-entropy),
            # so delta is used unchanged here.
            pass

        # Compute gradients
        dW = np.dot(self.input.T, delta)  # Gradient w.r.t weights
        db = np.sum(delta, axis=0, keepdims=True)  # Gradient w.r.t biases
        # dX must use the weights from the forward pass, so compute it before the update
        dX = np.dot(delta, self.weights.T)  # Gradient w.r.t input (for previous layer)

        # Update weights and biases
        self.weights -= learning_rate * dW
        self.biases -= learning_rate * db
        return dX

# Activation functions (will be moved to activations.py later)
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def relu_derivative(x):
    """Derivative of ReLU: 1 where ``x`` is strictly positive, 0 elsewhere."""
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, computed without overflow.

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. Here only ``exp(-|x|)`` is
    evaluated; it always lies in (0, 1], so it cannot overflow.

    Args:
        x: Scalar or NumPy array of pre-activation values.

    Returns:
        Element-wise sigmoid of ``x``.
    """
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    numerator = np.where(x >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

def sigmoid_derivative(x):
    """Derivative of the sigmoid at ``x``, i.e. ``sigmoid(x) * (1 - sigmoid(x))``."""
    activated = sigmoid(x)
    complement = 1 - activated
    return activated * complement

def softmax(x, axis=1):
    """Numerically stable softmax along ``axis``.

    Args:
        x: NumPy array of scores.
        axis: Axis along which the outputs sum to 1. Defaults to 1, i.e. one
            distribution per row of a 2-D (batch, classes) array; pass
            ``axis=0`` for a 1-D vector.

    Returns:
        Array of the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    # Subtracting the per-slice max leaves the result unchanged but keeps
    # every exponent <= 0, so np.exp cannot overflow.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

In [21]:
import numpy as np
# Demo: array shapes in a dense layer's linear step. Floor-dividing the
# standard-normal samples by 2 yields small whole numbers that are easy to
# check by hand.
weights = np.random.randn(5, 3) // 2  # (input_size, output_size)
bias = np.ones(shape=(1, 3))  # single row, broadcast across the batch
X = np.random.randn(10, 5) // 2  # (batch_size, input_size)
z = X @ weights + bias  # Linear transformation

# Print each array followed by its shape on the next line.
print(f"weights: {weights}", weights.shape, sep="\n")
print(f"\nbias: {bias}", bias.shape, sep="\n")
print(f"\nX: {X}", X.shape, sep="\n")
print(f"\nz: {z}", z.shape, sep="\n")

weights: [[ 0. -1. -1.]
 [-1.  0.  0.]
 [-1. -1.  0.]
 [-1. -1.  0.]
 [-1.  0. -1.]]
(5, 3)

bias: [[1. 1. 1.]]
(1, 3)

X: [[ 0.  0. -1. -1.  1.]
 [-1.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0. -1. -1.  0.]
 [ 0.  0. -1.  0. -2.]
 [ 0.  0.  0.  0.  0.]
 [-1.  0. -1. -1.  0.]
 [ 0. -1. -1. -1. -1.]
 [ 0. -2.  0.  0.  0.]
 [-1.  0. -1.  0. -1.]]
(10, 5)

z: [[2. 3. 0.]
 [2. 3. 2.]
 [1. 1. 1.]
 [3. 3. 1.]
 [4. 2. 3.]
 [1. 1. 1.]
 [3. 4. 2.]
 [5. 3. 2.]
 [3. 1. 1.]
 [3. 3. 3.]]
(10, 3)


In [24]:
def relu_derivative(z):
    """Return 1 for every strictly positive entry of ``z`` and 0 otherwise."""
    active = z > 0
    return np.where(active, 1, 0)

# Demo: the ReLU derivative acts as a 0/1 mask on the incoming gradient.
z = np.array([-1, 0, 1])
delta = np.array([1, 2, 3])
delta *= relu_derivative(z)  # only entries where z > 0 keep their gradient
print(relu_derivative(z), delta, sep="\n")

# Bias gradient: sum the masked gradient along axis 0, keeping that axis.
db = delta.sum(axis=0, keepdims=True)
print(db)

[0 0 1]
[0 0 3]
[3]


In [7]:
import numpy as np
# Demo: subtracting each row's maximum makes the largest entry 0 and all
# others negative (the same shift softmax applies before exponentiating).
x = np.array([[1, 2, 3]])
x - x.max(axis=1, keepdims=True)

array([[-2, -1,  0]])