# Step 1: Define the required functions

In [2]:
import numpy as np

# Sigmoid activation and its derivative
def sigmoid(Z):
    """Numerically stable logistic sigmoid, ``1 / (1 + exp(-Z))``, element-wise.

    The textbook formula evaluates ``exp(-Z)``, which overflows for large
    negative ``Z``. Here the exponential is only ever taken of a non-positive
    number, so it always stays within [0, 1].

    Args:
        Z: Scalar or array-like of pre-activations.

    Returns:
        The sigmoid of ``Z``, with the same shape as ``Z`` (a NumPy scalar for
        scalar input).
    """
    Z = np.asarray(Z)
    decay = np.exp(-np.abs(Z))  # exponent is <= 0, so this cannot overflow
    # Z >= 0: 1 / (1 + e^-Z);  Z < 0: e^Z / (1 + e^Z). Both are the same function.
    result = np.where(Z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return result[()]  # unwraps a 0-d result to a scalar; no-op for real arrays

def sigmoid_derivative(Z):
    """Derivative of the sigmoid at ``Z``: ``s * (1 - s)`` with ``s = sigmoid(Z)``."""
    activation = sigmoid(Z)
    complement = 1 - activation
    return activation * complement

# ReLU activation and its derivative
def relu(Z):
    """Rectified linear unit: the element-wise maximum of 0 and ``Z``."""
    rectified = np.maximum(0, Z)
    return rectified

def relu_derivative(Z):
    """Derivative of ReLU: 1.0 where ``Z > 0`` and 0.0 elsewhere (including ``Z == 0``).

    Args:
        Z: Scalar or array-like of pre-activations. The input is converted with
            ``np.asarray`` so plain Python numbers and lists are accepted, the
            same as ``relu`` accepts them.

    Returns:
        Float values of the same shape as ``Z``.
    """
    is_positive = np.asarray(Z) > 0
    return is_positive.astype(float)

# Loss function (Binary Cross-Entropy)
def binary_cross_entropy_loss(y, y_hat, eps=1e-12):
    """Mean binary cross-entropy between labels ``y`` and predictions ``y_hat``.

    Args:
        y: Array-like of 0/1 labels; its first dimension is the number of examples.
        y_hat: Array-like of predicted probabilities, broadcastable against ``y``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking logs, so
            a prediction of exactly 0 or 1 gives a large finite loss instead of
            ``inf`` or ``nan``.

    Returns:
        The summed cross-entropy divided by ``y.shape[0]``.
    """
    y = np.asarray(y)
    y_hat = np.clip(y_hat, eps, 1 - eps)  # keep both log arguments strictly positive
    m = y.shape[0]
    return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / m

def binary_cross_entropy_loss_derivative(y, y_hat, eps=1e-12):
    """Per-example derivative of binary cross-entropy with respect to ``y_hat``.

    Computes ``-(y / y_hat) + (1 - y) / (1 - y_hat)`` element-wise. The result
    is not averaged over the batch here.

    Args:
        y: Array-like of 0/1 labels.
        y_hat: Array-like of predicted probabilities, broadcastable against ``y``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` so that neither
            denominator can be zero.

    Returns:
        Array of derivatives with the broadcast shape of ``y`` and ``y_hat``.
    """
    y = np.asarray(y)
    y_hat = np.clip(y_hat, eps, 1 - eps)  # avoid 0/0 and x/0 at saturated outputs
    return -(y / y_hat) + ((1 - y) / (1 - y_hat))


# Define Architecture


# Neural Network Architecture Overview

| Layer           | Number of Neurons |
|------------------|-------------------|
| Input Layer      | 2                |
| Hidden Layer 1   | 3                |
| Hidden Layer 2   | 2                |
| Output Layer     | 1                |

## Total Number of Parameters
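To compute the total number of trainable parameters, add up the weights and biases of every layer:
- **Weights**:
  - Layer 1: $2 \times 3 = 6$
  - Layer 2: $3 \times 2 = 6$
  - Layer 3: $2 \times 1 = 2$
- **Biases**:
  - Layer 1: $3$
  - Layer 2: $2$
  - Layer 3: $1$

**Total**: $(6 + 6 + 2) + (3 + 2 + 1) = 20$ trainable parameters.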


# Step 2: Initialize weights and biases

In [3]:
# Initialize parameters (weights and biases)
def initialize_parameters(input_size, hidden1_size, hidden2_size, output_size, seed=42):
    """Create small random weights and zero biases for the three-layer network.

    Args:
        input_size: Number of input features.
        hidden1_size: Number of units in the first hidden layer.
        hidden2_size: Number of units in the second hidden layer.
        output_size: Number of output units.
        seed: Value passed to ``np.random.seed`` before drawing the weights.
            Note that this reseeds NumPy's *global* generator. Pass ``None`` to
            leave the global generator untouched and draw from its current state.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)``. Each ``W`` has shape
        ``(fan_in, fan_out)`` and holds standard-normal draws scaled by 0.01;
        each ``b`` has shape ``(1, fan_out)`` and is all zeros.
    """
    if seed is not None:
        np.random.seed(seed)  # for reproducibility

    weight_scale = 0.01  # keep initial pre-activations close to zero
    W1 = np.random.randn(input_size, hidden1_size) * weight_scale
    b1 = np.zeros((1, hidden1_size))
    W2 = np.random.randn(hidden1_size, hidden2_size) * weight_scale
    b2 = np.zeros((1, hidden2_size))
    W3 = np.random.randn(hidden2_size, output_size) * weight_scale
    b3 = np.zeros((1, output_size))
    return W1, b1, W2, b2, W3, b3


# Step 3: Forward propagation

In [4]:
def forward_propagation(X, W1, b1, W2, b2, W3, b3):
    """Run one forward pass: two ReLU hidden layers followed by a sigmoid output.

    Returns:
        ``(y_hat, cache)`` where ``cache`` is the tuple
        ``(Z1, H1, Z2, H2, Z3, y_hat)`` of every pre-activation and activation.
        The cache is kept so the backward pass can reuse these values instead
        of recomputing them.
    """
    # Hidden layer 1: affine transform, then ReLU.
    pre_act1 = np.dot(X, W1) + b1
    act1 = relu(pre_act1)

    # Hidden layer 2: affine transform, then ReLU.
    pre_act2 = np.dot(act1, W2) + b2
    act2 = relu(pre_act2)

    # Output layer: sigmoid turns the score into a probability for binary classification.
    pre_act3 = np.dot(act2, W3) + b3
    prediction = sigmoid(pre_act3)

    return prediction, (pre_act1, act1, pre_act2, act2, pre_act3, prediction)


# Step 4: Backward propagation

In [5]:
def backward_propagation(X, y, cache, W1, W2, W3):
    """Backpropagate the binary cross-entropy loss through the three layers.

    Args:
        X: Input batch, one example per row.
        y: Labels; ``y.shape[0]`` is used as the batch size.
        cache: Tuple ``(Z1, H1, Z2, H2, Z3, y_hat)`` from the forward pass.
        W1: Accepted for signature symmetry; not needed by the computation.
        W2: Weights of hidden layer 2.
        W3: Weights of the output layer.

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)`` of batch-averaged gradients.
    """
    Z1, H1, Z2, H2, _Z3, y_hat = cache
    m = y.shape[0]

    # Output layer gradients.
    # By the chain rule dL/dZ3 = dL/dy_hat * dy_hat/dZ3, which for a sigmoid
    # output with cross-entropy loss simplifies to (y_hat - y). Multiplying the
    # two factors explicitly yields inf * 0 = nan once y_hat saturates at 0 or 1,
    # so the simplified form is used.
    dZ3 = y_hat - y  # dL/dZ3
    dW3 = np.dot(H2.T, dZ3) / m  # dL/dW3
    db3 = np.sum(dZ3, axis=0, keepdims=True) / m  # dL/db3, averaged over the batch

    # Hidden layer 2 gradients.
    dH2 = np.dot(dZ3, W3.T)  # dL/dH2 (dZ3/dH2 = W3)
    dZ2 = dH2 * (Z2 > 0)  # dL/dZ2; the ReLU derivative is 1 where Z2 > 0, else 0
    dW2 = np.dot(H1.T, dZ2) / m  # dL/dW2
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m  # dL/db2

    # Hidden layer 1 gradients.
    dH1 = np.dot(dZ2, W2.T)  # dL/dH1 (dZ2/dH1 = W2)
    dZ1 = dH1 * (Z1 > 0)  # dL/dZ1
    dW1 = np.dot(X.T, dZ1) / m  # dL/dW1
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m  # dL/db1

    gradients = (dW1, db1, dW2, db2, dW3, db3)
    return gradients


# Step 5: Update weights and biases

In [6]:
def update_parameters(W1, b1, W2, b2, W3, b3, gradients, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    ``gradients`` is the 6-tuple ``(dW1, db1, dW2, db2, dW3, db3)``. Each
    parameter is updated with ``-=`` (in place for NumPy arrays) and the
    updated parameters are returned in the order they were passed.
    """
    dW1, db1, dW2, db2, dW3, db3 = gradients
    pairs = ((W1, dW1), (b1, db1), (W2, dW2), (b2, db2), (W3, dW3), (b3, db3))

    stepped = []
    for value, slope in pairs:
        value -= learning_rate * slope
        stepped.append(value)
    return tuple(stepped)


# Step 6: Training the model

In [7]:
def train(X, y, input_size, hidden1_size, hidden2_size, output_size, learning_rate, num_epochs):
    """Train the network with full-batch gradient descent.

    Every epoch runs a forward pass on all of ``X``, evaluates the binary
    cross-entropy loss, backpropagates, and applies one parameter update.
    The loss is printed every 100 epochs.

    Returns:
        The trained parameters ``(W1, b1, W2, b2, W3, b3)``.
    """
    params = initialize_parameters(input_size, hidden1_size, hidden2_size, output_size)

    for epoch in range(num_epochs):
        W1, _, W2, _, W3, _ = params

        # Forward pass and loss for the current parameters.
        y_hat, cache = forward_propagation(X, *params)
        loss = binary_cross_entropy_loss(y, y_hat)

        # Backward pass followed by the gradient-descent step.
        gradients = backward_propagation(X, y, cache, W1, W2, W3)
        params = update_parameters(*params, gradients, learning_rate)

        # Report progress every 100 epochs.
        if not epoch % 100:
            print(f"Epoch {epoch}, Loss: {loss}")

    W1, b1, W2, b2, W3, b3 = params
    return W1, b1, W2, b2, W3, b3


# Example Usage

In [8]:
# Example inputs
# Seed the global generator before drawing the toy data so that a fresh
# Restart & Run All produces the same X and y every time.
np.random.seed(0)
X = np.random.rand(100, 3)  # 100 examples, 3 features
y = np.random.randint(0, 2, size=(100, 1))  # Binary labels (0 or 1)

# Hyperparameters
input_size = X.shape[1]  # taken from the data so the two cannot drift apart
hidden1_size = 4
hidden2_size = 4
output_size = 1
learning_rate = 0.01
num_epochs = 1000

# Train the model
train(X, y, input_size, hidden1_size, hidden2_size, output_size, learning_rate, num_epochs)


Epoch 0, Loss: 0.693147193959094
Epoch 100, Loss: 0.6918868361872207
Epoch 200, Loss: 0.6911227604659338
Epoch 300, Loss: 0.6906593902962515
Epoch 400, Loss: 0.690378271957255
Epoch 500, Loss: 0.6902076564962265
Epoch 600, Loss: 0.6901040706560907
Epoch 700, Loss: 0.6900411613590189
Epoch 800, Loss: 0.6900029457981876
Epoch 900, Loss: 0.6899797260751569


(array([[ 0.0049661 , -0.00137855,  0.00647645,  0.01523046],
        [-0.00234047, -0.002331  ,  0.01579848,  0.00767192],
        [-0.00469609,  0.00543673, -0.00464249, -0.00465411]]),
 array([[ 4.28122513e-06,  1.69158433e-05, -6.86745155e-07,
          3.70247739e-07]]),
 array([[ 0.00241785, -0.01913296, -0.01724918, -0.00562288],
        [-0.01013082,  0.00314415, -0.00908024, -0.01412304],
        [ 0.01466473, -0.00226448,  0.00067528, -0.01424748],
        [-0.00543889,  0.00110547, -0.01150994,  0.00375698]]),
 array([[-6.90029424e-05,  9.85695080e-06,  0.00000000e+00,
          0.00000000e+00]]),
 array([[-0.00602591],
        [-0.00292356],
        [-0.00601707],
        [ 0.01852278]]),
 array([[0.14707431]]))

In [9]:
import numpy as np

# Initialize parameters (weights and biases)
def initialize_parameters(input_size, hidden1_size, hidden2_size, output_size, seed=42):
    """Build the initial weights and biases for the 3-layer network.

    Args:
        input_size: Number of input features.
        hidden1_size: Width of the first hidden layer.
        hidden2_size: Width of the second hidden layer.
        output_size: Number of output units.
        seed: Passed to ``np.random.seed`` (reseeding NumPy's global generator)
            before any weights are drawn. ``None`` skips reseeding.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)``: weights are standard-normal draws
        scaled by 0.01 with shape ``(fan_in, fan_out)``; biases are zeros with
        shape ``(1, fan_out)``.
    """
    if seed is not None:
        np.random.seed(seed)  # for reproducibility

    layer_sizes = (input_size, hidden1_size, hidden2_size, output_size)
    params = []
    # Walk consecutive layer pairs; weights are drawn in order W1, W2, W3.
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        params.append(np.random.randn(fan_in, fan_out) * 0.01)
        params.append(np.zeros((1, fan_out)))
    return tuple(params)

# Sigmoid activation function
def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, computed without overflow.

    Evaluating ``exp(-z)`` directly overflows for large negative ``z``. This
    version only exponentiates ``-|z|`` and picks the algebraically equivalent
    branch for each sign of ``z``.

    Args:
        z: Scalar or array-like of pre-activations.

    Returns:
        Sigmoid of ``z`` with the same shape as ``z`` (a NumPy scalar for
        scalar input).
    """
    z = np.asarray(z)
    shrink = np.exp(-np.abs(z))  # argument is never positive, so no overflow
    positive_branch = 1.0 / (1.0 + shrink)  # used where z >= 0
    negative_branch = shrink / (1.0 + shrink)  # used where z < 0
    return np.where(z >= 0, positive_branch, negative_branch)[()]

# Forward propagation
def forward_propagation(X, W1, b1, W2, b2, W3, b3):
    """Forward pass through two ReLU hidden layers and a sigmoid output layer.

    Returns:
        Tuple ``(Z1, A1, Z2, A2, Z3, A3)`` holding each layer's pre-activation
        ``Z`` and activation ``A``; ``A3`` is the network output.
    """
    # Hidden layer 1 (ReLU)
    pre1 = np.dot(X, W1) + b1
    out1 = np.maximum(0, pre1)

    # Hidden layer 2 (ReLU)
    pre2 = np.dot(out1, W2) + b2
    out2 = np.maximum(0, pre2)

    # Output layer (sigmoid)
    pre3 = np.dot(out2, W3) + b3
    out3 = sigmoid(pre3)

    return pre1, out1, pre2, out2, pre3, out3

# Backward propagation
def backward_propagation(X, y, Z1, A1, Z2, A2, Z3, A3, W1, W2, W3):
    """Compute batch-averaged gradients for every weight and bias.

    The output-layer error is taken directly as ``A3 - y``. ``Z3`` and ``W1``
    are accepted but not used by the computation.

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)``.
    """
    batch = X.shape[0]

    # Output layer
    err3 = A3 - y
    grad_W3 = A2.T.dot(err3) / batch
    grad_b3 = err3.sum(axis=0, keepdims=True) / batch

    # Hidden layer 2: push the error back through W3, then gate by the ReLU mask.
    err2 = err3.dot(W3.T) * (Z2 > 0)
    grad_W2 = A1.T.dot(err2) / batch
    grad_b2 = err2.sum(axis=0, keepdims=True) / batch

    # Hidden layer 1: same pattern through W2.
    err1 = err2.dot(W2.T) * (Z1 > 0)
    grad_W1 = X.T.dot(err1) / batch
    grad_b1 = err1.sum(axis=0, keepdims=True) / batch

    return grad_W1, grad_b1, grad_W2, grad_b2, grad_W3, grad_b3

# Compute binary cross-entropy loss
def compute_loss(y, A3):
    """Mean binary cross-entropy between labels ``y`` and predictions ``A3``.

    A constant ``1e-8`` is added inside each logarithm so that predictions of
    exactly 0 or 1 do not produce ``log(0)``.

    Args:
        y: Array-like of 0/1 labels.
        A3: Array-like of predicted probabilities, broadcastable against ``y``.

    Returns:
        The mean loss over all elements.
    """
    y = np.asarray(y)
    A3 = np.asarray(A3)
    eps = 1e-8  # small value for numerical stability
    per_element = y * np.log(A3 + eps) + (1 - y) * np.log(1 - A3 + eps)
    return -np.mean(per_element)

# Compute accuracy
def compute_accuracy(y, A3):
    """Fraction of predictions that match ``y`` after thresholding ``A3`` at 0.5.

    Outputs of 0.5 or more count as the positive class.
    """
    is_correct = (A3 >= 0.5) == y
    return np.mean(is_correct)

# Training function
def train(X, y, input_size, hidden1_size, hidden2_size, output_size, learning_rate, num_epochs):
    """Train the network with full-batch gradient descent, logging loss and accuracy.

    Each epoch runs a forward pass on all of ``X``, measures loss and accuracy
    on those outputs, backpropagates, and updates every parameter in place.
    Metrics are printed every 100 epochs and a completion message at the end.

    Returns:
        The trained parameters ``(W1, b1, W2, b2, W3, b3)``.
    """
    params = initialize_parameters(input_size, hidden1_size, hidden2_size, output_size)

    for epoch in range(num_epochs):
        W1, b1, W2, b2, W3, b3 = params

        # Forward pass
        Z1, A1, Z2, A2, Z3, A3 = forward_propagation(X, W1, b1, W2, b2, W3, b3)

        # Metrics are computed from the pre-update outputs
        loss = compute_loss(y, A3)
        accuracy = compute_accuracy(y, A3)

        # Backward pass
        grads = backward_propagation(X, y, Z1, A1, Z2, A2, Z3, A3, W1, W2, W3)

        # Gradient-descent step; the arrays are modified in place
        for param, grad in zip(params, grads):
            param -= learning_rate * grad

        # Report every 100 epochs
        if not epoch % 100:
            print(f"Epoch {epoch}: Loss = {loss:.4f}, Accuracy = {accuracy:.4f}")

    print("Training complete.")
    W1, b1, W2, b2, W3, b3 = params
    return W1, b1, W2, b2, W3, b3

# Hyperparameters: layer widths first, then the optimisation settings
input_size, hidden1_size, hidden2_size, output_size = 3, 4, 4, 1
learning_rate, num_epochs = 0.01, 1000

# Toy data drawn from NumPy's global random generator:
# 100 examples with 3 features each, and one binary label (0 or 1) per example
X = np.random.rand(100, 3)
y = np.random.randint(0, 2, size=(100, 1))

# Train the model; the returned parameters are the cell's displayed value
train(X, y, input_size, hidden1_size, hidden2_size, output_size, learning_rate, num_epochs)


Epoch 0: Loss = 0.6931, Accuracy = 0.4900
Epoch 100: Loss = 0.6928, Accuracy = 0.5200
Epoch 200: Loss = 0.6926, Accuracy = 0.5200
Epoch 300: Loss = 0.6925, Accuracy = 0.5200
Epoch 400: Loss = 0.6925, Accuracy = 0.5200
Epoch 500: Loss = 0.6924, Accuracy = 0.5200
Epoch 600: Loss = 0.6924, Accuracy = 0.5200
Epoch 700: Loss = 0.6924, Accuracy = 0.5200
Epoch 800: Loss = 0.6924, Accuracy = 0.5200
Epoch 900: Loss = 0.6924, Accuracy = 0.5200
Training complete.


(array([[ 0.00496819, -0.00137718,  0.00648758,  0.01522632],
        [-0.00234026, -0.00233054,  0.01579971,  0.00767151],
        [-0.00469466,  0.00544062, -0.0046459 , -0.00465302]]),
 array([[ 2.03017520e-06,  2.33134993e-05,  2.72688074e-07,
         -1.13023681e-07]]),
 array([[ 0.00242039, -0.0191328 , -0.01724918, -0.00562288],
        [-0.01013315,  0.00314249, -0.00908024, -0.01412304],
        [ 0.01467303, -0.00225814,  0.00067528, -0.01424748],
        [-0.00542515,  0.00110854, -0.01150994,  0.00375698]]),
 array([[ 1.63231662e-05, -7.30879115e-06,  0.00000000e+00,
          0.00000000e+00]]),
 array([[-0.00603827],
        [-0.002917  ],
        [-0.00601707],
        [ 0.01852278]]),
 array([[0.07347499]]))

# Backward Propagation with Memoization

In [None]:
def backward_propagation(y, cache, W1, W2, W3):
    """Backpropagate binary cross-entropy using values memoized in ``cache``.

    Args:
        y: Labels; ``y.shape[0]`` is used as the number of examples.
        cache: Dict from the forward pass. The keys read here are ``"X"``,
            ``"Z1"``, ``"H1"``, ``"Z2"``, ``"H2"`` and ``"y_hat"``.
        W1: Accepted for signature symmetry; not needed by the computation.
        W2: Weights of hidden layer 2.
        W3: Weights of the output layer.

    Returns:
        Dict with keys ``"dW1"``, ``"db1"``, ``"dW2"``, ``"db2"``, ``"dW3"``,
        ``"db3"`` holding the batch-averaged gradients.
    """
    # Retrieve cached values
    X = cache["X"]
    Z1 = cache["Z1"]
    H1 = cache["H1"]
    Z2 = cache["Z2"]
    H2 = cache["H2"]
    y_hat = cache["y_hat"]

    m = y.shape[0]  # Number of examples

    # Gradients for Output Layer
    # dL/dZ3 = dL/dy_hat * dy_hat/dZ3, which for sigmoid + cross-entropy reduces
    # to (y_hat - y). The explicit product evaluates to inf * 0 = nan once y_hat
    # saturates at 0 or 1, so the reduced form is used.
    dZ3 = y_hat - y                                            # dL/dZ3
    dW3 = np.dot(H2.T, dZ3) / m                                # dL/dW3
    db3 = np.sum(dZ3, axis=0, keepdims=True) / m               # dL/db3

    # Gradients for Hidden Layer 2
    dH2 = np.dot(dZ3, W3.T)                                    # dL/dH2 (dZ3/dH2 = W3)
    dZ2 = dH2 * (Z2 > 0)                                       # dL/dZ2 (ReLU derivative mask)
    dW2 = np.dot(H1.T, dZ2) / m                                # dL/dW2
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m               # dL/db2

    # Gradients for Hidden Layer 1
    dH1 = np.dot(dZ2, W2.T)                                    # dL/dH1 (dZ2/dH1 = W2)
    dZ1 = dH1 * (Z1 > 0)                                       # dL/dZ1 (ReLU derivative mask)
    dW1 = np.dot(X.T, dZ1) / m                                 # dL/dW1
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m               # dL/db1

    # Store gradients in a dictionary
    gradients = {
        "dW1": dW1,
        "db1": db1,
        "dW2": dW2,
        "db2": db2,
        "dW3": dW3,
        "db3": db3
    }
    return gradients


In [None]:
# Build everything this cell needs instead of relying on leftover kernel state:
# the parameters returned by earlier training runs were never assigned to names,
# and no earlier cell produces a dict-style cache containing "X".
W1, b1, W2, b2, W3, b3 = initialize_parameters(input_size, hidden1_size, hidden2_size, output_size)

# forward_propagation (as defined in the cell above that also defines
# compute_loss) returns the six intermediate arrays as a flat tuple.
Z1, H1, Z2, H2, Z3, y_hat = forward_propagation(X, W1, b1, W2, b2, W3, b3)
cache = {"X": X, "Z1": Z1, "H1": H1, "Z2": Z2, "H2": H2, "Z3": Z3, "y_hat": y_hat}

gradients = backward_propagation(y, cache, W1, W2, W3)
print("Gradients for W1:", gradients["dW1"])
print("Gradients for b1:", gradients["db1"])


In [None]:
def update_parameters(W1, b1, W2, b2, W3, b3, gradients, learning_rate):
    """Apply one gradient-descent step using a dictionary of gradients.

    ``gradients`` must provide ``"dW1"``, ``"db1"``, ``"dW2"``, ``"db2"``,
    ``"dW3"`` and ``"db3"``. Each parameter is updated with ``-=`` (in place
    for NumPy arrays) and the updated parameters are returned in the order
    they were passed.
    """
    # Look up every gradient first so a missing key fails before anything changes.
    keys = ("dW1", "db1", "dW2", "db2", "dW3", "db3")
    slopes = [gradients[key] for key in keys]

    # Update parameters
    stepped = []
    for value, slope in zip((W1, b1, W2, b2, W3, b3), slopes):
        value -= learning_rate * slope
        stepped.append(value)
    return tuple(stepped)
