<a href="https://colab.research.google.com/github/ronbalanay/MAT-422/blob/main/MAT422_HW_3_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

3.7.1 Mathematical formulation

We initialize the weights and biases of each layer and propagate the input through the network. At layer $l$ we first compute the pre-activation $z^{(l)} = W^{(l)} a^{(l-1)} + b^{(l)}$, then apply the activation function to obtain the layer's output $a^{(l)} = \sigma(z^{(l)})$. That output then plays the role of $a^{(l-1)}$ for the next layer, and the process repeats layer by layer until the output layer is reached.

In [None]:
import numpy as np

# Define the activation function (sigmoid)
def activation_function(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), applied element-wise.

    The value is computed from exp(-|z|), which never exceeds 1, so large
    negative inputs no longer overflow inside ``np.exp``.

    Parameters:
        z: scalar or array-like of pre-activations.

    Returns:
        Sigmoid of ``z`` with the same shape as ``z`` (a NumPy scalar for
        scalar input).
    """
    z = np.asarray(z)
    # exp(-|z|) lies in [0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    # Both branches are the same function and share the denominator 1 + decay.
    return np.where(z >= 0, 1, decay) / (1 + decay)

# Architecture of the example network: 2 inputs -> 3 hidden units -> 1 output.
# The parameters are random draws used only for this sample calculation.
layer_sizes = [2, 3, 1]
np.random.seed(0)  # Fixed seed so the printed numbers are reproducible

# W^(l) has shape (units in layer l, units in layer l-1) and b^(l) is a column
# vector with one entry per unit of layer l. All weight matrices are drawn
# before any bias vector, so the random stream is consumed in a fixed order.
weights = [
    np.random.randn(n_out, n_in)
    for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
]
biases = [np.random.randn(n_out, 1) for n_out in layer_sizes[1:]]

# Network input a^(0), stored as a 2x1 column vector
a_prev = np.array([[0.5], [0.8]])

# Propagate the input through the network one layer at a time
for l, (W, b) in enumerate(zip(weights, biases), start=1):
    z = np.dot(W, a_prev) + b   # z^(l) = W^(l) a^(l-1) + b^(l)
    a = activation_function(z)  # a^(l) = sigma(z^(l))
    print(f"Layer {l}:")
    print(f"z^{l} =\n{z}")
    print(f"a^{l} =\n{a}\n")
    a_prev = a  # This layer's output is the next layer's input


Layer 1:
z^1 =
[[1.61275044]
 [2.42612712]
 [1.6062302 ]]
a^1 =
[[0.8337929 ]
 [0.91879805]
 [0.83288734]]

Layer 2:
z^2 =
[[1.32817832]]
a^2 =
[[0.79053915]]



3.7.2. Activation functions
  
Step Function:

    Outputs 1 for values ≥ 0 and 0 otherwise.

ReLU:

    Outputs input values if they are positive, otherwise 0.

Sigmoid:

    Maps inputs to values between 0 and 1, useful for binary probabilities.

Softmax:

    Normalizes the input to a probability distribution, summing up to 1.

In [None]:
import numpy as np

# Step (Heaviside) function
def step_function(x):
    """Return 1 where ``x`` is greater than or equal to 0, and 0 elsewhere."""
    is_nonnegative = x >= 0
    return np.where(is_nonnegative, 1, 0)

# ReLU (Rectified Linear Unit)
def relu(x):
    """Return the element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

# Sigmoid Function
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    The value is computed from exp(-|x|), which never exceeds 1, so large
    negative inputs no longer overflow inside ``np.exp``.

    Parameters:
        x: scalar or array-like input.

    Returns:
        Values in [0, 1] with the same shape as ``x`` (a NumPy scalar for
        scalar input).
    """
    x = np.asarray(x)
    # exp(-|x|) lies in [0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
    # Both branches are the same function and share the denominator 1 + decay.
    return np.where(x >= 0, 1, decay) / (1 + decay)

# Softmax Function
def softmax(x):
    """Return the softmax of ``x`` taken along axis 0.

    For a 1-D input this is the usual softmax of the vector. For a 2-D input
    every column is normalised independently, so each column sums to 1.

    Parameters:
        x: array-like with at least one dimension.

    Returns:
        Array of the same shape as ``x`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift by the maximum along the normalisation axis. A single global
    # maximum would let a column that lies far below it underflow to all
    # zeros, which then produces 0/0 = NaN.
    shifted = x - np.max(x, axis=0, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / exp_x.sum(axis=0, keepdims=True)

# Sample input shared by every activation function
x = np.array([-1, 0, 1, 2])

# Show the input, then the output of each activation in turn
print("Input:", x)
for label, activation in (
    ("Step Function Output:", step_function),
    ("ReLU Output:", relu),
    ("Sigmoid Output:", sigmoid),
    ("Softmax Output:", softmax),
):
    print(label, activation(x))


Input: [-1  0  1  2]
Step Function Output: [0 1 1 1]
ReLU Output: [0 0 1 2]
Sigmoid Output: [0.26894142 0.5        0.73105858 0.88079708]
Softmax Output: [0.0320586  0.08714432 0.23688282 0.64391426]


3.7.3. Cost function

We define two cost functions:

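Mean Squared Error (MSE): Used for regression tasks, it measures the average squared difference between the actual values (y_true) and the predicted values (y_pred). The implementation below includes the conventional factor of 1/2, i.e. it returns $\frac{1}{2N}\sum_{i=1}^{N} (\hat{y}_i - y_i)^2$, so the reported value is half of the plain mean squared error. The result quantifies how well the model's predictions match the true values: the smaller it is, the better the fit.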

Cross Entropy: Commonly used for binary classification, this function measures the difference between the true labels (y_true) and predicted probabilities (y_pred). It penalizes the model more heavily for incorrect predictions that are confident (i.e., closer to 0 or 1). This is especially useful when dealing with classification tasks that output probabilities.

In [None]:
import numpy as np

# Mean Squared Error (MSE) for regression problems
def mean_squared_error(y_true, y_pred):
    """Return half the mean squared error between targets and predictions.

    Computes ``0.5 * sum((y_pred - y_true) ** 2) / N`` where ``N`` is the
    length of ``y_true`` along its first axis.

    Parameters:
        y_true: array-like of target values (lists and scalars are accepted).
        y_pred: array-like of predictions, broadcastable against ``y_true``.

    Returns:
        The cost as a NumPy float.
    """
    # Convert up front so plain Python lists and scalars work as well as
    # arrays. atleast_1d guarantees that shape[0] exists for scalar targets.
    y_true = np.atleast_1d(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = y_true.shape[0]  # Number of data points
    return 0.5 * np.sum((y_pred - y_true) ** 2) / n_samples

# Cross Entropy for binary classification
def cross_entropy(y_true, y_pred, eps=1e-10):
    """Return the mean binary cross-entropy between labels and probabilities.

    Computes ``-sum(y * log(p) + (1 - y) * log(1 - p)) / N`` where ``N`` is
    the length of ``y_true`` along its first axis.

    Parameters:
        y_true: array-like of labels (lists and scalars are accepted).
        y_pred: array-like of predicted probabilities.
        eps: predictions are clipped to ``[eps, 1 - eps]`` so neither
            logarithm is ever evaluated at 0. Defaults to 1e-10.

    Returns:
        The cost as a NumPy float.
    """
    # Convert up front so plain Python lists work; `1 - y_true` would
    # otherwise fail for a list.
    y_true = np.atleast_1d(y_true)
    # Clipping keeps both log arguments strictly positive.
    y_pred = np.clip(y_pred, eps, 1 - eps)
    log_likelihood = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
    return -np.sum(log_likelihood) / y_true.shape[0]

# Regression sample, scored with the mean squared error cost
y_true_regression = np.array([3.0, 0.5, 2.1, 7.8])
y_pred_regression = np.array([2.5, 0.7, 2.0, 7.5])
mse_cost = mean_squared_error(y_true_regression, y_pred_regression)

# Binary-classification sample (labels and predicted probabilities),
# scored with the cross-entropy cost
y_true_classification = np.array([1, 0, 1, 0])
y_pred_classification = np.array([0.9, 0.1, 0.8, 0.3])
cross_entropy_cost = cross_entropy(y_true_classification, y_pred_classification)

# Report both costs
print("Mean Squared Error Cost:", mse_cost)
print("Cross Entropy Cost:", cross_entropy_cost)


Mean Squared Error Cost: 0.04874999999999999
Cross Entropy Cost: 0.19763488164214868


3.7.4, 3.7.5 Backpropagation

We implement a simple backpropagation algorithm for a two-layer neural network. The forward pass computes the activations of each layer, and the backward pass calculates the gradients (deltas) for each layer. The weights and biases are then updated using gradient descent based on these gradients. Sigmoid and ReLU activation functions are defined together with their derivatives, which are needed during backpropagation; the example run below trains with the sigmoid. We iterate for multiple epochs, adjusting the weights and biases to minimize the cost (Mean Squared Error in this case).

In [None]:
import numpy as np

# Activation functions
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    The value is computed from exp(-|x|), which never exceeds 1, so large
    negative pre-activations no longer overflow inside ``np.exp``.

    Parameters:
        x: scalar or array-like input.

    Returns:
        Values in [0, 1] with the same shape as ``x`` (a NumPy scalar for
        scalar input).
    """
    x = np.asarray(x)
    # exp(-|x|) lies in [0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
    # Both branches are the same function and share the denominator 1 + decay.
    return np.where(x >= 0, 1, decay) / (1 + decay)

def sigmoid_prime(x):
    """Return the derivative of the sigmoid at ``x``: s * (1 - s), s = sigmoid(x)."""
    s = sigmoid(x)
    return s * (1 - s)

def relu(x):
    """Return the element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_prime(x):
    """Return 1 where ``x`` is strictly positive and 0 elsewhere (including x == 0)."""
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

# Cost function (half mean squared error, kept simple)
def cost_function(y_true, y_pred):
    """Return the mean over all elements of 0.5 * (y_true - y_pred) ** 2.

    The two arguments are combined with ordinary NumPy broadcasting, so they
    should have matching shapes.
    """
    residual = y_true - y_pred
    half_squared = 0.5 * residual ** 2
    return np.mean(half_squared)

# One step of backpropagation followed by a gradient-descent update
def backpropagate(X, y, weights, biases, activation_function, activation_prime, learning_rate=0.01):
    """Run one forward/backward pass and update the parameters.

    Each layer computes ``np.dot(a, W) + b``, so examples lie along the rows
    (in the example below X is (5, 3) and the first weight matrix is (3, 4)).

    Parameters:
        X: input batch.
        y: targets, compared against the last layer's activations.
        weights: list of weight matrices, one per layer.
        biases: list of bias arrays, one per layer.
        activation_function: callable applied to every layer's pre-activation.
        activation_prime: callable giving the derivative of the activation,
            evaluated at the pre-activation.
        learning_rate: gradient-descent step size.

    Returns:
        ``(weights, biases, cost)``. ``cost`` is measured before the update.
        The list entries are updated with ``-=``, so when they are NumPy
        arrays the caller's arrays are modified in place and the returned
        lists are the same objects that were passed in.

    NOTE(review): the gradients are summed over the examples, whereas
    ``cost_function`` averages, so the effective step size grows with the
    number of examples.
    """
    n_layers = len(weights)

    # Forward pass. Index 0 of both lists holds the input, so layer k's
    # pre-activation and activation sit at index k + 1.
    pre_activations = [X]
    activations = [X]
    for idx, W in enumerate(weights):
        pre = np.dot(activations[-1], W) + biases[idx]
        layer_out = activation_function(pre)
        pre_activations.append(pre)
        activations.append(layer_out)

    # Cost of the current parameters (monitoring only)
    cost = cost_function(y, activations[-1])

    # Backward pass: output-layer delta first ...
    deltas = [None] * n_layers
    deltas[-1] = (activations[-1] - y) * activation_prime(pre_activations[-1])

    # ... then push it back through the earlier layers, last to first.
    for idx in reversed(range(n_layers - 1)):
        upstream = np.dot(deltas[idx + 1], weights[idx + 1].T)
        deltas[idx] = upstream * activation_prime(pre_activations[idx + 1])

    # Gradient-descent update. Every delta was computed above with the old
    # weights, so updating here does not affect the gradients.
    for idx, (layer_in, layer_delta) in enumerate(zip(activations, deltas)):
        weights[idx] -= learning_rate * np.dot(layer_in.T, layer_delta)
        biases[idx] -= learning_rate * np.sum(layer_delta, axis=0, keepdims=True)

    return weights, biases, cost

# Example usage
np.random.seed(42)  # Fixed seed so the run is reproducible

# Random toy problem: 5 examples with 3 features each and one target per
# example. The network has two layers (3 -> 4 -> 1) with one bias row per
# layer. Draw order is X, y, weights, biases.
X = np.random.randn(5, 3)
y = np.random.randn(5, 1)
weights = [np.random.randn(n_in, n_out) for n_in, n_out in ((3, 4), (4, 1))]
biases = [np.random.randn(1, n_out) for n_out in (4, 1)]

# Training configuration
learning_rate = 0.01
epochs = 1000

# Each epoch is one full-batch backpropagation step
for epoch in range(epochs):
    weights, biases, cost = backpropagate(
        X, y, weights, biases, sigmoid, sigmoid_prime, learning_rate
    )

    if epoch % 100:
        continue  # Only report every 100th epoch
    print(f"Epoch {epoch}, Cost: {cost}")

# Parameters after the last epoch
print("Trained weights:", weights)
print("Trained biases:", biases)


Epoch 0, Cost: 1.1420671896035153
Epoch 100, Cost: 0.5599468233902358
Epoch 200, Cost: 0.4812692778925115
Epoch 300, Cost: 0.4596316116767897
Epoch 400, Cost: 0.4499189320933734
Epoch 500, Cost: 0.44446993634287174
Epoch 600, Cost: 0.4410015084532769
Epoch 700, Cost: 0.43860706760278256
Epoch 800, Cost: 0.4368578835854288
Epoch 900, Cost: 0.4355257442980521
Trained weights: [array([[ 1.58675611,  0.03730541, -0.01691816, -1.17880037],
       [-0.68099146, -0.3751891 , -1.07256865,  0.21584849],
       [-0.6681502 , -0.69096226, -0.53547336,  1.78042101]]), array([[-2.03361315],
       [-1.53020392],
       [-0.26727963],
       [-1.62900914]])]
Trained biases: [array([[ 0.4158142 , -1.51982163, -1.43777548,  0.52909988]]), array([[-1.54702053]])]
