In [2]:
#The calculus of a simple neural network with one hidden layer
import math
import random

# defining sigmoid and its derivative
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)) for a scalar ``z``.

    Evaluated in a numerically stable way: the exponential is always taken
    of a non-positive number, so very negative inputs return a value close
    to 0.0 instead of raising ``OverflowError`` from ``math.exp``.

    Args:
        z: real-valued pre-activation.

    Returns:
        A float in the closed interval [0.0, 1.0].
    """
    if z >= 0:
        # -z <= 0, so exp(-z) lies in (0, 1] and cannot overflow.
        return 1.0 / (1.0 + math.exp(-z))
    # For negative z use the algebraically equal form e^z / (1 + e^z);
    # exp(z) lies in [0, 1) here, so again no overflow is possible.
    exp_z = math.exp(z)
    return exp_z / (1.0 + exp_z)

def dsigmoid(y):
    """Slope of the sigmoid, expressed through its output.

    The argument is the already-activated value ``y = sigmoid(z)``, not the
    pre-activation ``z``; the result is d(sigmoid)/dz = y * (1 - y).
    """
    complement = 1.0 - y
    return y * complement

# ----- Toy training set: 2-D point -> one-hot class label -----
# Labelling rule the network should pick up:
#   x1 >= x2  -> class A, encoded as (1, 0)
#   x1 <  x2  -> class B, encoded as (0, 1)
CLASS_A = (1.0, 0.0)
CLASS_B = (0.0, 1.0)

# Each entry is ((x1, x2), (t1, t2)). Kept as a list because the training
# loop shuffles it in place.
data = [
    ((0.0, 0.0), CLASS_A),
    ((0.0, 1.0), CLASS_B),
    ((1.0, 0.0), CLASS_A),
    ((1.0, 1.0), CLASS_A),
    ((0.2, 0.8), CLASS_B),
    ((0.8, 0.2), CLASS_A),
]

# Initialize parameters
def rand():
    """Draw one initial parameter value uniformly from [-0.5, 0.5].

    Uses the module-level ``random`` generator, so results depend on its
    current state.
    """
    low, high = -0.5, 0.5
    return random.uniform(low, high)

# Twelve trainable scalars, drawn one after another in the order
# w1, w2, b1, w3, w4, b2, w5, w6, b3, w7, w8, b4:
#   hidden unit 1 -> (w1, w2, b1)    hidden unit 2 -> (w3, w4, b2)
#   output unit 1 -> (w5, w6, b3)    output unit 2 -> (w7, w8, b4)
(w1, w2, b1,
 w3, w4, b2,
 w5, w6, b3,
 w7, w8, b4) = (rand() for _ in range(12))

lr = 0.5        # step size applied to every gradient
epochs = 5000   # number of full passes over `data`

# Per-sample (stochastic) gradient descent on a 2-2-2 sigmoid network.
# Parameters are updated after every individual training example.
for epoch in range(epochs):
    random.shuffle(data)  # visit the samples in a fresh order each epoch

    total_loss = 0.0
    for inputs, targets in data:
        x1, x2 = inputs
        t1, t2 = targets

        # ---- forward pass ----
        h1 = sigmoid(w1*x1 + w2*x2 + b1)   # hidden unit 1
        h2 = sigmoid(w3*x1 + w4*x2 + b2)   # hidden unit 2
        y1 = sigmoid(w5*h1 + w6*h2 + b3)   # output unit 1
        y2 = sigmoid(w7*h1 + w8*h2 + b4)   # output unit 2

        # Squared-error loss for this sample: 1/2 * sum_k (t_k - y_k)^2
        total_loss += 0.5*((t1 - y1)**2 + (t2 - y2)**2)

        # ---- backward pass ----
        # Output error terms: dL/dz = dL/dy * dy/dz, with dL/dy = -(t - y).
        delta_y1 = -(t1 - y1) * dsigmoid(y1)
        delta_y2 = -(t2 - y2) * dsigmoid(y2)

        # Hidden error terms: each hidden unit feeds both outputs, so its
        # dL/dh is the sum over both paths. These must be computed with the
        # output weights as they were during the forward pass, i.e. before
        # the updates further down.
        delta_h1 = (delta_y1 * w5 + delta_y2 * w7) * dsigmoid(h1)
        delta_h2 = (delta_y1 * w6 + delta_y2 * w8) * dsigmoid(h2)

        # ---- gradient descent step ----
        # A weight's gradient is (error term of its unit) * (input it
        # multiplies); a bias's gradient is the error term itself.
        w5 -= lr * (delta_y1 * h1)
        w6 -= lr * (delta_y1 * h2)
        b3 -= lr * delta_y1

        w7 -= lr * (delta_y2 * h1)
        w8 -= lr * (delta_y2 * h2)
        b4 -= lr * delta_y2

        w1 -= lr * (delta_h1 * x1)
        w2 -= lr * (delta_h1 * x2)
        b1 -= lr * delta_h1

        w3 -= lr * (delta_h2 * x1)
        w4 -= lr * (delta_h2 * x2)
        b2 -= lr * delta_h2

    # Report the mean per-sample loss every 500th epoch.
    if not (epoch + 1) % 500:
        print(f"Epoch {epoch+1:4d} | loss {total_loss/len(data):.6f}")

# test after training
def predict(x1, x2):
    """Forward pass through the 2-2-2 network using the global parameters.

    Args:
        x1: first input feature.
        x2: second input feature.

    Returns:
        Tuple ``(y1, y2)`` holding the two sigmoid output activations.
    """
    hidden_1 = sigmoid(w1*x1 + w2*x2 + b1)
    hidden_2 = sigmoid(w3*x1 + w4*x2 + b2)
    out_1 = sigmoid(w5*hidden_1 + w6*hidden_2 + b3)
    out_2 = sigmoid(w7*hidden_1 + w8*hidden_2 + b4)
    return out_1, out_2

# Evaluate the trained network on the same six points used for training.
tests = [
    (0.0, 0.0),
    (0.0, 1.0),
    (1.0, 0.0),
    (1.0, 1.0),
    (0.2, 0.8),
    (0.8, 0.2),
]
for point in tests:
    x1, x2 = point
    y1, y2 = predict(x1, x2)
    # Class 0 only when the first output is strictly larger; ties go to class 1.
    if y1 > y2:
        pred = 0
    else:
        pred = 1
    print(f"x=({x1:.1f},{x2:.1f}) -> y=({y1:.3f},{y2:.3f})  class={pred}")

Epoch  500 | loss 0.002250
Epoch 1000 | loss 0.000817
Epoch 1500 | loss 0.000479
Epoch 2000 | loss 0.000334
Epoch 2500 | loss 0.000255
Epoch 3000 | loss 0.000205
Epoch 3500 | loss 0.000171
Epoch 4000 | loss 0.000146
Epoch 4500 | loss 0.000127
Epoch 5000 | loss 0.000113
x=(0.0,0.0) -> y=(0.987,0.013)  class=0
x=(0.0,1.0) -> y=(0.003,0.997)  class=1
x=(1.0,0.0) -> y=(0.998,0.002)  class=0
x=(1.0,1.0) -> y=(0.987,0.013)  class=0
x=(0.2,0.8) -> y=(0.018,0.982)  class=1
x=(0.8,0.2) -> y=(0.998,0.002)  class=0


In [3]:
# Matrix-based learning approach (vectorized with NumPy)
import numpy as np

# Sigmoid activation function and its derivative
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + e^(-x)).

    Accepts anything ``np.exp`` accepts after negation (scalars or arrays)
    and returns a result of the same shape.
    """
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def sigmoid_derivative(x):
    """Return ``x * (1 - x)`` element-wise.

    This equals the sigmoid's derivative when ``x`` is already a sigmoid
    output (an activation), which is how the training loop in this cell
    calls it; it is not the derivative evaluated at a raw pre-activation.
    """
    one_minus_x = 1 - x
    return x * one_minus_x

# Inputs: all four combinations of two binary features, one row per sample
X = np.array([[a, b] for a in (0, 1) for b in (0, 1)])

# Targets: a column vector with one label per row of X (the XOR truth table)
y = np.array([0, 1, 1, 0]).reshape(-1, 1)

# Layer sizes
input_neurons = 2
hidden_neurons = 10
output_neurons = 1

# Parameters start as independent uniform draws from [0, 1).
# Weight matrices are (fan_in, fan_out); biases are row vectors so they
# broadcast across the batch dimension.
hidden_weights = np.random.uniform(low=0.0, high=1.0, size=(input_neurons, hidden_neurons))
hidden_bias = np.random.uniform(low=0.0, high=1.0, size=(1, hidden_neurons))

output_weights = np.random.uniform(low=0.0, high=1.0, size=(hidden_neurons, output_neurons))
output_bias = np.random.uniform(low=0.0, high=1.0, size=(1, output_neurons))

# Hyperparameters
learning_rate = 0.1
epochs = 10000

# Full-batch gradient descent: every epoch processes all rows of X at once
for epoch in range(epochs):
    # ---- forward pass ----
    hidden_layer_output = sigmoid(X.dot(hidden_weights) + hidden_bias)
    predicted_output = sigmoid(hidden_layer_output.dot(output_weights) + output_bias)

    # ---- backward pass ----
    # The error is (target - prediction), so the updates below ADD the
    # scaled terms rather than subtracting them.
    error = y - predicted_output
    output_delta = error * sigmoid_derivative(predicted_output)

    # Push the output deltas back through the output weights. This uses the
    # weights as they are before this epoch's update.
    hidden_delta = np.dot(output_delta, output_weights.T) * sigmoid_derivative(hidden_layer_output)

    # ---- in-place parameter updates ----
    # Weight terms are summed over the batch by the matrix product; bias
    # terms are summed explicitly over the batch axis.
    output_weights += learning_rate * np.dot(hidden_layer_output.T, output_delta)
    output_bias += learning_rate * output_delta.sum(axis=0, keepdims=True)
    hidden_weights += learning_rate * np.dot(X.T, hidden_delta)
    hidden_bias += learning_rate * hidden_delta.sum(axis=0, keepdims=True)

    # Log the mean absolute error on epochs 0, 1000, 2000, ...
    if not epoch % 1000:
        print(f"Error: {np.mean(np.abs(error))}")

# Predictions from the final epoch's forward pass, computed before that
# epoch's parameter update was applied.
print("Output after training:")
print(predicted_output)

Error: 0.4989209769728176
Error: 0.4928010095250592
Error: 0.4524517814135708
Error: 0.33737448122748165
Error: 0.18425789901307968
Error: 0.11117311085940629
Error: 0.0800143863681107
Error: 0.06360704899926437
Error: 0.05352719284732321
Error: 0.04667937944525244
Output after training:
[[0.02955785]
 [0.95947811]
 [0.95450775]
 [0.05123815]]
