<a href="https://colab.research.google.com/github/poojabisht10/Deep-Learning/blob/main/UCS761_Lab6_Learning_to_Bend_a_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [1]:
# We only use numpy for numerical operations.
import numpy as np

Input generation

In [2]:
# Input feature X: 50 evenly spaced values covering [0, 10],
# stored as a column so each row is one sample with a single feature.

X = np.linspace(start=0, stop=10, num=50)[:, np.newaxis]

In [3]:
# Explanation:
# X represents an input variable that grows from 0 to 10.
# This range is chosen so that we can see:
# - fast growth in the beginning
# - slower growth later
# A straight line cannot fit both behaviors well.

Target generation

In [4]:
# Build the target y as a non-linear (logarithmic) function of X,
# plus a little Gaussian noise (mean 0, std 0.2) so the data looks realistic.

np.random.seed(0)  # fixed seed so the random draws are reproducible
# Size the noise from X itself instead of repeating the sample count,
# so changing the number of points in X cannot cause a shape mismatch.
noise = np.random.normal(0, 0.2, size=X.shape)
y = np.log(X + 1) + noise

In [5]:
# Explanation:
# The logarithmic function grows quickly at first and then slows down.
# A linear model has only one slope, so it cannot fit this shape well.

Decide the Model Shape

In [6]:
# We choose 3 hidden units.
# Why more than 1?
#   A single hidden unit cannot bend the curve enough.
# Why not too many?
#   Too many hidden units increase complexity and instability.
# Three is a balanced choice for this simple dataset.

Initialize Model Parameters

In [7]:
# Initialize weights and biases

# Number of hidden units. W1's column count and W2's row count must both
# equal this, so it is named once instead of repeating the literal per shape.
HIDDEN_UNITS = 3

# W1 maps input → hidden layer: uniform random weights in [-1, 1), zero biases
W1 = np.random.uniform(-1, 1, size=(1, HIDDEN_UNITS))
b1 = np.zeros((1, HIDDEN_UNITS))

# W2 maps hidden → output layer: uniform random weights in [-1, 1), zero bias
W2 = np.random.uniform(-1, 1, size=(HIDDEN_UNITS, 1))
b2 = np.zeros((1, 1))

In [8]:
# Explanation:
# Weights control the shape and bending of the curve.
# Biases shift the curve: b1 moves where each hidden unit switches on, b2 moves the output up or down.
# Random initialization avoids symmetry.

Activation Function

In [9]:
def activation(z):
    """Rectify z element-wise: positive values pass through, the rest become 0.

    Args:
        z: Pre-activation values (array or scalar).

    Returns:
        The element-wise maximum of 0 and z.
    """
    floor = 0
    rectified = np.maximum(floor, z)
    return rectified

Activation Slope

In [10]:
def activation_slope(z):
    """Return the element-wise slope of `activation` at z.

    The slope is 1.0 where z is strictly positive and 0.0 everywhere else
    (including exactly at 0).

    Args:
        z: Pre-activation values. May be a numpy array, a Python scalar,
            or a nested sequence of numbers.

    Returns:
        Float values of 0.0 / 1.0 with the same shape as z.
    """
    # np.greater accepts plain Python scalars and lists as well as arrays,
    # whereas `(z > 0).astype(float)` fails on them because a Python bool
    # has no `.astype` and a list cannot be compared with an int.
    return np.greater(z, 0).astype(float)

In [11]:
# Why slope matters:
# Without slope, the model does not know how to change parameters.
# Slopes allow error information to flow backward.

Forward pass

In [14]:
# Hidden pre-activation: weighted input plus hidden bias
z1 = np.matmul(X, W1) + b1

# Hidden output: pass the pre-activation through the activation function
h = activation(z1)

# Prediction: weighted hidden output plus output bias
y_hat = np.matmul(h, W2) + b2

In [13]:
# Mapping to math:
# z1 = XW1 + b1
# h = activation(z1)
# y_hat = hW2 + b2

Error and loss

In [15]:
# Residual: how far each prediction is from its target
error = y_hat - y

# Loss: average of the squared residuals (Mean Squared Error)
loss = (error ** 2).mean()

print("Initial loss:", loss)

Initial loss: 4.917843952937002


In [16]:
# Explanation:
# Squaring makes large errors more expensive.
# This forces the model to correct big mistakes strongly.

Why do we need slopes?

In [17]:
# To reduce loss, we must change parameters.
# To know how to change them, we need slopes (gradients).
# Gradients tell us whether a small change increases or decreases loss.

Backward flow

In [18]:
# Slope of the mean-squared-error loss with respect to each prediction
dL_dy = 2 * error / X.shape[0]

# Send the error back through the output weights into the hidden layer
dL_dh = np.matmul(dL_dy, W2.T)

# Gate it with the activation slope: only units with z1 > 0 pass error on
dL_dz1 = dL_dh * activation_slope(z1)

# Output-layer gradients (weights, then bias summed over samples)
dL_dW2 = np.matmul(h.T, dL_dy)
dL_db2 = dL_dy.sum(axis=0, keepdims=True)

# First-layer gradients (weights, then bias summed over samples)
dL_dW1 = np.matmul(X.T, dL_dz1)
dL_db1 = dL_dz1.sum(axis=0, keepdims=True)

In [19]:
# Explanation:
# Hidden values affect output, so gradients depend on them.
# Activation controls which neurons pass error backward.

Parameter update rule

In [20]:
learning_rate = 0.01

# One gradient-descent step: each parameter is nudged in place against its
# own gradient, scaled by the learning rate. The four updates are independent.
W2 -= learning_rate * dL_dW2
b2 -= learning_rate * dL_db2
W1 -= learning_rate * dL_dW1
b1 -= learning_rate * dL_db1

In [21]:
# This is similar to the perceptron update rule.
# We move parameters in the direction opposite to the gradient of the loss.

Training loop

In [22]:
epochs = 1000

for epoch in range(epochs):
    # Forward: input -> hidden pre-activation -> hidden output -> prediction
    z1 = np.matmul(X, W1) + b1
    h = activation(z1)
    y_hat = np.matmul(h, W2) + b2

    # Mean squared error of the current predictions
    error = y_hat - y
    loss = (error ** 2).mean()

    # Backward: propagate the loss slope from the output to the first layer
    dL_dy = 2 * error / X.shape[0]
    dL_dh = np.matmul(dL_dy, W2.T)
    dL_dz1 = dL_dh * activation_slope(z1)
    dL_dW2 = np.matmul(h.T, dL_dy)
    dL_db2 = dL_dy.sum(axis=0, keepdims=True)
    dL_dW1 = np.matmul(X.T, dL_dz1)
    dL_db1 = dL_dz1.sum(axis=0, keepdims=True)

    # Gradient-descent step, applied in place once all gradients are known
    W2 -= learning_rate * dL_dW2
    b2 -= learning_rate * dL_db2
    W1 -= learning_rate * dL_dW1
    b1 -= learning_rate * dL_db1

    # Report the loss measured before this epoch's update, every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

Epoch 0, Loss: 3.8784
Epoch 100, Loss: 0.0736
Epoch 200, Loss: 0.0696
Epoch 300, Loss: 0.0683
Epoch 400, Loss: 0.0679
Epoch 500, Loss: 0.0677
Epoch 600, Loss: 0.0677
Epoch 700, Loss: 0.0677
Epoch 800, Loss: 0.0676
Epoch 900, Loss: 0.0676


In [23]:
# Choice justification:
# Epochs = 1000 gives enough time for learning to stabilize.
# Learning rate = 0.01 is small enough to avoid instability.

In [24]:
# This workshop shows:
# - Why linear models fail on curved data
# - How hidden layers allow bending
# - Why activation functions are needed
# - How error flows backward step by step
# Backpropagation is not magic, just organized slopes.