In [5]:
from pathlib import Path

from deeplearning.homework_04.grader.datasets import road_dataset

path = Path().cwd().parent / "homework_04" / "drive_data" / "val"
print(path)


/Users/timreddick/code/AI394D-deep-learning/deeplearning/homework_04/drive_data/val


In [6]:
data = road_dataset.load_data(path)

Loaded 2000 samples from 4 episodes


In [9]:
import torch

# Create a 1D tensor with 12 elements
original_tensor = torch.arange(12)
print("Original Tensor:")
print(original_tensor)

# Reshape the tensor into a 2D tensor of shape (3, 4)
reshaped_tensor = original_tensor.reshape(3, 4)
print("\nReshaped Tensor (3x4):")
print(reshaped_tensor)

# Reshape the tensor into a 3D tensor of shape (2, 2, 3)
reshaped_tensor_3d = original_tensor.reshape(2, 2, 3)
print("\nReshaped Tensor (2x2x3):")
print(reshaped_tensor_3d)

# Demonstrating that the original tensor remains unchanged
print("\nOriginal Tensor After Reshaping:")
print(original_tensor)

# What does -1 mean in reshape?
# The -1 in reshape allows PyTorch to automatically calculate the size of that dimension based on the other dimensions and the total number of elements.
# For example, if we want to reshape the original tensor into a shape of (3, -1), PyTorch will automatically determine the second dimension size.
reshaped_tensor_auto = original_tensor.reshape(3, -1)
print("\nReshaped Tensor with -1 (3x-1):")
print(reshaped_tensor_auto)


Original Tensor:
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

Reshaped Tensor (3x4):
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

Reshaped Tensor (2x2x3):
tensor([[[ 0,  1,  2],
         [ 3,  4,  5]],

        [[ 6,  7,  8],
         [ 9, 10, 11]]])

Original Tensor After Reshaping:
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

Reshaped Tensor with -1 (3x-1):
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])


In [26]:
# https://mattmazur.com/2015/03/17/a-step-by-step-backpropagation-example/


import math

# Hidden layer 1 calculation
net_h1 = 0.15 * 0.05 + 0.20 * 0.10 + 0.35
out_h1 = 1 / (1 + math.exp(-net_h1))
print(f"Output of Hidden Layer 1 (h1): {out_h1:.8f}")

# Hidden layer 2 calculation
net_h2 = 0.25 * 0.05 + 0.30 * 0.10 + 0.35
out_h2 = 1 / (1 + math.exp(-net_h2))
print(f"Output of Hidden Layer 2 (h2): {out_h2:.8f}")

# Output layer 1 calculation
net_o1 = 0.40 * out_h1 + 0.45 * out_h2 + 0.60
out_o1 = 1 / (1 + math.exp(-net_o1))
print(f"Output of Output Layer 1 (o1): {out_o1:.8f}")

# Output layer 2 calculation
net_o2 = 0.50 * out_h1 + 0.55 * out_h2 + 0.60
out_o2 = 1 / (1 + math.exp(-net_o2))
print(f"Output of Output Layer 2 (o2): {out_o2:.8f}")

# Calculate loss
target_o1 = 0.01
target_o2 = 0.99
loss_1 = 1/2 * ((target_o1 - out_o1) ** 2)
loss_2 = 1/2 * ((target_o2 - out_o2) ** 2)
loss = loss_1 + loss_2
print(f"\nLoss for Output Layer 1: {loss_1:.8f}")
print(f"Loss for Output Layer 2: {loss_2:.8f}")
print(f"Total Loss: {loss:.8f}")

Output of Hidden Layer 1 (h1): 0.59326999
Output of Hidden Layer 2 (h2): 0.59688438
Output of Output Layer 1 (o1): 0.75136507
Output of Output Layer 2 (o2): 0.77292847

Loss for Output Layer 1: 0.27481108
Loss for Output Layer 2: 0.02356003
Total Loss: 0.29837111


## PyTorch Automatic Gradients Example

This example demonstrates how PyTorch automatically calculates gradients without needing to explicitly define the gradient form.

In [33]:
import torch

# Create tensors with requires_grad=True to track computations
w1 = torch.tensor(0.15, requires_grad=True)
w2 = torch.tensor(0.20, requires_grad=True)
w3 = torch.tensor(0.25, requires_grad=True)
w4 = torch.tensor(0.30, requires_grad=True)
w5 = torch.tensor(0.40, requires_grad=True)  # The weight we were analyzing
w6 = torch.tensor(0.45, requires_grad=True)
w7 = torch.tensor(0.50, requires_grad=True)
w8 = torch.tensor(0.55, requires_grad=True)

b1 = torch.tensor(0.35, requires_grad=True)
b2 = torch.tensor(0.60, requires_grad=True)

# Input values
i1 = torch.tensor(0.05)
i2 = torch.tensor(0.10)

# Forward pass - same as our manual calculations but using PyTorch
# Hidden layer
net_h1 = w1 * i1 + w3 * i2 + b1
out_h1 = 1 / (1 + torch.exp(-net_h1))

net_h2 = w2 * i1 + w4 * i2 + b1
out_h2 = 1 / (1 + torch.exp(-net_h2))

# Output layer
net_o1 = w5 * out_h1 + w6 * out_h2 + b2
out_o1 = 1 / (1 + torch.exp(-net_o1))

net_o2 = w7 * out_h1 + w8 * out_h2 + b2
out_o2 = 1 / (1 + torch.exp(-net_o2))

# Target values
t1 = torch.tensor(0.01)
t2 = torch.tensor(0.99)

# Calculate the error (loss)
error_o1 = 0.5 * (t1 - out_o1) ** 2
error_o2 = 0.5 * (t2 - out_o2) ** 2
error_total = error_o1 + error_o2

# Backpropagation - PyTorch calculates all gradients automatically!
error_total.backward()

# Print the gradient of the total error with respect to w5
print(f"∂E_total/∂w5 = {w5.grad:.9f}")
print(f"\nCompare to manually calculated value in the article: 0.082167041")

# You can also check gradients for other parameters
print(f"\nGradients for other parameters:")
print(f"∂E_total/∂w1 = {w1.grad:.9f}")
print(f"∂E_total/∂w2 = {w2.grad:.9f}")
print(f"∂E_total/∂b1 = {b1.grad:.9f}")
print(f"∂E_total/∂b2 = {b2.grad:.9f}")

∂E_total/∂w5 = 0.082329690

Compare to manually calculated value in the article: 0.082167041

Gradients for other parameters:
∂E_total/∂w1 = 0.000438205
∂E_total/∂w2 = 0.000498006
∂E_total/∂b1 = 0.018724229
∂E_total/∂b2 = 0.100407004


## Simple Automatic Differentiation Example

This demonstrates how PyTorch calculates gradients behind the scenes without knowing the form of the gradient beforehand.

In [None]:
import torch

# Create a tensor with requires_grad=True
x = torch.tensor(2.0, requires_grad=True)
print(f"Initial x value: {x.item()}")

# Step 1: Perform forward computation
# Let's say we compute y = x^2
y = x * x
print(f"y = x^2 = {y.item()}")

# Step 2: Call backward() to compute gradients
y.backward()

# Step 3: Access the computed gradient
print(f"Gradient of y with respect to x (dy/dx): {x.grad.item()}")
print(f"Expected value: 2x = 2*2 = 4")
print("\nPyTorch calculated the correct gradient dy/dx = 4 without us specifying the formula dy/dx = 2x")

# Let's see a more complex example
print("\n----- More Complex Example -----")
a = torch.tensor(3.0, requires_grad=True)
b = torch.tensor(2.0, requires_grad=True)

# Complex computation: z = (a^2 + b)^3
c = a * a + b
z = c * c * c
print(f"z = (a^2 + b)^3 = {z.item()}")

# Compute gradients
z.backward()

# Access gradients
print(f"dz/da = {a.grad.item()}")
print(f"dz/db = {b.grad.item()}")
print("\nPyTorch correctly calculated these gradients through chain rule without us explicitly providing the formulas:")
print("dz/da = 3(a^2 + b)^2 * 2a")
print("dz/db = 3(a^2 + b)^2")

## Gradient Reuse in Backpropagation

This demonstrates why we can reuse gradient calculations in backpropagation, as mentioned in the article. When computing the gradient for weights between input and hidden layers, we reuse components calculated for the output layer weights.

In [None]:
# Manual calculations to demonstrate gradient reuse in backpropagation
import math

# Forward pass calculations (same as before)
# Input values
i1, i2 = 0.05, 0.10

# Weights
w1, w2 = 0.15, 0.20  # w1, w2 connect i1, i2 to h1
w3, w4 = 0.25, 0.30  # w3, w4 connect i1, i2 to h2
w5, w6 = 0.40, 0.45  # w5, w6 connect h1, h2 to o1
w7, w8 = 0.50, 0.55  # w7, w8 connect h1, h2 to o2

# Biases
b1 = 0.35  # hidden layer bias
b2 = 0.60  # output layer bias

# Target values
t1, t2 = 0.01, 0.99

# Hidden layer outputs
net_h1 = w1 * i1 + w3 * i2 + b1
out_h1 = 1 / (1 + math.exp(-net_h1))

net_h2 = w2 * i1 + w4 * i2 + b1
out_h2 = 1 / (1 + math.exp(-net_h2))

# Output layer outputs
net_o1 = w5 * out_h1 + w6 * out_h2 + b2
out_o1 = 1 / (1 + math.exp(-net_o1))

net_o2 = w7 * out_h1 + w8 * out_h2 + b2
out_o2 = 1 / (1 + math.exp(-net_o2))

# Calculate error
E_o1 = 0.5 * (t1 - out_o1) ** 2
E_o2 = 0.5 * (t2 - out_o2) ** 2
E_total = E_o1 + E_o2

print(f"out_o1: {out_o1:.8f}, out_o2: {out_o2:.8f}")
print(f"E_total: {E_total:.8f}\n")

# Backpropagation
# First, calculate gradients for output layer weights

# For w5 (weight connecting h1 to o1)
# Step 1: Calculate ∂E_o1/∂out_o1
dE_o1_dout_o1 = -(t1 - out_o1)
print(f"∂E_o1/∂out_o1 = {dE_o1_dout_o1:.8f}")

# Step 2: Calculate ∂out_o1/∂net_o1
dout_o1_dnet_o1 = out_o1 * (1 - out_o1)
print(f"∂out_o1/∂net_o1 = {dout_o1_dnet_o1:.8f}")

# Step 3: Calculate ∂net_o1/∂w5
dnet_o1_dw5 = out_h1
print(f"∂net_o1/∂w5 = {dnet_o1_dw5:.8f}")

# Step 4: Calculate ∂E_o1/∂w5 (chain rule)
dE_o1_dw5 = dE_o1_dout_o1 * dout_o1_dnet_o1 * dnet_o1_dw5
print(f"∂E_o1/∂w5 = {dE_o1_dw5:.8f}\n")

# IMPORTANT PART - Reusing calculations for w1
# To calculate ∂E_o1/∂w1, we need first ∂E_o1/∂out_h1

# Step 1: We already calculated ∂E_o1/∂out_o1 and ∂out_o1/∂net_o1
dE_o1_dnet_o1 = dE_o1_dout_o1 * dout_o1_dnet_o1
print("This is where we REUSE the calculation!")
print(f"∂E_o1/∂net_o1 = ∂E_o1/∂out_o1 * ∂out_o1/∂net_o1 = {dE_o1_dnet_o1:.8f}")

# Step 2: Calculate ∂net_o1/∂out_h1
dnet_o1_dout_h1 = w5
print(f"∂net_o1/∂out_h1 = {dnet_o1_dout_h1:.8f}")

# Step 3: Calculate ∂E_o1/∂out_h1 (chain rule)
dE_o1_dout_h1 = dE_o1_dnet_o1 * dnet_o1_dout_h1
print(f"∂E_o1/∂out_h1 = {dE_o1_dout_h1:.8f}\n")

print("The calculated value matches what's in the article (0.138498562)")
print("This demonstrates how gradient calculations are reused during backpropagation.")

## Visual Representation of Backpropagation Flow

Backpropagation flows from output to input, with gradient calculations being reused:

```
Input Layer    Hidden Layer(s)    Output Layer
   [i1]             [h1]             [o1]
     |               /|\              / \
     |              / | \            /   \
    w1,w2         w5,w7 |          /     \
     |            /   \ |         /       \
     |           /     \|        /         \
   [i2]------->[h2]     -----> [o2]       Error
                                          |
                                          |
           Backpropagation Flow          \|/
           <---------------------------- Start here
```

Backpropagation process:

1. Calculate output layer errors (o1, o2)
2. Update weights connecting hidden to output (w5, w6, w7, w8)
   - Calculate ∂E/∂w5, ∂E/∂w6, ∂E/∂w7, ∂E/∂w8
3. **Reuse components** to calculate hidden layer gradients
   - Reuse ∂E/∂net_o1 when calculating ∂E/∂out_h1
4. Update weights connecting input to hidden (w1, w2, w3, w4)

This reuse of calculations makes backpropagation efficient, especially for deep networks.

In [None]:
# Simplified backpropagation process visualization
print("Backpropagation Process: Output → Hidden → Input")
print("-" * 50)
print("1. Forward pass (input → hidden → output)")
print("2. Calculate error at output layer")
print("3. Update output layer weights (w5, w6, w7, w8)")
print("   - Calculate: ∂E/∂w_output = ∂E/∂out_o * ∂out_o/∂net_o * ∂net_o/∂w_output")
print("4. REUSE calculation ∂E/∂net_o for hidden layer")
print("   - Hidden gradients: ∂E/∂out_h = ∂E/∂net_o * ∂net_o/∂out_h")
print("5. Update hidden layer weights (w1, w2, w3, w4)")
print("   - ∂E/∂w_hidden = ∂E/∂out_h * ∂out_h/∂net_h * ∂net_h/∂w_hidden")