# Forward + Backward Prop


## 0) Imports + setup

In [13]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Seed PyTorch's global RNG so any random initialisation below is repeatable.
torch.manual_seed(0)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
backend = "cpu"
if torch.cuda.is_available():
    backend = "cuda"
device = torch.device(backend)
device

device(type='cpu')

## 1) Build the tiny network (Sequential only)

$$
x \rightarrow z_1 = w_1 x + b_1 \rightarrow h_1 = \max(0, z_1) \rightarrow y_{pred} = w_2 h_1 + b_2
$$

We set weights manually so numbers are predictable.

In [14]:
# Three-stage pipeline with a single scalar feature at every stage.
stages = [
    nn.Linear(1, 1),  # index 0: z1 = w1 * x + b1
    nn.ReLU(),        # index 1: h1 = max(0, z1)
    nn.Linear(1, 1),  # index 2: y_pred = w2 * h1 + b2
]
model = nn.Sequential(*stages).to(device)

# Replace the random initial parameters with hand-picked constants.
# no_grad() keeps these in-place writes out of the autograd graph.
with torch.no_grad():
    for layer_idx, (weight_value, bias_value) in ((0, (2.0, 0.0)), (2, (3.0, 0.0))):
        model[layer_idx].weight.fill_(weight_value)  # w1 = 2, then w2 = 3
        model[layer_idx].bias.fill_(bias_value)      # b1 = 0, then b2 = 0

model

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1, out_features=1, bias=True)
)

## 2) Single training example

- `x` shape: `(1,1)`
- `y_true` shape: `(1,1)`

MSE:
$$
L = (y_{pred} - y_{true})^2
$$

In [15]:
# One sample with one feature: input and target are both 1x1 float tensors,
# created on the same device as the model.
x = torch.full((1, 1), 1.0, device=device)
y_true = torch.full((1, 1), 1.0, device=device)

# Mean-reduced squared error; with a single element this is just the squared difference.
criterion = nn.MSELoss(reduction="mean")
(x, y_true)

(tensor([[1.]]), tensor([[1.]]))

## 3) Forward pass — step by step

We compute:
- `z1` (first linear)
- `h1` (ReLU)
- `y_pred` (second linear)
- `loss` (MSE)

We also print the `grad_fn` node attached to each result:
- `AddmmBackward...` for Linear
- `ReluBackward...` for ReLU
- `MseLossBackward...` for the MSE loss

In [16]:
# Run the network one layer at a time so every intermediate value stays inspectable.
z1 = model[0](x)                  # first linear layer
h1 = model[1](z1)                 # ReLU
y_pred = model[2](h1)             # second linear layer
loss = criterion(y_pred, y_true)  # MSE against the target

print("FORWARD VALUES")
for label, tensor in (
    ("x", x),
    ("z1", z1),
    ("h1", h1),
    ("y_pred", y_pred),
    ("y_true", y_true),
    ("loss", loss),
):
    # Left-pad the label to 6 characters so the '=' signs line up.
    print(f"{label:<6} =", tensor.item())

print("\nAUTOGRAD NODES (grad_fn)")
for label, tensor in (("z1", z1), ("h1", h1), ("y_pred", y_pred), ("loss", loss)):
    # grad_fn is the autograd node that produced this tensor.
    print(f"{label + '.grad_fn':<14} =", tensor.grad_fn)

FORWARD VALUES
x      = 1.0
z1     = 2.0
h1     = 2.0
y_pred = 6.0
y_true = 1.0
loss   = 25.0

AUTOGRAD NODES (grad_fn)
z1.grad_fn     = <AddmmBackward0 object at 0x7b701aabab60>
h1.grad_fn     = <ReluBackward0 object at 0x7b701aabab60>
y_pred.grad_fn = <AddmmBackward0 object at 0x7b701aabab60>
loss.grad_fn   = <MseLossBackward0 object at 0x7b701aabab60>


## 4) Backward pass — gradients on parameters

After `loss.backward()`, gradients appear in `.grad` of **parameters**.

In [17]:
# Clear any gradients left from earlier backward calls, then backpropagate the loss.
model.zero_grad()
loss.backward()

print("GRADS AFTER BACKWARD")
# Report the output layer first, then the input layer.
for label, param in (
    ("w2", model[2].weight),
    ("b2", model[2].bias),
    ("w1", model[0].weight),
    ("b1", model[0].bias),
):
    print(f"dL/d{label} =", param.grad.item())

GRADS AFTER BACKWARD
dL/dw2 = 20.0
dL/db2 = 10.0
dL/dw1 = 30.0
dL/db1 = 30.0


## 5) ReLU gate demo: force `z1 <= 0` so gradients stop

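ReLU derivative:
- if `z1 > 0` → pass gradient (multiply by 1)
- if `z1 <= 0` → block gradient (multiply by 0)

Only gradients that flow *through* the ReLU are blocked (`w1`, `b1`). `b2` sits after the gate, so it still receives a gradient; `dL/dw2` is 0 here only because it is scaled by `h1 = 0`.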

In [7]:
# Choose layer-0 parameters that make z1 negative for x = 1 (z1 = -2 * 1 - 0.5),
# and keep layer 2 at w2 = 3, b2 = 0.
with torch.no_grad():
    for layer_idx, (weight_value, bias_value) in ((0, (-2.0, -0.5)), (2, (3.0, 0.0))):
        model[layer_idx].weight.fill_(weight_value)
        model[layer_idx].bias.fill_(bias_value)

# Feed x through the layers in order, keeping every intermediate output.
activations = []
signal = x
for layer in model:
    signal = layer(signal)
    activations.append(signal)
z1, h1, y_pred = activations
loss = criterion(y_pred, y_true)

print("FORWARD VALUES (ReLU OFF case)")
for label, tensor in (("z1", z1), ("h1", h1), ("y_pred", y_pred), ("loss", loss)):
    print(f"{label:<6} =", tensor.item())

# Reset stale gradients before backpropagating this new loss.
model.zero_grad()
loss.backward()

print("\nGRADS (ReLU OFF case)")
for label, param in (
    ("w2", model[2].weight),
    ("b2", model[2].bias),
    ("w1", model[0].weight),
    ("b1", model[0].bias),
):
    print(f"dL/d{label} =", param.grad.item())

FORWARD VALUES (ReLU OFF case)
z1     = -2.5
h1     = 0.0
y_pred = 0.0
loss   = 1.0

GRADS (ReLU OFF case)
dL/dw2 = 0.0
dL/db2 = -2.0
dL/dw1 = 0.0
dL/db1 = 0.0


In [43]:
from torch.autograd.graph import saved_tensors_hooks


In [44]:
# Hook demo: log every tensor autograd SAVES during the forward pass and every
# tensor it LOADS back during the backward pass.

# Pin the parameters to the values from section 1 (w1=2, b1=0, w2=3, b2=0).
# The ReLU demo above overwrites layer 0 in place, so without this reset the
# logged values would depend on which cells happened to run last.
with torch.no_grad():
    model[0].weight.fill_(2.0)
    model[0].bias.fill_(0.0)
    model[2].weight.fill_(3.0)
    model[2].bias.fill_(0.0)

# Re-created on every run so re-executing the cell does not accumulate old entries.
events = []

def _record(tag, t):
    """Append a (tag, shape, dtype, value) entry describing tensor ``t`` to ``events``."""
    # .item() only works for one-element tensors; fall back to a nested list otherwise.
    value = t.item() if t.numel() == 1 else t.tolist()
    events.append((tag, tuple(t.shape), str(t.dtype), value))

def pack_hook(t):
    """Called when autograd SAVES a tensor during forward; returns it unchanged."""
    _record("SAVE in forward", t)
    return t

def unpack_hook(t):
    """Called when backward READS a saved tensor; returns it unchanged."""
    _record("LOAD in backward", t)
    return t

# Create the inputs on the model's device; a CPU tensor fed to a CUDA model raises.
# requires_grad=True on x additionally asks autograd for dL/dx.
x = torch.tensor([[1.0]], device=device, requires_grad=True)
y_true = torch.tensor([[1.0]], device=device)

with saved_tensors_hooks(pack_hook, unpack_hook):
    y_pred = model(x)
    loss = criterion(y_pred, y_true)
    model.zero_grad()
    loss.backward()

for i, (tag, shape, dtype, val) in enumerate(events):
    print(f"{i:02d} | {tag:15s} | shape={shape} dtype={dtype} value={val}")

00 | SAVE in forward | shape=(1, 1) dtype=torch.float32 value=1.0
01 | SAVE in forward | shape=(1, 1) dtype=torch.float32 value=2.0
02 | SAVE in forward | shape=(1, 1) dtype=torch.float32 value=2.0
03 | SAVE in forward | shape=(1, 1) dtype=torch.float32 value=2.0
04 | SAVE in forward | shape=(1, 1) dtype=torch.float32 value=3.0
05 | SAVE in forward | shape=(1, 1) dtype=torch.float32 value=6.0
06 | SAVE in forward | shape=(1, 1) dtype=torch.float32 value=1.0
07 | LOAD in backward | shape=(1, 1) dtype=torch.float32 value=6.0
08 | LOAD in backward | shape=(1, 1) dtype=torch.float32 value=1.0
09 | LOAD in backward | shape=(1, 1) dtype=torch.float32 value=2.0
10 | LOAD in backward | shape=(1, 1) dtype=torch.float32 value=3.0
11 | LOAD in backward | shape=(1, 1) dtype=torch.float32 value=2.0
12 | LOAD in backward | shape=(1, 1) dtype=torch.float32 value=1.0
13 | LOAD in backward | shape=(1, 1) dtype=torch.float32 value=2.0
