In [1]:
import torch
import torch.nn as nn

# Define the custom network with named ReLU layers
class MyNetwork(nn.Module):
    def __init__(self):
        super(MyNetwork, self).__init__()
        self.h1 = nn.Linear(3, 5)
        self.relu1 = nn.ReLU()
        self.h2 = nn.Linear(5, 2)
        self.relu2 = nn.ReLU()
        self.h3 = nn.Linear(2, 3)
        self.relu3 = nn.ReLU()
        self.h4 = nn.Linear(3, 2)
        self.relu4 = nn.ReLU()
        self.h5 = nn.Linear(2, 2)
        self.relu5 = nn.ReLU()
        self.h6 = nn.Linear(2, 2)
        self.relu6 = nn.ReLU()
        self.h7 = nn.Linear(2, 1)

        self.h1.weight.data = torch.tensor([[0, 0, 1],[0, 1, 0],[1, 0, 0],[1, 1, 0],[0, 1, 1]], dtype=torch.float32)
        self.h2.weight.data = torch.tensor([[1, 1, -1, 0, 0],[0, 0, 1, 1, -1]], dtype=torch.float32)
        self.h3.weight.data = torch.tensor([[1, 1],[1, -1],[1, 2]], dtype=torch.float32)
        self.h4.weight.data = torch.tensor([[1, -1, 0],[0, -1, 1]], dtype=torch.float32)
        self.h5.weight.data = torch.tensor([[0, 1],[1, 0]], dtype=torch.float32)
        self.h6.weight.data = torch.tensor([[1, -1],[1, 1]], dtype=torch.float32)
        self.h7.weight.data = torch.tensor([[1, -1]], dtype=torch.float32)

        B = torch.tensor([0, 0, 0, 0, 0], dtype=torch.float32)
        self.h1.bias.data = B
        self.h2.bias.data = B[:2]
        self.h3.bias.data = B[:3]
        self.h4.bias.data = B[:2]
        self.h5.bias.data = B[:2]
        self.h6.bias.data = B[:2]
        self.h7.bias.data = torch.tensor([0.], dtype=torch.float32)

    def forward(self, input):
        out = self.relu1(self.h1(input))
        out = self.relu2(self.h2(out))
        out = self.relu3(self.h3(out))
        out = self.relu4(self.h4(out))
        out = self.relu5(self.h5(out))
        out = self.relu6(self.h6(out))
        out = self.h7(out)
        return out

# Custom MSE loss using autograd
class CustomMSELossFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, target):
        ctx.save_for_backward(input, target)
        return ((input - target) ** 2).mean()

    @staticmethod
    def backward(ctx, grad_output):
        input, target = ctx.saved_tensors
        grad_input = 2.0 * (input - target) / input.size(0) * grad_output
        print("grad_output from loss:", grad_output)
        print("grad_input from loss:", grad_input )
        return grad_input, None

# Register backward hooks
def register_hooks(model):
    hooks = []

    relu_count = 0
    linear_count = 0

    for layer in model.modules():
        if isinstance(layer, nn.ReLU):
            layer_id = relu_count
            relu_count += 1

            def make_relu_forward_hook(index):
                def relu_forward_hook(module, input, output):
                    print(f"\n>>> ReLU Forward [Layer {index}]")
                    print(f"Input to ReLU:\n{input[0]}")
                    print(f"Output from ReLU:\n{output}")
                return relu_forward_hook

            def make_relu_backward_hook(index):
                def relu_backward_hook(module, grad_input, grad_output):
                    print(f"\n<<< ReLU Backward [Layer {index}]")
                    print(f"Gradient BEFORE ReLU (grad_output):\n{grad_output[0]}")
                    print(f"Gradient AFTER ReLU (grad_input):\n{grad_input[0]}")
                return relu_backward_hook

            hooks.append(layer.register_forward_hook(make_relu_forward_hook(layer_id)))
            hooks.append(layer.register_full_backward_hook(make_relu_backward_hook(layer_id)))

        elif isinstance(layer, nn.Linear):
            layer_id = linear_count
            linear_count += 1

            def make_linear_backward_hook(index):
                def linear_backward_hook(module, grad_input, grad_output):
                    print(f"\n<<< Linear Backward [Layer {index}]")
                    print(f"grad_input:\n{grad_input}")
                    print(f"grad_output:\n{grad_output}")
                return linear_backward_hook

            hooks.append(layer.register_full_backward_hook(make_linear_backward_hook(layer_id)))

    return hooks

# Input and target
x = torch.tensor([[3, 4, 5], [5, 4, 3]], dtype=torch.float32, requires_grad=True)
target = torch.tensor([[1.], [2.]])

# Initialize model
model = MyNetwork()
model.train()

# Register hooks
hooks = register_hooks(model)

# Forward and backward
output = model(x)
loss = CustomMSELossFunction.apply(output, target)
print("Loss:", loss.item())
loss.backward()

# Clean up
for h in hooks:
    h.remove()



>>> ReLU Forward [Layer 0]
Input to ReLU:
tensor([[5., 4., 3., 7., 9.],
        [3., 4., 5., 9., 7.]], grad_fn=<BackwardHookFunctionBackward>)
Output from ReLU:
tensor([[5., 4., 3., 7., 9.],
        [3., 4., 5., 9., 7.]], grad_fn=<ReluBackward0>)

>>> ReLU Forward [Layer 1]
Input to ReLU:
tensor([[6., 1.],
        [2., 7.]], grad_fn=<BackwardHookFunctionBackward>)
Output from ReLU:
tensor([[6., 1.],
        [2., 7.]], grad_fn=<ReluBackward0>)

>>> ReLU Forward [Layer 2]
Input to ReLU:
tensor([[ 7.,  5.,  8.],
        [ 9., -5., 16.]], grad_fn=<BackwardHookFunctionBackward>)
Output from ReLU:
tensor([[ 7.,  5.,  8.],
        [ 9.,  0., 16.]], grad_fn=<ReluBackward0>)

>>> ReLU Forward [Layer 3]
Input to ReLU:
tensor([[ 2.,  3.],
        [ 9., 16.]], grad_fn=<BackwardHookFunctionBackward>)
Output from ReLU:
tensor([[ 2.,  3.],
        [ 9., 16.]], grad_fn=<ReluBackward0>)

>>> ReLU Forward [Layer 4]
Input to ReLU:
tensor([[ 3.,  2.],
        [16.,  9.]], grad_fn=<BackwardHookFunctionBac

# Epoch 2

In [12]:

import torch
import torch.nn as nn

# Track current epoch externally
current_epoch = {'epoch': 0}  # Use dict to allow mutation from inner scope

# Define the custom network with named ReLU layers
class MyNetwork(nn.Module):
    def __init__(self):
        super(MyNetwork, self).__init__()
        self.h1 = nn.Linear(3, 5)
        self.relu1 = nn.ReLU()
        self.h2 = nn.Linear(5, 2)
        self.relu2 = nn.ReLU()
        self.h3 = nn.Linear(2, 3)
        self.relu3 = nn.ReLU()
        self.h4 = nn.Linear(3, 2)
        self.relu4 = nn.ReLU()
        self.h5 = nn.Linear(2, 2)
        self.relu5 = nn.ReLU()
        self.h6 = nn.Linear(2, 2)
        self.relu6 = nn.ReLU()
        self.h7 = nn.Linear(2, 1)

        self.h1.weight.data = torch.tensor([[0, 0, 1],[0, 1, 0],[1, 0, 0],[1, 1, 0],[0, 1, 1]], dtype=torch.float32)
        self.h2.weight.data = torch.tensor([[1, 1, -1, 0, 0],[0, 0, 1, 1, -1]], dtype=torch.float32)
        self.h3.weight.data = torch.tensor([[1, 1],[1, -1],[1, 2]], dtype=torch.float32)
        self.h4.weight.data = torch.tensor([[1, -1, 0],[0, -1, 1]], dtype=torch.float32)
        self.h5.weight.data = torch.tensor([[0, 1],[1, 0]], dtype=torch.float32)
        self.h6.weight.data = torch.tensor([[1, -1],[1, 1]], dtype=torch.float32)
        self.h7.weight.data = torch.tensor([[1, -1]], dtype=torch.float32)

        B = torch.tensor([0, 0, 0, 0, 0], dtype=torch.float32)
        self.h1.bias.data = B
        self.h2.bias.data = B[:2]
        self.h3.bias.data = B[:3]
        self.h4.bias.data = B[:2]
        self.h5.bias.data = B[:2]
        self.h6.bias.data = B[:2]
        self.h7.bias.data = torch.tensor([0.], dtype=torch.float32)

    def forward(self, input):
        out = self.relu1(self.h1(input))
        out = self.relu2(self.h2(out))
        out = self.relu3(self.h3(out))
        out = self.relu4(self.h4(out))
        out = self.relu5(self.h5(out))
        out = self.relu6(self.h6(out))
        out = self.h7(out)
        return out

# Custom MSE loss using autograd
class CustomMSELossFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, target):
        ctx.save_for_backward(input, target)
        return ((input - target) ** 2).mean()

    @staticmethod
    def backward(ctx, grad_output):
        print("*"*10)
        input, target = ctx.saved_tensors
        print(input.numel(),input.size(0))
        batch_size = input.size(0)
        grad_input = 2.0 * (input - target) / batch_size #input.numel() --> Returns the total number of elements in the tensor, across all dimensions.

        print("grad_output from loss:", grad_output)
        print("grad_input from loss:", grad_input * grad_output)
        return grad_input * grad_output, None

# Register hooks
def register_hooks(model, current_epoch):
    hooks = []

    def make_forward_hook(name, module):
        def forward_hook(module, input, output):
            if current_epoch['epoch'] == 1:  # Only print during epoch 2
                print(f"\n>>> Forward Output from {name}")
                print(output)
                if isinstance(module, nn.Linear):
                    print(f">>> Weights of {name} after forward:")
                    print(module.weight.data)
                    if module.bias is not None:
                        print(f">>> Bias of {name} after forward:")
                        print(module.bias.data)
        return forward_hook

    layer_index = 0
    for name, module in model.named_modules():
        if isinstance(module, (nn.Linear, nn.ReLU)):
            hooks.append(module.register_forward_hook(make_forward_hook(f"{name} [Layer {layer_index}]", module)))
            layer_index += 1

    return hooks


# Input and target
x = torch.tensor([[3, 4, 5], [5, 4, 3]], dtype=torch.float32, requires_grad=True)
target = torch.tensor([[1.], [2.]])

# Initialize model
model = MyNetwork()
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Register hooks
hooks = register_hooks(model, current_epoch)

# Train for 2 epochs
for epoch in range(2):
    current_epoch['epoch'] = epoch
    print(f"\n========== EPOCH {epoch + 1} ==========")

    optimizer.zero_grad()
    output = model(x)
    loss = CustomMSELossFunction.apply(output, target)
    print("Loss:", loss.item())
    loss.backward()
    optimizer.step()

# Clean up
for h in hooks:
    h.remove()



Loss: 212.5
**********
2 2
grad_output from loss: tensor(1.)
grad_input from loss: tensor([[ -5.],
        [-20.]])


>>> Forward Output from h1 [Layer 0]
tensor([[-194.5000, -196.5000,  -99.0000, -283.0000,  299.0000],
        [-212.5000, -212.5000,  -89.0000, -289.0000,  305.0000]],
       grad_fn=<AddmmBackward0>)
>>> Weights of h1 [Layer 0] after forward:
tensor([[-20., -16., -11.],
        [-20., -15., -12.],
        [ -5.,  -8., -10.],
        [-25., -23., -22.],
        [ 26.,  25.,  23.]])
>>> Bias of h1 [Layer 0] after forward:
tensor([-15.5000, -16.5000,  -2.0000,  -6.0000,   6.0000])

>>> Forward Output from relu1 [Layer 1]
tensor([[  0.,   0.,   0.,   0., 299.],
        [  0.,   0.,   0.,   0., 305.]], grad_fn=<ReluBackward0>)

>>> Forward Output from h2 [Layer 2]
tensor([[ -8387.5000, -14069.5000],
        [ -8555.5000, -14351.5000]], grad_fn=<AddmmBackward0>)
>>> Weights of h2 [Layer 2] after forward:
tensor([[-11., -15., -21., -36., -28.],
        [-22., -24., -25., -49