In [1]:
import torch
import numpy as np

In [2]:
model = torch.nn.Sequential(
    torch.nn.Linear(2, 2, bias=True),
    torch.nn.ReLU(),
    torch.nn.Linear(2, 1, bias=False)
)

In [3]:
features = torch.tensor(np.random.normal(size=(32, 2)), dtype=torch.float)
labels = torch.cos(torch.sum(features, dim=-1, keepdim=True))

# Try one at a time

In [4]:
model.train()

pred = model.forward(features)
loss = torch.nn.functional.mse_loss(pred, labels)
loss.backward()

grads = {}
for name,param in model.named_parameters():
    grads[name] = param.grad.clone()

model.zero_grad()

grads

{'0.weight': tensor([[ 0.0018,  0.0008],
         [ 0.0815, -0.0533]]),
 '0.bias': tensor([-0.0008,  0.1022]),
 '2.weight': tensor([[0.0107, 0.0012]])}

# Try multiple times

In [5]:
model.train()

for _ in range(10):
    pred = model.forward(features)
    loss = torch.nn.functional.mse_loss(pred, labels)
    loss.backward()

grads = {}
for name,param in model.named_parameters():
    grads[name] = param.grad.clone()

model.zero_grad()

grads

{'0.weight': tensor([[ 0.0180,  0.0076],
         [ 0.8152, -0.5327]]),
 '0.bias': tensor([-0.0077,  1.0222]),
 '2.weight': tensor([[0.1066, 0.0117]])}

# Approx Fisher

In [14]:
model.train()

pred = model.forward(features)
loss = torch.nn.functional.mse_loss(pred, labels)
loss.backward()

grads = {}
for name,param in model.named_parameters():
    grads[name] = torch.square(param.grad.clone()) / 32.0

model.zero_grad()

print(torch.mean(torch.sum(torch.square(pred.detach() - labels), dim=1)).numpy().round(4) * 32.)
print(torch.mean(torch.sum(torch.square(pred.detach() - labels), dim=1)).numpy().round(4))
print(loss.detach().numpy().round(4))

grads

15.5936
0.4873
0.4873


{'0.weight': tensor([[1.0152e-07, 1.8115e-08],
         [2.0765e-04, 8.8678e-05]]),
 '0.bias': tensor([1.8462e-08, 3.2651e-04]),
 '2.weight': tensor([[3.5499e-06, 4.3038e-08]])}

In [15]:
model.train()

pred = model.forward(features)
loss = torch.square(pred - labels)
loss.backward()

grads = {}
for name,param in model.named_parameters():
    grads[name] = torch.square(param.grad.clone()) / 32.0

model.zero_grad()

print(loss.detach().numpy().round(4))

grads

RuntimeError: grad can be implicitly created only for scalar outputs

In [None]:
model.train()

for i in range(32):
    pred = model.forward(features[[i]])
    loss = torch.sum(torch.square(pred - labels[[i]]))
    loss.backward()

# print(pred.shape, loss.shape, features[[i]].shape, labels[[i]].shape) # WORKS!

grads = {}
for name,param in model.named_parameters():
    grads[name] = torch.square(param.grad.clone()) / 32.0

model.zero_grad()

print(loss.detach().numpy().round(4))

grads

torch.Size([1, 1]) torch.Size([]) torch.Size([1, 2]) torch.Size([1, 1])
0.8264


{'0.weight': tensor([[4.1582e-04, 7.4200e-05],
         [8.5054e-01, 3.6322e-01]]),
 '0.bias': tensor([7.5622e-05, 1.3374e+00]),
 '2.weight': tensor([[0.0145, 0.0002]])}

In [None]:
# THIS IS THE CORRECT WAY TO DO THIS! NEED TO SQUARE GRADS, THEN SUM

model.train()

grads = {}

for i in range(32):
    pred = model.forward(features[[i]])
    loss = torch.sum(torch.square(pred - labels[[i]]))
    loss.backward()

# print(pred.shape, loss.shape, features[[i]].shape, labels[[i]].shape) # WORKS!

    for name,param in model.named_parameters():
        if i == 0:
            grads[name] = torch.square(param.grad.clone()) / 32.0
        else:
            grads[name] += torch.square(param.grad.clone()) / 32.0

model.zero_grad()

print(loss.detach().numpy().round(4))

grads

0.8264


{'0.weight': tensor([[3.6525e-03, 8.1083e-04],
         [2.2168e+00, 9.9273e-01]]),
 '0.bias': tensor([1.3677e-03, 3.1243e+00]),
 '2.weight': tensor([[0.0915, 0.2735]])}