In [6]:
import torch

a = torch.randn((3,3), requires_grad = True)

w1 = torch.randn((3,3), requires_grad = True)
w2 = torch.randn((3,3), requires_grad = True)
w3 = torch.randn((3,3), requires_grad = True)
w4 = torch.randn((3,3), requires_grad = True)

b = w1*a
c = w2*a

d = w3*b + w4*c

L = 10 - d

print("The grad fn for a is", a.grad_fn)
print(a.is_leaf) # leaf nodes do not have gradient information
print("The grad fn for d is", d.grad_fn) # addbackward = addition operation
print(d.is_leaf)

# NOTE: 
# The forward function of the grad_fn of 'd' receives inputs w3*b and w4*c
# and adds them. This value is stored in 'd'
# The backward function of the grad_fn = <AddBackward> takes the
# incoming gradient from further layers, as its own input. This is dL/dd
# coming along the edge leading from 'L' to 'd'. 
# This dL/dd is stored in the d.grad
print(d.grad) # none yet
# Then the backward function computes local gradients dd/d(w4*c) and
# dd/d(w3*b)
# Then multiplies the incoming gradient dL/dd with the local gradients
# above respectively and sends the gradients to its inputs by 
# invoking the backward method of the grad_fn of their inputs. 

The grad fn for a is None
True
The grad fn for d is <ThAddBackward object at 0x7f68171652e8>
False
None


In [None]:
# Example of workings behind backward() function in pytorch
"""
def backward(incomingGradients):
    self.Tensor.grad = incomingGradients
    
    for input in self.inputs:
        if input.grad_fn is not None: 
            newIncomingGradients = \
                incomingGradient * localGrad(self.Tensor, input)
            
            input.grad_fn.backward(newIncomingGradients)
        else:
            pass 
"""

In [7]:
# NOTE: can only call backward() on scalar-valued tensor (0-dim)
L.backward()

# This is because by definition, gradients can be computed with respect
# to SCALAR variables only. Can't differentiate a vector with respect
# to another vector here (Jacobian is key term here)

RuntimeError: grad can be implicitly created only for scalar outputs

In [19]:
# Two possible workarounds: 


# METHOD 1: setting L to the sum of all the errors:
torch.manual_seed(1)
a = torch.randn((3,3), requires_grad = True)

w1 = torch.randn((3,3), requires_grad = True)
w2 = torch.randn((3,3), requires_grad = True)
w3 = torch.randn((3,3), requires_grad = True)
w4 = torch.randn((3,3), requires_grad = True)

b = w1*a
c = w2*a

d = w3*b + w4*c

# METHOD 1: setting L to the sum of all the errors:
L = 10 - d
print("L1: ", L)
L = (10 - d).sum()
print("L1 sum: ", L)
L.backward()
print("w1.grad: ", w1.grad)
print("w2.grad: ", w2.grad)
print("w3.grad: ", w3.grad)
print("w4.grad: ", w4.grad)

L1:  tensor([[11.8059, 10.0897, 10.1741],
        [10.1589,  9.9698,  9.8614],
        [ 8.7667, 10.2584,  9.1030]], grad_fn=<AddBackward>)
L1 sum:  tensor(90.1878, grad_fn=<SumBackward0>)
w1.grad:  tensor([[-1.6723, -0.1096,  0.0609],
        [ 0.5642,  0.2451,  0.0183],
        [-3.4399, -0.2316, -0.1421]])
w2.grad:  tensor([[-0.5496,  0.0661,  0.0495],
        [-0.1470,  0.1291,  0.1146],
        [-0.9640, -0.3357, -0.7031]])
w3.grad:  tensor([[ 0.3724,  0.2382,  0.0036],
        [ 0.1215, -0.4364,  0.0702],
        [ 0.4071,  0.1608, -0.5248]])
w4.grad:  tensor([[ 1.0401,  0.0329, -0.2212],
        [ 1.1378,  0.7225, -0.2121],
        [ 0.4957,  0.1829,  1.4171]])


In [20]:
# METHOD 2: 
torch.manual_seed(1)
a = torch.randn((3,3), requires_grad = True)

w1 = torch.randn((3,3), requires_grad = True)
w2 = torch.randn((3,3), requires_grad = True)
w3 = torch.randn((3,3), requires_grad = True)
w4 = torch.randn((3,3), requires_grad = True)

b = w1*a
c = w2*a

d = w3*b + w4*c

# METHOD 2: if there is some reason you need to call backward on a 
# vector function then you can pass a torch.ones of the same size and
# shape of teh tensor you are trying to call backward with
L = 10 - d
L.backward(torch.ones(L.shape))

print("w1.grad: ", w1.grad)
print("w2.grad: ", w2.grad)
print("w3.grad: ", w3.grad)
print("w4.grad: ", w4.grad)

w1.grad:  tensor([[-1.6723, -0.1096,  0.0609],
        [ 0.5642,  0.2451,  0.0183],
        [-3.4399, -0.2316, -0.1421]])
w2.grad:  tensor([[-0.5496,  0.0661,  0.0495],
        [-0.1470,  0.1291,  0.1146],
        [-0.9640, -0.3357, -0.7031]])
w3.grad:  tensor([[ 0.3724,  0.2382,  0.0036],
        [ 0.1215, -0.4364,  0.0702],
        [ 0.4071,  0.1608, -0.5248]])
w4.grad:  tensor([[ 1.0401,  0.0329, -0.2212],
        [ 1.1378,  0.7225, -0.2121],
        [ 0.4957,  0.1829,  1.4171]])


In [21]:
# Notice how backward used to take incoming gradients as it's input.
# Doing the above makes the backward think that incoming gradient are 
# just Tensor of ones of same size as L, and it's able to backpropagate.
# 
# In this way, we can have gradients for every Tensor , and we can 
# update them using Optimisation algorithm of our choice.
learningRate = 0.5
w1 = w1 - learningRate * w1.grad


In [22]:
w1

tensor([[ 0.2731, -0.8375, -0.0887],
        [-0.4776, -1.0882,  0.4133],
        [ 1.9873, -0.3054, -0.4396]], grad_fn=<ThSubBackward>)

Tidbit on gradient accumulation and retain_graph argument:
https://hyp.is/q_cZGAFrEeqh-IuJztoNmg/blog.paperspace.com/pytorch-101-understanding-graphs-and-automatic-differentiation/