GRADIENT CALCULATION WITH AUTOGRAD

In [13]:
import torch

x = torch.randn(3,requires_grad = True)  #let's say later we want to calculate gradient of some function with respect to x
print(x)  # by default requires_grad = false
# Now whenever we do any operation with this tensor pytorch will create a computational graph 
# Then using back propagation we can calculate the gradients

tensor([ 1.2320, -0.0858, -0.3742], requires_grad=True)


In [14]:
y = x+2
print(y)

tensor([3.2320, 1.9142, 1.6258], grad_fn=<AddBackward0>)


In [15]:
z = y*y*2
print(z)

tensor([20.8917,  7.3285,  5.2864], grad_fn=<MulBackward0>)


In [16]:
z = z.mean()
print(z)

tensor(11.1689, grad_fn=<MeanBackward0>)


In [17]:
#Now to calculate gradient of z with respect to x
z.backward() #dz/dx
print(x.grad)
print(y.grad)

tensor([4.3093, 2.5523, 2.1677])
None


  print(y.grad)


In [18]:
#now if we did not apply the mean operation in the end then z.backward() will produce an error
x = torch.randn(3,requires_grad = True) 
y = x+2
z = y*y*2
print(z)
z.backward() #dz/dx



tensor([ 2.7766, 18.4878, 12.1180], grad_fn=<MulBackward0>)


RuntimeError: grad can be implicitly created only for scalar outputs

So we see that, we can call z.backward() if z is a scallar value

Another solution is...

In [19]:
v = torch.tensor([0.1,0.2,0.001],dtype = torch.float32) #create a same size vector as v
z.backward(v) #dz/dx
print(x.grad)

tensor([0.4713, 2.4323, 0.0098])


HOW TO PREVENT PYTORCH FROM TRACKING HISTORY & CALCULATING grad_fn ATTRIBUTE

In [20]:
#sometimes in our training loop when we update weights those operations should not be part of our gradient computation
#There are 3 options for this
#1. x.requires_grad_(False)
#2. x.detach() will create a new tensor with same values but that does not require gradient
#3. wrap this in a with statement -> with torch.no_grad():

# Method 1
x = torch.randn(3,requires_grad = True)  
print(x)
x.requires_grad_(False) #trailing underscore inplace operation 
print(x)


tensor([ 0.1283,  0.7965, -0.0334], requires_grad=True)
tensor([ 0.1283,  0.7965, -0.0334])


In [21]:
#Method 2
x = torch.randn(3,requires_grad = True)  
print(x)
y = x.detach()
print(y)


tensor([-0.8754, -0.5545, -0.9589], requires_grad=True)
tensor([-0.8754, -0.5545, -0.9589])


In [22]:
x = torch.randn(3,requires_grad = True)  
y = x+2
print(y)
#Method 3

with torch.no_grad():
    y = x+2
    print(y)

tensor([2.1812, 3.7468, 0.9191], grad_fn=<AddBackward0>)
tensor([2.1812, 3.7468, 0.9191])


Why do we need to clear the gradients before the next epoch

In [23]:
weights = torch.ones(4,requires_grad =True)

for epoch in range(3):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)
    

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


Here we can see that after 2nd epoch we are getting gradients as summation of gradients of first 2 epochs , and after 3rd epoch we are getting gradients as summation of gradients first 3 epochs

In [26]:
#Solution
weights = torch.ones(4,requires_grad =True)
for epoch in range(3):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)
    
    weights.grad.zero_()  #clearing the gradients inplace

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


SIMILARLY WE HAVE TO CLEAR THE GRADIENTS WHEN WE WORK WITH OPTIMIZERs

In [28]:
# weights = torch.ones(4,requires_grad =True)

# optimizer = torch.optim.SGD(weights,lr=0.01)
# optimizer.step()
# optimizer.zero_grad()