In [1]:
import torch

In [4]:
## Build an ordinary tensor.
## Unless asked to, a tensor does not track gradients:
## its .grad is None and its .requires_grad flag is False.

x = torch.rand((5,))
print(x, x.grad, x.requires_grad)

tensor([0.7324, 0.2427, 0.9295, 0.0753, 0.8384]) None False


## When final output is a scalar

In [5]:
## Pass requires_grad=True so that autograd tracks this tensor.
## Every operation applied to it is then recorded in a computational
## graph, which is what the gradient is later computed from.

x = torch.rand(5, requires_grad=True)
print(x)

## forward pass: y = x + 2, then z = mean(2 * y * y)
y = x + 2
z = (y * y * 2).mean()

## every result derived from x carries a grad_fn node of that graph
print(y)
print(z)

## z is a scalar, so .backward() takes no argument here;
## it starts the backward pass and computes dz/dx
z.backward()

## the gradient has been propagated all the way back to x (the input)
print(x.grad)

tensor([0.5811, 0.4933, 0.3618, 0.4305, 0.6741], requires_grad=True)
tensor([2.5811, 2.4933, 2.3618, 2.4305, 2.6741], grad_fn=<AddBackward0>)
tensor(12.6058, grad_fn=<MeanBackward0>)
tensor([2.0649, 1.9946, 1.8894, 1.9444, 2.1393])


## When final output is a vector

In [8]:
## Pass requires_grad=True so that autograd tracks this tensor.
## Every operation applied to it is then recorded in a computational
## graph, which is what the gradient is later computed from.

x = torch.rand(5, requires_grad=True)
print(x)

## forward pass: y = x + 2, then z = 2 * y * y (no reduction, z stays a vector)
y = x + 2
z = (y * y) * 2

## every result derived from x carries a grad_fn node of that graph
print(y)
print(z)

## .backward() without arguments only works for a scalar output.
## For a vector z we have to supply a tensor of the same shape as z;
## autograd then computes the vector-Jacobian product with it.

v = torch.ones_like(z)
z.backward(v)  ## with v all ones this yields dz_i/dx_i for every element

## the gradient has been propagated all the way back to x (the input)
print(x.grad)

tensor([0.6953, 0.3307, 0.0088, 0.0932, 0.3140], requires_grad=True)
tensor([2.6953, 2.3307, 2.0088, 2.0932, 2.3140], grad_fn=<AddBackward0>)
tensor([14.5297, 10.8647,  8.0704,  8.7628, 10.7094], grad_fn=<MulBackward0>)
tensor([10.7814,  9.3230,  8.0351,  8.3727,  9.2561])


## When we do not need to compute gradient

In [10]:
## Typical use case: updating the weights with their gradient, a step
## that must not itself be recorded by autograd.

## There are 3 ways to stop gradient tracking.

## 1. .requires_grad_(False)
## The trailing underscore means the flag is changed in place on x.

x = torch.randn(5, requires_grad=True)
print(x)
x.requires_grad_(False)
print(x)
print("-" * 30)

## 2. .detach()
## Returns a new tensor that is cut off from the graph and does not
## require grad. It is not a deep copy: it shares storage with x.

x = torch.randn(5, requires_grad=True)
print(x)
y = x.detach()
print(y)
print("-" * 30)

## 3. with torch.no_grad():
## Operations executed inside the block are not tracked.

x = torch.randn(5, requires_grad=True)
print(x)
with torch.no_grad():
    y = x + 2
    print(y)


tensor([ 2.5680,  0.0068, -0.6652,  1.9122, -0.7000], requires_grad=True)
tensor([ 2.5680,  0.0068, -0.6652,  1.9122, -0.7000])
------------------------------
tensor([ 1.0561, -0.1023,  2.0452,  0.4143,  0.4173], requires_grad=True)
tensor([ 1.0561, -0.1023,  2.0452,  0.4143,  0.4173])
------------------------------
tensor([ 0.6888, -0.9357, -1.0613, -1.4771,  0.6907], requires_grad=True)
tensor([2.6888, 1.0643, 0.9387, 0.5229, 2.6907])


## Zero the gradient after each iteration

In [12]:
## PyTorch accumulates into the .grad attribute: every backward() call
## adds the new gradient to whatever is already stored there.
## Example:

weights = torch.ones(4, requires_grad=True)
for epoch in range(3):
    model_output = (3 * weights).sum()  # reduce to a scalar so backward() needs no argument
    model_output.backward()
    print(weights.grad)

## To get the gradient of the current iteration only,
## reset .grad to zero after every iteration.

print("-" * 30)

weights.grad.zero_()  # discard what the loop above accumulated
for epoch in range(3):
    model_output = (3 * weights).sum()  # reduce to a scalar so backward() needs no argument
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])
------------------------------
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
