# 1. Imports

In [1]:
# importing other dependencies
import numpy as np

In [2]:
# importing PyTorch
import torch

# checks whether MPS is available
print(torch.backends.mps.is_available())

# this ensures that the current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

# setting the device to "mps" when it is available, otherwise falling back to the default "cpu".
# NOTE: is_available has to be *called* here -- the bare function object is always truthy,
# so without the parentheses "mps" would be selected even when MPS is not available.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

True
True


# 2. Autograd Package

The Autograd Package provides automatic differentiation for all operations on Tensors.

Setting `requires_grad=True` on a tensor makes autograd track all operations on that tensor, so that its gradient can be computed at any step (via backpropagation).

# 3. Calculating the gradients

In [3]:
# random 3-element tensor created with requires_grad=True, so operations on it are tracked
x = torch.rand((3,), requires_grad=True)
print(x)

tensor([0.2020, 0.1253, 0.0450], requires_grad=True)


In [5]:
# We apply different operations to x, store the result in y and inspect it.
# Because y is the result of an operation on x, it carries a grad_fn attribute:
# grad_fn references the function that created the tensor.

# addition -> grad_fn: AddBackward
y = x + 2
print(y, y.grad_fn, sep="\n")

# multiplication -> grad_fn: MulBackward
y = x * 2
print(y, y.grad_fn, sep="\n")

tensor([2.2020, 2.1253, 2.0450], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x10a6d9cc0>
tensor([0.4040, 0.2506, 0.0900], grad_fn=<MulBackward0>)
<MulBackward0 object at 0x11e6e0700>


We now compute the gradients with backpropagation. When we finish our computation we can call `.backward()` and have all the gradients computed automatically.

The gradient for this tensor is accumulated into its `.grad` attribute. It is the partial derivative of the function w.r.t. the tensor.

In [8]:
# Gradients of a function (say z) w.r.t. x (i.e. dz/dx) are obtained via backpropagation:
# call `z.backward()` first, then read the result from `x.grad`
y = 3 * x * x
z = torch.mean(y)
print(z, z.grad_fn, sep="\n")

# the backward pass stores dz/dx in x.grad
z.backward()
print(x.grad)

# No argument is passed to z.backward() here because z holds a single element (a scalar);
# the gradient is evaluated at the current values of x.

# A z with more than one element would need an argument for backward(): a tensor that is
# multiplied with the Jacobian matrix to produce the gradient w.r.t. x.


tensor(0.0585, grad_fn=<MeanBackward0>)
<MeanBackward0 object at 0x105d5fc10>
tensor([0.4040, 0.2506, 0.0900])


`torch.autograd` is an engine for computing *vector-Jacobian* products. It computes partial derivatives while applying the chain rule.

*Without arguments, `.backward()` requires a scalar output (exactly 1 element); calling it on a non-scalar output raises an error:*

In [9]:
# z has as many elements as x here (it is not a scalar), so calling backward()
# without an argument is expected to fail.
y = 3*x*x
z = y/3
print(z)
print(z.grad_fn)

# attempting to calculate dz/dx
try:
    z.backward()
except RuntimeError as err:
    # The error is the point of this cell. Catching and printing it shows the
    # message without aborting a "Restart Kernel -> Run All" of the notebook.
    print(f"RuntimeError: {err}")
else:
    print(x.grad)

tensor([0.0408, 0.0157, 0.0020], grad_fn=<DivBackward0>)
<DivBackward0 object at 0x105d996c0>


RuntimeError: grad can be implicitly created only for scalar outputs

*Model with non-scalar output:*

- If a Tensor is non-scalar (more than 1 element), we need to pass a `gradient` argument to `backward()`: a tensor of matching shape, which is needed for the vector-Jacobian product.

In [10]:
y = 3*x*x
z = y/3
print(z)
print(z.grad_fn)

# z is non-scalar, so backward() is given a tensor with the same shape as z;
# the gradient is calculated by multiplying this argument by the Jacobian matrix.
# torch.ones_like(z) builds a vector of ones that follows z's shape and dtype,
# instead of a hard-coded 3-element integer tensor.
z.backward(torch.ones_like(z))

# NOTE: x.grad accumulates across backward() calls. If an earlier cell already ran a
# backward pass through x, the value printed here is the running sum, not the
# gradient of this z alone.
print(x.grad)

tensor([0.0408, 0.0157, 0.0020], grad_fn=<DivBackward0>)
<DivBackward0 object at 0x11e7fd9c0>
tensor([0.8081, 0.5012, 0.1800])


# 4. Stop tracking history of gradients

During our training loop, when we want to update our weights, the update operation should not be part of the gradient computation. We can make sure this calculation is not tracked (i.e. is excluded from gradient calculations) by using one of the following methods:
- `x.requires_grad_(False)`
- `x.detach()`
- wrap in `with torch.no_grad():`

**4.1** `.requires_grad_(..)` changes an existing flag in-place:

In [22]:
# a tensor created without requires_grad does not track gradients
a = torch.randn((2, 2))
print(a.requires_grad)

# a result computed from it does not track gradients either and has no grad_fn
b = (a * 3) / (a - 1)
print(b.requires_grad, b.grad_fn, sep="\n")

False
False
None


In [23]:
# switch gradient tracking on for the existing tensor, in-place
a.requires_grad_(requires_grad=True)
print(a.requires_grad)

# a result computed from a now tracks gradients and records its grad_fn
b = torch.sum(a * a)
print(b.requires_grad, b.grad_fn, sep="\n")

True
True
<SumBackward0 object at 0x1225b3520>


**4.2** `.detach()` returns a new Tensor with the same content but no gradient computation:

In [15]:
# start from a tensor that tracks gradients
a = torch.randn((2, 2), requires_grad=True)
print(a.requires_grad)

# detach() hands back a new tensor that does not require gradients
b = a.detach()
print(b.requires_grad)

# detach_() is the in-place variant: a itself stops requiring gradients
a.detach_()
print(a.requires_grad)

True
False
False


**4.3** wrap in `with torch.no_grad()`:

In [24]:
# a tracks gradients ...
a = torch.randn((2, 2), requires_grad=True)
print(a.requires_grad)

# ... but a result computed inside the no_grad() context does not, and has no grad_fn
with torch.no_grad():
    b = torch.sum(a * 2)
print(b.requires_grad, b.grad_fn, sep="\n")

True
False
None


**4.4** Why do we need to stop tracking the history of gradients?

The `.backward()` call accumulates the gradient for this tensor into its `.grad` attribute, i.e. each call adds the new gradient to the value that is already stored there.

**!!! Need to be careful during optimization !!!**

Use `.zero_()` to empty the gradients before a *new optimization step*!

In [29]:
weights = torch.ones((4,), requires_grad=True)
print(weights)

# in this run the grads are deliberately NOT reset to zero between iterations

for epoch in range(3):
    # dummy operation standing in for a model
    model_output = torch.sum(weights * 3)
    # backward pass: gradients of model_output are added into weights.grad
    model_output.backward()

    # inspect the gradient after every epoch: it keeps growing (accumulating),
    # so in this form it can't be used to update the weights
    print(weights.grad)

tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


In [30]:
weights = torch.ones((4,), requires_grad=True)
print(weights)

# in this run the grads are reset to zero at the end of every iteration

for epoch in range(3):
    # dummy operation standing in for a model
    model_output = torch.sum(weights * 3)
    # backward pass: gradients of model_output are added into weights.grad
    model_output.backward()

    print(weights.grad)

    # update the weights with a learning rate of 0.1; the update sits inside
    # `torch.no_grad()` so that it does not take part in the gradient calculation
    with torch.no_grad():
        weights.sub_(0.1 * weights.grad)

    # zero the gradients in-place so the next backward() starts from zero
    # instead of accumulating on top of this epoch's values
    weights.grad.zero_()

print(weights)

tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)


The `Optimizer` has a `zero_grad()` method that resets the gradients. Example:
```Python
optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
optimizer.step()
optimizer.zero_grad()
```