In [1]:
import numpy as np
import torch

# Auto Gradient

By setting ``requires_grad=True`` on a ``torch.Tensor``, we tell ``torch`` to calculate gradients for all such leaf nodes and store them in the ``.grad`` attribute of each Tensor.

The non-leaf (output) node that we call ``.backward()`` on needs to be a scalar if we want to compute gradients of the leaf nodes.

This is usually the case, since the output node is some loss function, which is nothing but a scalar.

When we say gradients will be calculated for leaf nodes, we mean gradients of the output node with respect to all elements of all leaf nodes (e.g. the output node is some loss function denoted as $L$).

This is used during training, where we need gradients of Loss function $L$ with respect to all parameters to optimize our parameters.

### Example 1
``c = torch.sum(a + b) ``

If we enable gradient calculations for ``a`` and ``b``, following gradients will be calculated:
 - ``dc/da`` - for each element of a independently
 - ``dc/db`` - for each element of b independently

``c`` needs to be scalar (e.g. for training network that is our Loss Function - which is scalar)

In [2]:
# Seed the global RNG so the random tensors (and every output derived from
# them) are identical on each Restart & Run All.
torch.manual_seed(0)

# requires_grad=True marks a and b as leaf tensors whose gradients autograd
# should track. Equivalent spellings on an already-created tensor:
#   a.requires_grad_()
#   a.requires_grad = True
a = torch.randn(2, 2, requires_grad=True)
b = torch.randn(2, 2, requires_grad=True)
# Reduce to a scalar so that c.backward() can be called without arguments.
c = torch.sum(a + b)

In [3]:
# Backpropagate from the scalar c; this populates a.grad and b.grad.
c.backward()

In [4]:
a.grad

tensor([[1., 1.],
        [1., 1.]])

In [5]:
b.grad

tensor([[1., 1.],
        [1., 1.]])

In [6]:
c.grad

  c.grad


In [7]:
a

tensor([[ 1.0796,  0.3734],
        [ 0.4797, -1.6013]], requires_grad=True)

In [8]:
b

tensor([[-0.8128, -1.8824],
        [ 1.6154, -0.2534]], requires_grad=True)

In [9]:
c

tensor(-1.0019, grad_fn=<SumBackward0>)

### Example 2

``b = a ** 2``

``c = b.mean()``

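``grad`` will be computed only for ``a``, not for ``b``. One might expect it to be computed for ``b`` as well, but ``b`` is a non-leaf (intermediate) tensor, and autograd only populates ``.grad`` on leaf tensors by default. Call ``b.retain_grad()`` before ``c.backward()`` if the gradient with respect to ``b`` is needed.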

In [10]:
# Forward pass: a (leaf) -> b = a squared elementwise -> c = mean of b (scalar)
a = torch.ones(2, 2, requires_grad=True)
b = a.pow(2)
c = torch.mean(b)

In [11]:
# Backward pass: backpropagate from the scalar c, which fills in a.grad
c.backward()

In [12]:
a.grad

tensor([[0.5000, 0.5000],
        [0.5000, 0.5000]])

In [13]:
b.grad

  b.grad


In [14]:
c.grad

  c.grad


In [15]:
a

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)

In [16]:
b

tensor([[1., 1.],
        [1., 1.]], grad_fn=<PowBackward0>)

In [17]:
c

tensor(1., grad_fn=<MeanBackward0>)

#### Summary Note: 
When you finish your ``forward pass``, you call ```.backward()``` to run backpropagation, and all the gradients will be computed automatically. The gradient for each tensor will be accumulated into its ```.grad``` attribute.

### Example 3

In [18]:
# Leaf inputs for Example 3. Built with the torch.tensor factory (as the
# later cells in this notebook do) instead of the legacy torch.Tensor
# constructor; float literals keep the dtype at the default float type.
x1 = torch.tensor([1.0], requires_grad=True)
x2 = torch.tensor([2.0], requires_grad=True)

In [19]:
# Two linear combinations of the inputs; both are intermediate (non-leaf) results.
y1 = torch.add(5 * x1, 3 * x2)
y2 = torch.add(2 * x1, 1 * x2)

In [20]:
# Final result of the graph: the elementwise product of the two intermediates.
z = y1*y2

In [21]:
# Show every tensor in the graph: inputs first, then intermediates, then the result.
for node in (x1, x2, y1, y2, z):
    print(node)

tensor([1.], requires_grad=True)
tensor([2.], requires_grad=True)
tensor([11.], grad_fn=<AddBackward0>)
tensor([4.], grad_fn=<AddBackward0>)
tensor([44.], grad_fn=<MulBackward0>)


In [22]:
# Backpropagate from z (a one-element tensor); this populates x1.grad and x2.grad.
z.backward()

In [23]:
# Only the leaf tensors x1 and x2 end up with a populated .grad.
print(x1.grad)
print(x2.grad)
# y1, y2 and z are computed from other tensors, so their .grad prints None.
# NOTE(review): PyTorch appears to emit a warning for each of these non-leaf
# .grad accesses (see the echoed source lines in the output) — confirm.
print(y1.grad)
print(y2.grad)
print(z.grad)

tensor([42.])
tensor([23.])
None
None
None


  print(y1.grad)
  print(y2.grad)
  print(z.grad)


In [24]:
# For every tensor in the graph, print its accumulated gradient (.grad) and
# the backward function that produced it (.grad_fn).
# Leaf inputs: .grad is populated, .grad_fn is None.
print(x1.grad)
print(x1.grad_fn)
print('------------------')
print(x2.grad)
print(x2.grad_fn)
print('------------------')
# Intermediate results: .grad is None, .grad_fn records the producing operation.
print(y1.grad)
print(y1.grad_fn)
print('------------------')
print(y2.grad)
print(y2.grad_fn)
print('------------------')
# Final result: also computed from other tensors, so it behaves like y1 / y2.
print(z.grad)
print(z.grad_fn)

tensor([42.])
None
------------------
tensor([23.])
None
------------------
None
<AddBackward0 object at 0x7f033cf78ee0>
------------------
None
<AddBackward0 object at 0x7f033cf78a30>
------------------
None
<MulBackward0 object at 0x7f033cf78d90>


  print(y1.grad)
  print(y2.grad)
  print(z.grad)


#### **Important:** Future calls to backward will accumulate gradients into this vector

#### - $y = \frac{1}{4} \sum_{i=1}^{4} 2 \cdot x_i $
#### - $\frac{\partial y}{\partial x_i} = \frac{1}{2}$

In [25]:
# Run backward three times on the same leaf tensor. Gradients are added into
# x.grad on every backward call, so it is reset at the end of each iteration
# to keep the printed values at the per-call gradient.
x = torch.rand(2, 2, requires_grad=True)
for i in range(3):
    y = torch.mean(2 * x)
    y.backward()
    print(f"Gradients at iteration {i+1}:")
    print(f"{x.grad}\n")
    # In-place reset; without it the next backward would add on top.
    x.grad.zero_()

Gradients at iteration 1:
tensor([[0.5000, 0.5000],
        [0.5000, 0.5000]])

Gradients at iteration 2:
tensor([[0.5000, 0.5000],
        [0.5000, 0.5000]])

Gradients at iteration 3:
tensor([[0.5000, 0.5000],
        [0.5000, 0.5000]])



### Gradient Calculation
 - $ y = \frac{1}{N} \sum_{i=1}^{N} \left( u_i + 2 \cdot v_i \right)$ <br><br>
 - $ \frac{\partial y}{\partial u_i} = \frac{1}{N}$ <br> <br>
 - $ \frac{\partial y}{\partial v_i} = \frac{2}{N}$

In [26]:
# Two leaf tensors with gradient tracking switched on.
tensor_one = torch.tensor([[1., 2.], [3., 4.]], requires_grad=True)
tensor_two = torch.tensor([[5., 6.], [7., 8.]], requires_grad=True)

# A tensor computed from tensors that have requires_grad=True also has
# requires_grad=True, and it carries a grad_fn describing how it was produced.
final_tensor = torch.mean(tensor_one + 2 * tensor_two)
print(final_tensor, final_tensor.requires_grad, final_tensor.grad_fn, sep="\n")
print()
# A tensor created directly by the user has no grad_fn.
print(tensor_one.requires_grad, tensor_one.grad_fn, sep="\n")

tensor(15.5000, grad_fn=<MeanBackward0>)
True
<MeanBackward0 object at 0x7f033cf78850>

True
None


In [27]:
# Backpropagate from the scalar final_tensor to both input tensors.
final_tensor.backward()

# With tensor_one = [[x1, x2], [x3, x4]] and ft = final_tensor, the gradient
# has the same layout as the tensor itself:
# tensor_one.grad = [[d_ft/d_x1, d_ft/d_x2], [d_ft/d_x3, d_ft/d_x4]]
print(f"{tensor_one.grad = }")

print(f"{tensor_two.grad = }")

# final_tensor is computed from other tensors, so its own .grad is None.
print(f"{final_tensor.grad = }")

tensor_one.grad = tensor([[0.2500, 0.2500],
        [0.2500, 0.2500]])
tensor_two.grad = tensor([[0.5000, 0.5000],
        [0.5000, 0.5000]])
final_tensor.grad = None


  print(f"{final_tensor.grad = }")


### Enabling and Disabling Gradient Calculations 
You can also stop autograd from tracking history on newly created tensors with *requires_grad=True* by: 
 - wrapping the code block in **with torch.no_grad()**
 - x.requires_grad_(False)
 - x.detach()

In [28]:
# Leaf tensor used by the enable/disable-gradient examples. Built with the
# torch.tensor factory (as the earlier cells do) instead of the legacy
# torch.Tensor constructor, with gradient tracking requested at creation.
# Float literals keep the dtype at the default float type.
tensor1 = torch.tensor([[1., 2., 3.],
                        [4., 5., 6.]], requires_grad=True)

In [29]:
# Anything computed inside the no_grad context is not tracked by autograd.
with torch.no_grad():
    new_tensor = torch.mul(tensor1, 3)

# The input keeps its flag; only the result computed under no_grad is untracked.
print('new_tensor = ', new_tensor)
print('requires_grad for tensor1 = ', tensor1.requires_grad)
print('requires_grad for new_tensor = ', new_tensor.requires_grad)

new_tensor =  tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]])
requires_grad for tensor1 =  True
requires_grad for new_tensor =  False


In [30]:
# Switch tracking off in place; requires_grad_ returns the same tensor, so the
# call itself is the value this cell displays.
tensor1.requires_grad_(False)

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [31]:
# Switch tracking back on in place; requires_grad_ returns the same tensor, so
# the call itself is the value this cell displays.
tensor1.requires_grad_(True)

tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)

In [32]:
# detach() yields a tensor with the same values that does not require grad.
new_tensor = tensor1.detach()
print(tensor1, new_tensor, sep="\n")

tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)
tensor([[1., 2., 3.],
        [4., 5., 6.]])


### Function that takes a grad-enabled tensor but does not track gradient calculations

In [33]:
@torch.no_grad()
def mult_by_three(x):
    """Triple ``x`` with gradient tracking disabled and print a short report.

    The ``torch.no_grad()`` decorator applies to the whole function body, so
    the product is not tracked by autograd even when ``x`` itself has
    ``requires_grad=True``.

    Args:
        x: tensor to multiply by three.

    Returns:
        None. The product and both ``requires_grad`` flags are printed.
    """
    tripled = torch.mul(x, 3)
    print('new_tensor = ', tripled)
    print('requires_grad for tensor1 = ', x.requires_grad)
    print('requires_grad for new_tensor = ', tripled.requires_grad)
# Run the no_grad-decorated function on tensor1 to show its printed report.
mult_by_three(tensor1)

new_tensor =  tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]])
requires_grad for tensor1 =  True
requires_grad for new_tensor =  False
