### Tensor

In [1]:
import torch
# 2x2 tensor of ones; requires_grad=True asks autograd to record every
# operation applied to it so gradients can be computed later.
x = torch.ones(
    2,
    2,
    requires_grad=True,
)
x

tensor([[ 1.,  1.],
        [ 1.,  1.]])

In [2]:
# Element-wise shift of x; y is the result of an operation, not a leaf tensor.
y = x + 2
y

tensor([[ 3.,  3.],
        [ 3.,  3.]])

In [3]:
# y was produced by the addition x+2 in the previous cell, so its grad_fn
# is the backward node recorded for that add.
y.grad_fn

<AddBackward0 at 0x106508a20>

In [4]:
# z = 3 * y^3, written as repeated multiplication (operand order kept as-is
# so the recorded graph is unchanged).
z = y * y * y * 3

# Collapse z to a single value by averaging all of its entries.
out = z.mean()

print(z, out)

tensor([[ 81.,  81.],
        [ 81.,  81.]]) tensor(81.)


In [5]:
# z is built from multiplications, so its grad_fn is a multiplication backward node.
z.grad_fn

<MulBackward0 at 0x10a83d4e0>

In [6]:
# out = z.mean(), so its grad_fn is the backward node of the mean.
out.grad_fn

<MeanBackward1 at 0x10a83d588>

In [7]:
# Start from random values, then form (a/2)/(a*3): the random factor cancels,
# leaving 1/6 in every entry. No requires_grad is passed, so a is untracked.
a = torch.randn(3, 3)
a = (a / 2) / (a * 3)
a

tensor([[ 0.1667,  0.1667,  0.1667],
        [ 0.1667,  0.1667,  0.1667],
        [ 0.1667,  0.1667,  0.1667]])

In [8]:
# a was built from torch.randn without requires_grad, so tracking is off here.
a.requires_grad

False

In [9]:
# requires_grad_ changes the flag on the existing tensor a (no reassignment);
# the check in the following cell reports True.
a.requires_grad_(True)

tensor([[ 0.1667,  0.1667,  0.1667],
        [ 0.1667,  0.1667,  0.1667],
        [ 0.1667,  0.1667,  0.1667]])

In [10]:
# Re-check the flag after the requires_grad_(True) call above.
a.requires_grad

True

In [11]:
# Sum of squares of a. Now that a requires grad, the result carries a grad_fn
# for the final sum.
b = (a * a).sum()
b.grad_fn

<SumBackward0 at 0x10a83dac8>

### Gradients

In [12]:
"""
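`out` is the scalar defined in the 4th block (out = z.mean()).
For a single-value tensor, out.backward() is equivalent to out.backward(torch.tensor(1.)):
the implicit upstream gradient is 1, so `out` can be treated as the output layer.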

"""
# Backpropagate from the scalar out; this populates x.grad, which the next cell reads.
out.backward() 

In [13]:
"""
Gradient of out with respect to x, using y = x+2 and z = y*y*y*3.
So, z = 3(x+2)^3, and out = z.mean() averages the four entries of z.
Hence d(out)/dx_i = (1/4) * 9(x_i+2)^2, which is 81/4 = 20.25 when x = 1 -- as given in the input.

"""
# Gradient stored on x by the out.backward() call in the previous cell.
x.grad

tensor([[ 20.2500,  20.2500],
        [ 20.2500,  20.2500]])

In [14]:
# Norm example: start over with a fresh random 3-element x that autograd tracks.
# NOTE(review): no torch.manual_seed call is visible in this notebook, so the
# values shown below will presumably differ from run to run.
x = torch.randn(3, requires_grad=True)
x

tensor([-0.0843, -0.2857, -0.3551])

In [15]:
# First doubling of x; later cells keep doubling this y.
y = x * 2
y

tensor([-0.1686, -0.5713, -0.7103])

In [16]:
# .data gives y's values as a tensor; the numbers match y in the previous cell.
# NOTE(review): y.detach() is the usually recommended way to get such a view --
# confirm for the PyTorch version in use.
y.data

tensor([-0.1686, -0.5713, -0.7103])

In [17]:
# Tensor.norm() with no arguments is the L2 (Euclidean) norm.
# detach() replaces the legacy .data attribute: it reads the same values
# without autograd tracking, and is the form PyTorch recommends.
y.detach().norm()

tensor(0.9270)

In [18]:
# Keep doubling y until its L2 norm reaches 1000.
# The stopping test reads the norm from y.detach() (rather than the legacy
# .data attribute) so the check itself is not recorded by autograd, while
# every `y * 2` still is -- which is why y keeps a grad_fn afterwards.
while y.detach().norm() < 1000:
    y = y * 2
y

tensor([ -345.3379, -1170.0391, -1454.5962])

In [19]:
# y is now the result of the last y*2 in the loop, so its grad_fn is a
# multiplication backward node.
y.grad_fn

<MulBackward0 at 0x10a83d748>

In [20]:
"""
Here y is a vector, not a scalar -- it is similar to a hidden layer rather than a final loss.
The argument to backward() is the gradient of the eventual downstream loss with respect to
the current layer. In the case of a scalar loss that is also the "most downstream
output/loss", that gradient is dloss/dloss = 1. If instead we want to call backward() from
some middle layer, we have to provide the gradient of the downstream loss w.r.t. all the
outputs of this middle layer (evaluated at the current values of those outputs) in order to
get well-defined numerical results.
In more technical terms: let y be an arbitrary node in a computational graph. If we call
y.backward(arg), the argument arg should be the gradient of the root of the computational
graph with respect to y, evaluated at a specific value of y (usually the current value of y).
If y is a whole layer, this means that arg should provide a value for each neuron in y.
If y is the final loss, it is also the root of the graph, and the usual scalar one is the
only reasonable argument arg.

"""

# Upstream gradient, one entry per element of y. Every element of x reached y
# through the same chain of doublings, so x.grad comes out as this vector
# multiplied by that common factor.
gradients = torch.tensor([0.1, 1.0, 0.001], dtype=torch.float)
y.backward(gradient=gradients)
x.grad

tensor([  409.6000,  4096.0000,     4.0960])

In [21]:
# To stop tracking history on Tensors:
# first confirm that x (created with requires_grad=True) is currently tracked.
x.requires_grad

True

In [22]:
# A result computed from x outside of no_grad also reports requires_grad=True.
(x**2).requires_grad

True

In [23]:
# Inside no_grad the result of an operation on x does not require grad,
# even though x itself does.
with torch.no_grad():
    print(
        (x / 2).requires_grad
    )

False
