In [1]:
import torch

A simple function: $y = 2x^Tx$

Here $x$ is a column vector.

In [2]:
# Create the example input: a 1-D floating-point tensor holding 0, 1, 2, 3.
x = torch.arange(4.0)
x

tensor([0., 1., 2., 3.])

In [3]:
# Turn on gradient tracking for x (in place), so that operations on x are
# recorded and can later be backpropagated through.
x.requires_grad_(True)
x.grad  # None by default: no backward pass has been run yet

In [4]:
# y = 2 * (x . x). torch.dot takes the two 1-D tensors directly, so no
# transpose is needed; torch.dot(x.T, x) gives the same answer.
y = 2 * torch.dot(x, x)
y

tensor(28., grad_fn=<MulBackward0>)

In [5]:
# Compute the gradient of y with respect to x by calling y's backward method,
# then read the result from x's grad attribute.
y.backward()
x.grad

tensor([ 0.,  4.,  8., 12.])

In [6]:
# The whole pipeline in one cell:
#   1. Record the path   -> requires_grad=True
#   2. Go backward       -> backward()
#   3. Compute partials  -> chain rule
#   4. Collect partials  -> the gradient, stored in x.grad
x = torch.arange(4.0, requires_grad=True)
y = 2 * x.dot(x)
y.backward()
x.grad

tensor([ 0.,  4.,  8., 12.])

 Given
$$
y = 2(x \cdot x)
$$

 Introduce Intermediate Variable
$$
u = x \cdot x
$$
$$
y = 2u
$$

 Derivative of Outer Function
$$
\frac{dy}{du} = 2
$$

 Expand Inner Function
$$
u = x_0^2 + x_1^2 + x_2^2 + x_3^2
$$

 Partial Derivative of Inner Function
$$
\frac{\partial u}{\partial x_i} = 2x_i
$$

 Apply Chain Rule
$$
\frac{\partial y}{\partial x_i}
=
\frac{dy}{du}
\cdot
\frac{\partial u}{\partial x_i}
=
2 \cdot 2x_i
=
4x_i
\quad \text{for } i = 0, 1, 2, 3
\;\Longrightarrow\;
4x_0,\;
4x_1,\;
4x_2,\;
4x_3
$$

 Gradient
$$
\nabla_x y =
\begin{bmatrix}
4x_0 \\
4x_1 \\
4x_2 \\
4x_3
\end{bmatrix}
$$

 Final Result
$$
x.\text{grad} = 4x
$$

> **Chain rule computes each partial derivative; the gradient collects them.**


For $y = 2x^Tx$, the gradient with respect to $x$ should be $4x$.

In [7]:
# Check the autograd result element-wise against the analytic gradient 4x.
x.grad == 4 * x

tensor([True, True, True, True])

In [8]:
# Show x again: same values as before, now displayed with requires_grad=True.
x

tensor([0., 1., 2., 3.], requires_grad=True)

In [9]:
# PyTorch adds each new gradient onto x.grad instead of overwriting it,
# so clear the previous result before differentiating a new function.
x.grad.zero_()  # Reset the gradient
y = x.sum()  # dy/dx_i = 1 for every element
y.backward()
x.grad

tensor([1., 1., 1., 1.])

### Backward for Non-Scalar Variables

In [10]:
# Zero the accumulated gradient in place; the cell output shows the cleared tensor.
x.grad.zero_()

tensor([0., 0., 0., 0.])

In [11]:
x.grad.zero_()
y = x * x  # element-wise square, so y is a vector rather than a scalar
# backward() on a non-scalar tensor needs a `gradient` argument with the same
# shape as y. Passing all ones weights every element of y equally, which gives
# the same result as reducing y with sum() first.
y.backward(gradient=torch.ones(len(y)))  # Faster: y.sum().backward()
x.grad

tensor([0., 2., 4., 6.])

In [12]:
# Same computation as with gradient=torch.ones(len(y)), written the simpler way:
# reduce y to a scalar with sum() and call backward() with no arguments.
x.grad.zero_()
y = x * x
y.sum().backward()
x.grad

tensor([0., 2., 4., 6.])

### Detaching Computation

In [13]:
x.grad.zero_()
y = x * x
# detach() returns a tensor with the same values as y but cut off from the
# computational graph, so u is treated as a constant: no gradient flows back
# through u to x.
u = y.detach() 
z = u * x

# With u held constant, dz_i/dx_i = u_i (rather than 3 * x_i**2, which is what
# differentiating z = x * x * x through both factors would give).
z.sum().backward()
x.grad == u

tensor([True, True, True, True])

In [14]:
# Only u was detached; y = x * x still has its graph, so backpropagating
# through y gives the usual derivative 2x.
x.grad.zero_()
y.sum().backward()
x.grad == 2 * x

tensor([True, True, True, True])

### Gradients and Python Control Flow

In [15]:
def f(a):
    """Scale ``a`` by a factor chosen through data-dependent control flow.

    The input is doubled once, then doubled again for as long as its norm is
    below 1000. The result is returned unchanged when its sum is positive and
    multiplied by 100 otherwise. Both the number of loop iterations and the
    branch taken depend on the value of ``a``.

    Note: an all-zero input never reaches a norm of 1000, so the loop does not
    terminate for it.

    Args:
        a: Input tensor.

    Returns:
        A tensor equal to ``a`` times a constant determined by the control
        flow taken for this particular input.
    """
    scaled = a * 2
    # Keep doubling until the norm reaches the threshold.
    while scaled.norm() < 1000:
        scaled = scaled * 2
    # Positive sum: pass through; otherwise apply the extra factor of 100.
    return scaled if scaled.sum() > 0 else 100 * scaled

In [48]:
# Draw a random scalar input and differentiate f at that point.
a = torch.randn(size=(), requires_grad=True)
print(a)
d = f(a)
d.backward()
# For a given input, f(a) = k * a, where the constant k is fixed by the
# control flow that input takes. So d / a recovers k, which must equal the
# gradient of d with respect to a.
e = d / a
print(e)
a.grad == e

tensor(0.1579, requires_grad=True)
tensor(8192., grad_fn=<DivBackward0>)


tensor(True)