In [21]:
import numpy as np
# Pin the legacy global NumPy RNG so the draw below is repeatable.
np.random.seed(0)
# Three samples from range(5).
np.random.choice(5, size=3)

array([4, 0, 3])

In [62]:
# Three samples from range(10).
# NOTE(review): no seed is set in this cell, so the values depend on whatever
# state the global RNG was left in by previously executed cells.
np.random.choice(10,3)

array([6, 3, 3])

# torch 动态计算图

先建立张量 `a` 、 `b` 和 `loss` 的前向传播：

In [11]:
import torch

# Leaf tensor that autograd tracks.
a = torch.tensor([3.0, 1.0], requires_grad=True)
# Forward pass: element-wise square, then reduce to a scalar with the mean.
b = torch.mul(a, a)
loss = torch.mean(b)
# A second scalar stacked on top of `loss`; it extends the same graph.
loss2 = loss.pow(2)

此时还没有调用 `.backward()` 方法，这时打印叶子节点的梯度：

In [2]:
# No backward pass has been run yet, so the leaf's .grad is still unset.
print(a.grad)

None


发现为空。现在调用 `.backward()` 方法，这时再次打印叶子节点的梯度：

In [3]:
# Backpropagate from `loss`; this fills in .grad on the leaf tensor `a`.
loss.backward()
print(a.grad)

tensor([3., 1.])


现在叶子节点有了梯度，但非叶子节点的梯度默认不会被保留（以节省显存），因此仍为空。如果此时再次调用 `.backward()`，就会报 `RuntimeError`。这是因为 PyTorch 采用动态计算图机制：一次 backward 之后，计算图中保存的中间结果就被释放了（除非指定 `retain_graph=True`）。

In [4]:
# Second backward pass over the same graph. The values saved during the
# forward pass were freed by the first .backward(), so PyTorch raises
# RuntimeError. Catch it and print it: the error stays visible in the cell
# output, but it no longer aborts a "Restart & Run All".
try:
    loss.backward()
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached when the backward pass succeeded, i.e. the graph was still
    # available (for example because retain_graph=True was used earlier).
    print(a.grad)

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

# 尝试在前向传播之后仅替换loss的值，而继续利用已经建立的计算图

In [54]:
import torch

def _show_loss2_grad(grad):
    """Tensor hook: print the gradient that arrives at `loss2` during backward."""
    print('loss2 grad: ', grad)


a = torch.tensor([2.0, 1.0], requires_grad=True)
b = a.pow(2)
loss = torch.mean(b)
loss2 = loss.pow(2)

loss2.register_hook(_show_loss2_grad)
loss2.backward()
# `a` is a leaf, so its gradient is stored; `loss` is an intermediate
# (non-leaf) tensor, so its .grad is None.
a.grad, loss.grad

loss2 grad:  tensor(1.)


  a.grad, loss.grad


(tensor([10.,  5.]), None)

In [57]:
import torch

def _show_loss2_grad(grad):
    """Tensor hook: print the gradient that arrives at `loss2` during backward."""
    print('loss2 grad: ', grad)


a = torch.tensor([2.0, 1.0], requires_grad=True)
b = a.pow(2)
loss = torch.mean(b)
loss2 = loss.pow(2)
# Overwrite only the value stored in `loss2`; the graph recorded by the
# forward pass above is not rebuilt.
loss2.data = torch.tensor([3.4])
loss2.register_hook(_show_loss2_grad)
loss2.backward()
# The gradient comes from the recorded graph (2 * loss * a), not from the
# overwritten value 3.4.
a.grad

loss2 grad:  tensor(1.)


tensor([10.,  5.])

In [64]:
# Matrix-multiplication example: y_pred = w @ x + b with a mean-squared-error loss.
w = torch.tensor(
    [[1., 2.],
     [0., 2.]],
    requires_grad=True,
)
b = torch.tensor([1., 1.], requires_grad=True)

x = torch.tensor([1., 1.], requires_grad=False)

# y does not set requires_grad explicitly; the gradients printed after the
# backward pass show that it defaults to False.
y = torch.tensor([3., 3.])

y_pred = torch.matmul(w, x) + b

residual = y - y_pred
loss = residual.pow(2).mean()

loss

tensor(0.5000, grad_fn=<MeanBackward0>)

In [65]:
# Backpropagate, then inspect .grad on the weight, bias, input and target tensors.
loss.backward()
w.grad, b.grad, x.grad, y.grad

(tensor([[1., 1.],
         [-0., -0.]]),
 tensor([1., -0.]),
 None,
 None)

In [66]:
# Matrix-multiplication example, this time with requires_grad=True on the input x.
w = torch.tensor([[1., 2.],
                  [0., 2.]], requires_grad=True)
b = torch.tensor([1., 1.], requires_grad=True)

# x explicitly sets requires_grad=True, so a gradient is computed for the
# input data as well. This is mostly harmless: (1) data tensors normally leave
# requires_grad at False, and (2) even when it is True, an optimizer that only
# manages the parameters never touches this tensor, so the only cost is
# storing one extra gradient -- the computation itself stays correct.
x = torch.tensor([1., 1.], requires_grad=True)

# y keeps the default requires_grad=False, so y.grad stays None.
y = torch.tensor([3., 3.])

y_pred = w @ x + b

loss = (y - y_pred).pow(2).mean()

loss.backward()
w.grad, b.grad, x.grad, y.grad

(tensor([[1., 1.],
         [-0., -0.]]),
 tensor([1., -0.]),
 tensor([1., 2.]),
 None)

In [72]:
# Matrix-multiplication example (baseline): a plain backward pass with the
# loss value left exactly as computed.
w = torch.tensor(
    [[1., 2.],
     [0., 2.5]],
    requires_grad=True,
)
b = torch.tensor([1., 1.], requires_grad=True)

x = torch.tensor([1., 1.])
y = torch.tensor([3., 3.])

y_pred = torch.matmul(w, x) + b

residual = y - y_pred
loss = residual.pow(2).mean()
# loss.data is deliberately NOT overwritten in this cell.

loss.backward()
w.grad, b.grad, x.grad, y.grad, loss

(tensor([[1.0000, 1.0000],
         [0.5000, 0.5000]]),
 tensor([1.0000, 0.5000]),
 None,
 None,
 tensor(0.6250, grad_fn=<MeanBackward0>))

In [71]:
# Matrix-multiplication example: overwrite the stored loss value after the
# forward pass, then backpropagate through the graph that was already recorded.
w = torch.tensor(
    [[1., 2.],
     [0., 2.5]],
    requires_grad=True,
)
b = torch.tensor([1., 1.], requires_grad=True)

x = torch.tensor([1., 1.])
y = torch.tensor([3., 3.])

y_pred = torch.matmul(w, x) + b

residual = y - y_pred
loss = residual.pow(2).mean()
# Replace only the value held by `loss`; its grad_fn is kept.
loss.data = torch.tensor([3.4])

loss.backward()
w.grad, b.grad, x.grad, y.grad, loss

(tensor([[1.0000, 1.0000],
         [0.5000, 0.5000]]),
 tensor([1.0000, 0.5000]),
 None,
 None,
 tensor([3.4000], grad_fn=<MeanBackward0>))

In [67]:
# Matrix-multiplication example: input x tracks gradients AND the stored loss
# value is overwritten before the backward pass.
w = torch.tensor(
    [[1., 2.],
     [0., 2.]],
    requires_grad=True,
)
b = torch.tensor([1., 1.], requires_grad=True)

# x explicitly sets requires_grad=True, so a gradient is computed for the
# input data as well. This is mostly harmless: (1) data tensors normally leave
# requires_grad at False, and (2) even when it is True, an optimizer that only
# manages the parameters never touches this tensor, so the only cost is
# storing one extra gradient -- the computation itself stays correct.
x = torch.tensor([1., 1.], requires_grad=True)

y = torch.tensor([3., 3.])

y_pred = torch.matmul(w, x) + b

residual = y - y_pred
loss = residual.pow(2).mean()
# Replace only the value held by `loss`; its grad_fn is kept.
loss.data = torch.tensor([3.4])

loss.backward()
w.grad, b.grad, x.grad, y.grad

(tensor([[1., 1.],
         [-0., -0.]]),
 tensor([1., -0.]),
 tensor([1., 2.]),
 None)