## 理解过程

In [1]:
import torch
import numpy

In [2]:
def sigmoid(xs):
    return 1 / ( 1 + numpy.exp(-xs))

def dsigmoid(xs):
    fx = sigmoid(xs)
    return fx * (1 - fx)

Tensor方程:

$$
\begin{align*}
y_1 &= w_{11} x_1 + w_{12} x_2 + w_{13} x_3 \\
y_2 &= w_{21} x_1 + w_{22} x_2 + w_{23} x_3 \\
a_1 &= sigmoid(y_1) = \phi(y_1) \\
a_2 &= sigmoid(y_2) = \phi(y_2) \\
l &= \dfrac{1}{2} (a_1^2 + a_2^2) \text{标量}
\end{align*}
$$

独立function求导过程 (vector-valued):

$$
\left (\dfrac{\partial{\overrightarrow{y}}}{\partial{\overrightarrow{x}}} \right )^T = 
\begin{pmatrix}
 \dfrac{\partial{y_1}}{\partial{x_1}} & \dfrac{\partial{y_2}}{\partial{x_1}} \\ 
 \dfrac{\partial{y_1}}{\partial{x_2}} & \dfrac{\partial{y_2}}{\partial{x_2}} \\ 
 \dfrac{\partial{y_1}}{\partial{x_3}} & \dfrac{\partial{y_2}}{\partial{x_3}} \\
\end{pmatrix} = 
\begin{pmatrix}
 w_{11} & w_{21} \\ 
 w_{12} & w_{22} \\ 
 w_{13} & w_{23} \\
\end{pmatrix} \tag{1}
$$

$$
\left (\dfrac{\partial{\overrightarrow{a}}}{\partial{\overrightarrow{y}}} \right )^T = 
\begin{pmatrix}
 \dfrac{\partial{a_1}}{\partial{y_1}} & \dfrac{\partial{a_2}}{\partial{y_1}} \\ 
 \dfrac{\partial{a_1}}{\partial{y_2}} & \dfrac{\partial{a_2}}{\partial{y_2}} \\ 
\end{pmatrix} = 
\begin{pmatrix}
 \phi(y_1)(1-\phi(y_1)) &  0 \\ 
 0 & \phi(y_2)(1-\phi(y_2)) \\
\end{pmatrix} \tag {2}
$$

$$
\left (\dfrac{l}{\partial{\overrightarrow{a}}} \right )^T = 
\begin{pmatrix}
 \dfrac{\partial{l}}{\partial{a_1}} \\ 
 \dfrac{\partial{l}}{\partial{a_2}} \\ 
\end{pmatrix} = 
\begin{pmatrix}
 a_1 \\ 
 a_2 \\
\end{pmatrix} = 
\begin{pmatrix}
 \phi(y_1) \\ 
 \phi(y_2) \\
\end{pmatrix} \label{3} \tag {3}
$$

有(1),(2),(3)得:
$$
\begin{align*}
\left (\dfrac{l}{\partial{\overrightarrow{x}}} \right )^T &= 
\left (\dfrac{\partial{\overrightarrow{y}}}{\partial{\overrightarrow{x}}} \right )^T 
\left (\dfrac{\partial{\overrightarrow{a}}}{\partial{\overrightarrow{y}}} \right )^T 
\left (\dfrac{l}{\partial{\overrightarrow{a}}} \right )^T =
\begin{pmatrix}
 w_{11} & w_{21} \\ 
 w_{12} & w_{22} \\ 
 w_{13} & w_{23} \\
\end{pmatrix}
\begin{pmatrix}
 \phi(y_1)(1-\phi(y_1)) &  0 \\ 
 0 & \phi(y_2)(1-\phi(y_2)) \\
\end{pmatrix}
\begin{pmatrix}
 \phi(y_1) \\ 
 \phi(y_2) \\
\end{pmatrix} \\
 &=
\begin{pmatrix}
 w_{11}\phi^2(y_1)(1-\phi(y_1)) + w_{21}\phi^2(y_2)(1-\phi(y_2)) \\ 
 w_{12}\phi^2(y_1)(1-\phi(y_1)) + w_{22}\phi^2(y_2)(1-\phi(y_2)) \\ 
 w_{13}\phi^2(y_1)(1-\phi(y_1)) + w_{23}\phi^2(y_2)(1-\phi(y_2)) \\ 
\end{pmatrix}
\end{align*} \tag{4}
$$

In [3]:
xs = torch.tensor([[1], [2], [3]], dtype=torch.float, requires_grad=True)
xs

tensor([[1.],
        [2.],
        [3.]], requires_grad=True)

In [4]:
# ws = torch.randn(3, 2, requires_grad=True)
# ws = torch.tensor([[1, 0.2], [3, 0.4], [5, 0.6]], dtype=torch.float, requires_grad=True)
# ws
# print(ws.requires_grad)
# ws = torch.t(ws) # 调用方法之后requires_grad无效
# print(ws.requires_grad) # TODO 虽然输出为True但是不能在backward后输出它的梯度, 所以使用下面的

In [5]:
ws = torch.tensor([[1, 3, 5], [0.2, 0.4, 0.6]], dtype=torch.float, requires_grad=True)
ws

tensor([[1.0000, 3.0000, 5.0000],
        [0.2000, 0.4000, 0.6000]], requires_grad=True)

In [6]:
ys = torch.mm(ws, xs)
print(ws.size(), xs.size(), ys.size())
ys

torch.Size([2, 3]) torch.Size([3, 1]) torch.Size([2, 1])


tensor([[22.0000],
        [ 2.8000]], grad_fn=<MmBackward>)

In [7]:
a_s = 1 / ( 1 + torch.exp(-ys))
a_s

tensor([[1.0000],
        [0.9427]], grad_fn=<MulBackward0>)

In [8]:
l = torch.sum(a_s * a_s)
l

tensor(1.8886, grad_fn=<SumBackward0>)

In [9]:
l.backward(retain_graph=True)

In [10]:
print(xs.grad)
print(ys.grad)
print(ws.grad)

tensor([[0.0204],
        [0.0408],
        [0.0611]])
None
tensor([[5.5789e-10, 1.1158e-09, 1.6737e-09],
        [1.0188e-01, 2.0376e-01, 3.0564e-01]])


In [11]:
# TODO, 为什么不是xs.grad, 而是ws.grad[1]
phi1 = sigmoid(22)
phi2 = sigmoid(2.8)

print(1*phi1*phi1*(1-phi1) + 0.2*phi2*phi2*(1-phi2))
print(3*phi1*phi1*(1-phi1) + 0.4*phi2*phi2*(1-phi2))
print(5*phi1*phi1*(1-phi1) + 0.6*phi2*phi2*(1-phi2))

0.01018808515111608
0.020376170581179024
0.030564256011241964


## 官网例子

In [12]:
x = torch.ones(2, 2, requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)


In [13]:
x2 = torch.tensor([[1., 2.], [3., 1.]], requires_grad=True)
print(x2)

tensor([[1., 2.],
        [3., 1.]], requires_grad=True)


In [14]:
# y = torch.add(x, x2)
y = x + x2
print(y)
y.requires_grad_(True) #TODO why? 还是不能输出梯度, 必须构造声明 requires_grad=True

tensor([[2., 3.],
        [4., 2.]], grad_fn=<AddBackward0>)


tensor([[2., 3.],
        [4., 2.]], grad_fn=<AddBackward0>)

In [15]:
z = y * y * 3
out = z.mean()

print(z, out)

tensor([[12., 27.],
        [48., 12.]], grad_fn=<MulBackward0>) tensor(24.7500, grad_fn=<MeanBackward0>)


In [16]:
out.backward(retain_graph=True)

In [17]:
print(x.requires_grad, x.grad)

True tensor([[3.0000, 4.5000],
        [6.0000, 3.0000]])


In [18]:
print(y.requires_grad, y.grad)

True None


In [19]:
print(x2.requires_grad, x2.grad)

True tensor([[3.0000, 4.5000],
        [6.0000, 3.0000]])
