In [45]:
import logging
from copy import copy

from mxnet import autograd, nd


# 自动求梯度


在深度学习中，我们经常需要对函数求梯度。mxnet提供的autograd模块，可用于自动求解梯度。

以函数 $y = 2x^\top x$ 为例，求它关于 $x$ 的梯度（解析解为 $\nabla_x y = 4x$）。

In [46]:

# Randomly generate the variable x: a 4x1 array drawn with randint(-4, 5),
# then cast to float.
int_samples = nd.random.randint(-4, 5, shape=(4, 1))
x = int_samples.astype(float)
print(x)


# Before attach_grad() is called, x has no gradient buffer: x.grad is None.
print(x.grad)

# attach_grad() requests the memory needed to store the gradient of x.
x.attach_grad()

# The freshly attached gradient is initialised to zero.
print(x.grad)



[[ 2.]
 [-4.]
 [-2.]
 [ 3.]]
<NDArray 4x1 @cpu(0)>
None

[[0.]
 [0.]
 [0.]
 [0.]]
<NDArray 4x1 @cpu(0)>


In [47]:

# By default MXNet does not record the computations needed for differentiation;
# the forward pass has to run inside autograd.record() so that backward() can
# compute the gradient afterwards.
with autograd.record():
    y = 2 * nd.dot(x.T, x)

y.backward()
print(x.grad)


# Counter-example: the same computation performed without autograd.record().
x_new = copy(x)
y_new = 2 * nd.dot(x_new.T, x_new)

try:
    y_new.backward()
except Exception as e:
    # Include the exception class in the log line so the reader can confirm
    # that the failure really comes from backward() and not from an unrelated
    # mistake; only the class name is logged to keep the output short.
    logging.error('未调用record函数，求梯度报错: %s', type(e).__name__)


ERROR:root:未调用record函数，求梯度报错



[[  8.]
 [-16.]
 [ -8.]
 [ 12.]]
<NDArray 4x1 @cpu(0)>


In [49]:

# Besides recording the computation for differentiation, autograd.record()
# also switches the run mode from prediction mode to training mode.
training_outside = autograd.is_training()
print(training_outside)
with autograd.record():
    training_inside = autograd.is_training()
    print(training_inside)


False
True




### 对python控制流求梯度


In [48]:

# User-defined function f, built from ordinary Python control flow.
def f(a):
    """Scale ``a`` by a factor chosen through data-dependent control flow.

    ``a`` is doubled once, then doubled again until its norm is no longer
    below 1000. If the sum of the result is positive it is returned as is,
    otherwise it is multiplied by 100 first.

    Args:
        a: array-like object supporting ``*`` with a scalar and the
            ``norm()``, ``sum()`` and ``asscalar()`` methods used below.

    Returns:
        ``a`` multiplied by the scalar factor selected above.
    """
    b = a * 2
    norm = b.norm().asscalar()
    # Doubling can never raise a zero norm, so an all-zero input would loop
    # forever; the lower bound lets that case fall through unchanged.
    while 0 < norm < 1000:
        b = b * 2
        norm = b.norm().asscalar()

    if b.sum().asscalar() > 0:
        return b
    return 100 * b

# Differentiate the user-defined function f at randomly drawn points.
n = 1
for i in range(n):
    x = nd.random.normal(shape=(1, 1)).astype(float)
    print(x)

    x.attach_grad()
    with autograd.record():
        y = f(x)
    print(y)
    y.backward()

    print(x.grad)
    # f only multiplies its input by constants (2, and possibly 100), so
    # f(x) = c * x and the gradient should equal y / x. The division rounds,
    # so compare with a relative tolerance instead of exact float equality.
    expected_grad = y / x
    print(nd.abs(x.grad - expected_grad) <= 1e-6 * nd.abs(expected_grad))



[[0.75617343]]
<NDArray 1x1 @cpu(0)>

[[1548.64318848]]
<NDArray 1x1 @cpu(0)>

[[2048.]]
<NDArray 1x1 @cpu(0)>

[[1.]]
<NDArray 1x1 @cpu(0)>



### 头梯度

设 $z = f(y)$，$y = g(x)$，由链式法则有 $\frac{dz}{dx} = \frac{dz}{dy} \cdot \frac{dy}{dx}$。因此可以先求出 $f$ 的梯度 $\frac{dz}{dy}$，再在对 $g$ 求梯度时把它乘上去。
这个预先给定的 $\frac{dz}{dy}$ 就是头梯度：它相当于在函数梯度前按元素乘上的一个系数；不传头梯度时，等价于头梯度全为 1。

In [50]:

def _backward_x_pow4(head_gradient=None):
    """Build z = x * x * x * x (element-wise) and back-propagate through it.

    Args:
        head_gradient: optional array passed to ``z.backward`` as the head
            gradient; ``None`` calls ``backward`` without one.

    Returns:
        The tuple ``(x, y, z)``; after the call ``x.grad`` holds the gradient.
    """
    x = nd.array([[1, 2], [3, 4]]).astype(float)
    x.attach_grad()
    with autograd.record():
        y = x * x
        z = y * x * x
    if head_gradient is None:
        z.backward()
    else:
        z.backward(head_gradient)
    return x, y, z


# No head gradient.
x, y, z = _backward_x_pow4()
print(x.grad)


# Head gradient of all ones.
head_gradient = nd.array([[1, 1], [1, 1]]).astype(float)
x, y, z = _backward_x_pow4(head_gradient)
print(x.grad)


# Custom head gradient.
head_gradient = nd.array([[10, 1], [0.1, 0.01]]).astype(float)
x, y, z = _backward_x_pow4(head_gradient)
print(x.grad)



[[  4.  32.]
 [108. 256.]]
<NDArray 2x2 @cpu(0)>

[[  4.  32.]
 [108. 256.]]
<NDArray 2x2 @cpu(0)>

[[40.         32.        ]
 [10.80000016  2.55999994]]
<NDArray 2x2 @cpu(0)>
