# 5.2. 参数管理

之前的介绍中，我们只依靠深度学习框架来完成训练的工作，而忽略了操作参数的具体细节。本节，我们将介绍以下内容：

1. 访问参数，用于调试、诊断和可视化；

2. 参数初始化；

3. 在不同模型组件间共享参数。



In [1]:
import torch
from torch import nn

# A small multilayer perceptron: 4 inputs -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(in_features=4, out_features=8),
    nn.ReLU(),
    nn.Linear(in_features=8, out_features=1),
)
# A random minibatch of 2 examples with 4 features each.
X = torch.rand(2, 4)
# Forward pass; in a notebook the resulting (2, 1) tensor is displayed.
net(X)

tensor([[0.0671],
        [0.0831]], grad_fn=<AddmmBackward>)

# 5.2.1. 参数访问

我们可以通过索引来访问模型的任意层。这就像模型是一个列表一样，每层的参数都在其属性中。如下所示，我们可以检查第二个全连接层的参数。

In [2]:
# Inspect the parameters of the second fully connected layer (index 2 of the Sequential).
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.0160,  0.3417, -0.1049,  0.1672, -0.0872, -0.1021,  0.2997,  0.2706]])), ('bias', tensor([-0.2399]))])


## 5.2.1.1. 目标参数

In [3]:
# Each parameter is an instance of the Parameter class.
print(type(net[2].bias))
# Printing the parameter itself shows its value together with requires_grad.
print(net[2].bias)
# .data exposes the underlying tensor value alone.
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2399], requires_grad=True)
tensor([-0.2399])


In [4]:
# A parameter is a composite object holding a value, a gradient and extra information.
# Backpropagation has not been invoked yet, so the gradient is still in its initial state.
# Use an identity test: `is None` is the idiomatic (and unambiguous) check for None.
net[2].weight.grad is None

True

## 5.2.1.2. 一次性访问所有参数

In [5]:
# Name/shape pairs for the parameters of the first layer only.
print(*((label, tensor.shape) for label, tensor in net[0].named_parameters()))
# Name/shape pairs for every parameter of the whole network.
print(*((label, tensor.shape) for label, tensor in net.named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [6]:
# Parameters can also be looked up by name through the network's state_dict.
net.state_dict()['2.bias'].data

tensor([-0.2399])

## 5.2.1.3. 从嵌套块收集参数

In [7]:
def block1():
    """Return a fresh MLP block mapping 4 features back to 4 features.

    The block is Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU; every call
    builds new layer objects.
    """
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential that stacks four `block1()` blocks.

    The sub-blocks are registered under the names 'block 0' ... 'block 3'.
    """
    container = nn.Sequential()
    child_names = [f'block {idx}' for idx in range(4)]
    for child_name in child_names:
        # Nest a freshly built block under its own name.
        container.add_module(child_name, block1())
    return container

# Nest the stacked blocks inside another Sequential, followed by a final 4 -> 1 linear layer.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
# Forward pass on the minibatch X.
rgnet(X)

tensor([[0.3671],
        [0.3670]], grad_fn=<AddmmBackward>)

In [8]:
# Print the network to inspect how the nested blocks are organized.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [9]:
# Layers are nested hierarchically, so we can also access them the way we would index nested lists.
# Below, we access the bias of the first layer of the second sub-block in the first major block.
rgnet[0][1][0].bias.data

tensor([-0.1071, -0.4278,  0.3176, -0.3143, -0.1962, -0.1607, -0.0303,  0.4511])

In [10]:
# The same index path without .bias.data yields the layer module itself.
rgnet[0][1][0]

Linear(in_features=4, out_features=8, bias=True)

# 5.2.2. 参数初始化

## 5.2.2.1. 内置初始化

In [13]:
def init_normal(m):
    """Initialize a linear layer with Gaussian weights and a zero bias.

    Meant to be passed to ``nn.Module.apply``. Weights are drawn from a
    normal distribution with mean 0 and standard deviation 0.01; modules
    that are not linear layers are left untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # Layers built with bias=False have no bias tensor to reset.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Show the first layer's parameters before re-initialization.
print(net[0].weight, net[0].bias)
# Run init_normal over the modules of net.
net.apply(init_normal)
# Show the first layer's raw tensors after re-initialization.
print(net[0].weight.data, net[0].bias.data)

Parameter containing:
tensor([[ 1.3002e-03, -7.4261e-03, -3.9017e-03, -2.2209e-03],
        [-4.4397e-03, -7.2699e-04,  9.7707e-03,  9.6246e-03],
        [ 1.3336e-02, -1.4285e-02,  1.6487e-03, -1.8382e-03],
        [ 2.3118e-02,  2.0247e-03, -1.8560e-02,  4.5433e-03],
        [-1.2876e-02, -1.6461e-03,  6.1308e-03,  1.1597e-02],
        [-1.0554e-03,  5.1843e-03,  6.9968e-03, -5.7269e-03],
        [ 2.0187e-04, -4.3939e-04, -2.7853e-05,  2.6879e-03],
        [-1.6464e-02, -1.3581e-03,  1.2469e-02,  1.4894e-03]],
       requires_grad=True) Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
tensor([[-0.0076, -0.0015,  0.0110,  0.0014],
        [-0.0042,  0.0093, -0.0044, -0.0093],
        [-0.0050,  0.0184, -0.0092, -0.0119],
        [ 0.0019,  0.0196,  0.0301, -0.0161],
        [ 0.0092, -0.0116, -0.0126, -0.0128],
        [-0.0165,  0.0023,  0.0018,  0.0009],
        [-0.0007, -0.0094, -0.0005,  0.0056],
        [ 0.0088,  0.0004,  0.0048, -0.0040]]) tensor([0., 0., 0., 0., 0., 0., 0., 0.])

In [15]:
def init_constant(m, value=1):
    """Initialize a linear layer's weights to a given constant and zero its bias.

    Meant to be passed to ``nn.Module.apply`` (which supplies only ``m``);
    use ``functools.partial`` to bind a different ``value``. Modules that
    are not linear layers are left untouched.

    Args:
        m: The module to (possibly) initialize.
        value: Constant written into every weight entry. Defaults to 1.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, value)
        # Layers built with bias=False have no bias tensor to reset.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Run init_constant over the modules of net.
net.apply(init_constant)
# Inspect the first weight row and the first bias entry of the first layer.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [16]:
# Initialize the first layer with the Xavier method, then initialize the
# third layer to the constant value 42.
def init_xavier(m):
    """Re-initialize a linear layer's weight with Xavier uniform initialization.

    The bias is not modified; modules that are not linear layers are left
    untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    """Set every weight entry of a linear layer to the constant 42.

    The bias is not modified; modules that are not linear layers are left
    untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 42)

# Different initializers can be used for different layers by calling apply() on each layer.
net[0].apply(init_xavier)
net[2].apply(init_42)
# First weight row of layer 0, then the full weight of layer 2.
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.5683, -0.5353,  0.2850,  0.0621])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


## 5.2.2.2. 自定义初始化

In [19]:
def my_init(m):
    """Custom initializer: uniform weights in [-10, 10] with small values zeroed.

    For a linear layer, prints ``Init <name> <shape>`` for its first
    parameter, draws the weights uniformly from [-10, 10], and then sets
    every weight whose absolute value is below 5 to zero. The bias is not
    modified; modules that are not linear layers are left untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of nn.Linear.
    if isinstance(m, nn.Linear):
        # Only the first (name, parameter) pair is reported, so take it
        # directly instead of materializing the whole list.
        name, param = next(iter(m.named_parameters()))
        print("Init", name, param.shape)
        nn.init.uniform_(m.weight, -10, 10)
        # Multiplying by the boolean mask keeps entries with |w| >= 5 and
        # zeroes the rest.
        m.weight.data *= m.weight.data.abs() >= 5

In [20]:
# Run the custom initializer over the modules of net.
net.apply(my_init)
# Show the first two weight rows of the first layer.
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 9.6728, -5.5563, -8.7891,  7.5902],
        [ 0.0000,  0.0000,  6.2162, -7.4749]], grad_fn=<SliceBackward>)

In [21]:
# We can always set parameters directly.
net[0].weight.data[:] += 1  # add 1 to every weight entry, in place
net[0].weight.data[0, 0] = 42  # overwrite a single entry
net[0].weight.data[0]

tensor([42.0000, -4.5563, -7.7891,  8.5902])

# 5.2.3. 参数绑定

有时我们希望在多个层间共享参数：我们可以定义一个层，然后使用它的参数来设置另一个层的参数。

In [22]:
# We need to give the shared layer a name so that we can refer to its parameters.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# Check whether the parameters are the same.
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure they are actually the same object, rather than just having the same value.
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


# 5.2.4. 小结

我们有几种方法可以访问、初始化和绑定模型参数。

我们可以使用自定义初始化方法。

# 5.2.5. 练习

`net[2].weight` 是一个 `Parameter` 对象，其中包含权重值、梯度等信息；
`net[2].weight.data` 则是一个张量。

In [25]:
# The Parameter object itself (printed with a "Parameter containing:" header).
print(net[2].weight)
# The tensor stored in the parameter's .data attribute.
print(net[2].weight.data)

Parameter containing:
tensor([[ 1.0000e+02, -2.0246e-01,  2.2066e-01, -6.3823e-02, -2.8728e-01,
         -1.4337e-01,  1.9797e-01,  1.7918e-01],
        [ 9.9737e-02, -2.8349e-01, -3.8362e-03,  3.3316e-01,  1.0120e-01,
          9.1710e-02, -2.2741e-01,  1.2297e-01],
        [-1.0141e-01, -9.0059e-02, -1.0805e-01,  3.0651e-01, -1.9367e-01,
         -1.4745e-01, -2.5735e-01, -9.4169e-02],
        [-2.9544e-01, -7.5752e-03, -1.9394e-01,  2.3636e-02, -1.5289e-01,
          8.7203e-02,  8.5627e-02,  3.1674e-01],
        [ 3.2139e-01, -5.0683e-02,  3.3563e-01, -5.1077e-02,  3.4657e-01,
         -1.9635e-01,  2.0411e-01,  3.3237e-01],
        [ 1.6902e-01,  2.4838e-01, -2.4319e-01, -5.0454e-02, -3.0045e-01,
         -7.0019e-02, -2.3081e-01, -3.3923e-01],
        [-1.2879e-01,  2.9676e-01, -2.2761e-01, -3.2908e-01, -5.4318e-02,
         -2.7796e-01,  9.2343e-02, -5.2415e-02],
        [-3.4568e-01,  1.8261e-01,  2.7762e-01, -4.3979e-02, -1.5263e-01,
         -1.5259e-01, -2.6490e-01, -2.0825e