# 2. Managing Parameters

In [1]:
import torch
from torch import nn

In [2]:
# Random input batch: 2 samples with 4 features each, drawn uniformly from [0, 1).
X = torch.rand(2, 4)
X

tensor([[0.5152, 0.1129, 0.3631, 0.8286],
        [0.5405, 0.7418, 0.6749, 0.4462]])

In [3]:
# Two-layer MLP: 4 inputs -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
net(X)

tensor([[-0.1304],
        [-0.0874]], grad_fn=<AddmmBackward0>)

## Accessing Parameters

In [10]:
# Show each layer followed by the parameters it owns; a blank line separates
# consecutive layers (ReLU has no parameters, so its state dict is empty).
for layer in net:
    print(layer, layer.state_dict(), sep="\n", end="\n\n")

Linear(in_features=4, out_features=8, bias=True)
OrderedDict([('weight', tensor([[ 0.0697, -0.2755,  0.1505, -0.3063],
        [ 0.1801, -0.4924,  0.3837, -0.1068],
        [ 0.4164,  0.1121,  0.2296,  0.0915],
        [ 0.3140, -0.2842, -0.0678, -0.4969],
        [ 0.4675,  0.1169, -0.0479, -0.1169],
        [-0.3475,  0.0701, -0.2368, -0.1622],
        [-0.1049, -0.2786, -0.4635,  0.3972],
        [ 0.1103,  0.3707, -0.3777, -0.2407]])), ('bias', tensor([ 0.2606, -0.1939, -0.3765, -0.3199,  0.1526,  0.2969,  0.1175, -0.4235]))])

ReLU()
OrderedDict()

Linear(in_features=8, out_features=1, bias=True)
OrderedDict([('weight', tensor([[ 0.2223, -0.2449,  0.3347, -0.2847,  0.0135,  0.2245, -0.0192, -0.2596]])), ('bias', tensor([-0.1487]))])



Instead of using a loop, we can also access all the parameters at once:

In [11]:
# Print every (qualified name, shape) pair of the network's parameters on one line.
print(*((name, p.shape) for name, p in net.named_parameters()))

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [12]:
# State-dict keys have the form '<layer index>.<parameter name>', so '2.bias'
# is the bias of the layer at index 2 (the last Linear layer).
net.state_dict()['2.bias'].data

tensor([-0.1487])

Now, let's try accessing the parameters of a nested network:

In [13]:
def block1():
    """Build one 4 -> 8 -> 4 MLP block with a ReLU after each linear layer.

    Returns:
        nn.Sequential: ``Linear(4, 8)``, ``ReLU``, ``Linear(8, 4)``, ``ReLU``.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2():
    """Stack four separately constructed ``block1()`` instances in one container.

    The sub-modules are registered under the names ``'block 0'`` through
    ``'block 3'``.

    Returns:
        nn.Sequential: container holding the four named blocks.
    """
    stacked = nn.Sequential()
    for idx in range(4):
        child_name = f'block {idx}'
        stacked.add_module(child_name, block1())
    return stacked

# Nested network: the four-block container from block2() followed by a final
# Linear(4, 1) layer; run the input batch through it.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.3251],
        [0.3251]], grad_fn=<AddmmBackward0>)

In [14]:
# Print the nested module tree; the sub-blocks show up under the names
# 'block 0' ... 'block 3' that block2() registered them with.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [15]:
# Nested containers can be indexed like nested lists:
# rgnet[0] -> the block2() container, [1] -> its second block ('block 1'),
# [0] -> that block's first Linear(4, 8) layer, whose bias has 8 entries.
rgnet[0][1][0].bias.data

tensor([-0.0446, -0.2349,  0.2949,  0.4968, -0.2533,  0.0126,  0.3250,  0.2250])

## Initializing Parameters

In [16]:
# Fresh two-layer MLP (4 -> 8 -> 1) used by the initialization examples below.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)

Normal distribution:

In [20]:
def init_normal(m):
    """Initialize an ``nn.Linear`` layer with N(0, 0.01^2) weights and a zero bias.

    Meant to be passed to ``Module.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # A layer created with ``bias=False`` has ``m.bias is None``; skip it
        # rather than crash inside ``nn.init.zeros_``.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
        
# Run init_normal over the network's modules, then inspect the first layer.
net.apply(init_normal)
net[0].weight.data, net[0].bias.data

(tensor([[ 0.0097, -0.0193, -0.0048,  0.0005],
         [-0.0050, -0.0021,  0.0120, -0.0035],
         [-0.0025, -0.0114, -0.0039, -0.0009],
         [ 0.0005,  0.0249, -0.0045,  0.0051],
         [-0.0066, -0.0049, -0.0096, -0.0142],
         [ 0.0057,  0.0095,  0.0054,  0.0022],
         [-0.0019,  0.0059,  0.0093,  0.0015],
         [ 0.0132,  0.0021, -0.0022, -0.0070]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

Constants:

In [22]:
def init_constant(m):
    """Initialize an ``nn.Linear`` layer with all-ones weights and a zero bias.

    Meant to be passed to ``Module.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        # A layer created with ``bias=False`` has ``m.bias is None``; skip it
        # rather than crash inside ``nn.init.zeros_``.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
        
# Run init_constant over the network's modules, then inspect the first layer.
net.apply(init_constant)
net[0].weight.data, net[0].bias.data

(tensor([[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

Xavier initialization (applied to the first layer only; the second layer is filled with the constant 42):

In [27]:
def init_xavier(m):
    """Apply Xavier (Glorot) uniform initialization to an ``nn.Linear`` weight.

    The bias is not modified. Modules whose type is not exactly ``nn.Linear``
    are ignored.

    Args:
        m: The module to (possibly) initialize.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
        
def init_42(m):
    """Fill the weight of an ``nn.Linear`` layer with the constant 42.

    The bias is not modified. Modules whose type is not exactly ``nn.Linear``
    are ignored.

    Args:
        m: The module to (possibly) initialize.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# Different layers can get different initializers: Xavier for the first
# Linear layer, the constant 42 for the last one.
net[0].apply(init_xavier)
net[2].apply(init_42)

# Show both weight matrices to confirm each initializer took effect.
print(net[0].weight.data)
print(net[2].weight.data)

tensor([[-0.6796, -0.0864,  0.3009, -0.1210],
        [-0.6444,  0.1863, -0.1993, -0.2640],
        [-0.1704,  0.0114, -0.2072, -0.0203],
        [ 0.6200, -0.6317, -0.6331, -0.2132],
        [ 0.4434,  0.6231,  0.6195, -0.5510],
        [-0.4974,  0.0163, -0.0509,  0.0091],
        [ 0.5849, -0.1637,  0.5056,  0.4081],
        [ 0.1190, -0.1575,  0.0644, -0.1436]])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


We can also define our own initialization:

$$
\begin{aligned}
    w \sim \begin{cases}
        U(5, 10) & \text{ P = } \frac{1}{4} \\
            0    & \text{ P = } \frac{1}{2} \\
        U(-10, -5) & \text{ P = } \frac{1}{4}
    \end{cases}
\end{aligned}
$$

In [31]:
def my_init(m):
    """Custom initializer: sample U(-10, 10), then zero out small weights.

    For a module whose type is exactly ``nn.Linear``, prints the name and
    shape of its first parameter, draws the weight uniformly from (-10, 10),
    and sets every entry with absolute value below 5 to zero. Other modules
    are ignored.

    Args:
        m: The module to (possibly) initialize.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = next(m.named_parameters())
    print("Initialize", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the magnitude is at least 5; multiplying by it
    # keeps those entries and zeroes the rest.
    keep = m.weight.data.abs() >= 5
    m.weight.data *= keep

# Run the custom initializer over the network's modules, then inspect the first layer.
net.apply(my_init)
net[0].weight.data

Initialize weight torch.Size([8, 4])
Initialize weight torch.Size([1, 8])


tensor([[ 6.8033, -9.0807, -8.0569, -0.0000],
        [-0.0000, -6.7799,  0.0000,  8.3636],
        [-0.0000, -8.7956,  0.0000,  7.7205],
        [ 0.0000, -0.0000,  0.0000,  5.9029],
        [-0.0000,  0.0000, -5.8448, -9.8805],
        [-0.0000,  7.1922,  6.3481, -5.5951],
        [-5.4699,  0.0000, -0.0000,  9.9472],
        [ 0.0000,  6.2657, -8.1505, -0.0000]])

## Sharing Parameters

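When a layer is shared, every position that uses it refers to the same parameters, so changing the parameters through one reference changes them for all the others. For the same reason, during backpropagation the gradients from every use of the shared layer accumulate on those shared parameters.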

In [32]:
# Create one Linear layer and insert the very same instance twice
# (positions 2 and 4 of the Sequential).
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))

net(X)

# The first weight rows of both positions compare equal ...
print(net[2].weight.data[0] == net[4].weight.data[0])
# ... and still do after modifying one of them through position 2.
net[2].weight.data[0, 0] = 100
# Confirm they are actually the same object, not merely equal in value
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


## Deferred Initialization

In [35]:
# New input batch for this section: 2 samples with 20 features each
# (rebinds the name X used earlier).
X = torch.rand(2, 20)
X

tensor([[0.6509, 0.2064, 0.7259, 0.4883, 0.2528, 0.9221, 0.4371, 0.0616, 0.1436,
         0.6144, 0.0145, 0.4388, 0.3766, 0.2007, 0.8130, 0.7557, 0.7991, 0.9632,
         0.0036, 0.5691],
        [0.3701, 0.0513, 0.2500, 0.5587, 0.4795, 0.5197, 0.3813, 0.0145, 0.8755,
         0.1526, 0.7328, 0.1390, 0.4890, 0.4681, 0.6764, 0.2618, 0.3439, 0.3748,
         0.3599, 0.5357]])

In [36]:
# LazyLinear layers are declared with only their output size.
net = nn.Sequential(nn.LazyLinear(256), 
                    nn.ReLU(), 
                    nn.LazyLinear(10))

# Before any data has passed through, the layers report in_features=0.
print(net)

# After the first forward pass the same network prints as ordinary Linear
# layers with in_features filled in (20 and 256).
net(X)
print(net)

Sequential(
  (0): LazyLinear(in_features=0, out_features=256, bias=True)
  (1): ReLU()
  (2): LazyLinear(in_features=0, out_features=10, bias=True)
)
Sequential(
  (0): Linear(in_features=20, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)
