In [1]:
import torch 
from torch import nn
# Small MLP used throughout the notebook: 4 inputs -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# Random batch of 2 samples with 4 features each, drawn uniformly from [0, 1).
X = torch.rand(2, 4)
net(X)

tensor([[-0.2413],
        [-0.1924]], grad_fn=<AddmmBackward>)

In [2]:
# Index 2 of the Sequential is the last nn.Linear(8, 1); state_dict() maps
# parameter names ('weight', 'bias') to their tensors.
print(net[2].state_dict())

OrderedDict([('weight', tensor([[-0.2590,  0.3003, -0.0662,  0.1349, -0.1195, -0.2966, -0.2999, -0.1499]])), ('bias', tensor([-0.1289]))])


In [3]:
# The bias is stored as an nn.Parameter wrapping the underlying tensor.
print(type(net[2].bias))
print(net[2].bias)
# Prints None in the recorded output: no backward pass appears in the cells
# shown up to this point, so no gradient has been accumulated yet.
print(net[2].weight.grad)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.1289], requires_grad=True)
None


In [4]:
# Parameter names and shapes: first for layer 0 alone, then for the whole
# network (where each name is prefixed with the index of its layer).
print([(pname, tensor.shape) for pname, tensor in net[0].named_parameters()])
print([(pname, tensor.shape) for pname, tensor in net.named_parameters()])

[('weight', torch.Size([8, 4])), ('bias', torch.Size([8]))]
[('0.weight', torch.Size([8, 4])), ('0.bias', torch.Size([8])), ('2.weight', torch.Size([1, 8])), ('2.bias', torch.Size([1]))]


In [5]:
# Parameters can also be looked up by their qualified name in the network's
# state_dict; '0.bias' is the bias of layer 0.
net.state_dict()['0.bias'].data

tensor([-0.3947,  0.2341,  0.4563, -0.4981, -0.1716, -0.3031, -0.2057, -0.4177])

嵌套使用各种层

In [6]:
def block1():
    """Build one basic unit: Linear(4 -> 8), ReLU, Linear(8 -> 4), ReLU.

    Returns:
        A fresh ``nn.Sequential`` whose input and output width are both 4,
        so several units can be chained back to back.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)
def block2():
    """Stack four independent ``block1`` units into one ``nn.Sequential``.

    The units are registered under the names ``block0`` .. ``block3`` so they
    show up with readable labels when the network is printed.

    Returns:
        The ``nn.Sequential`` container holding the four units.
    """
    stacked = nn.Sequential()
    for idx in range(4):
        # Each call to block1() creates new layers, so no parameters are shared.
        stacked.add_module(f'block{idx}', block1())
    return stacked
# Nested network: the stacked blocks (4 -> 4) followed by a linear head (4 -> 1).
rgnet = nn.Sequential(
    block2(),
    nn.Linear(4, 1),
)
rgnet(X)

tensor([[0.0578],
        [0.0578]], grad_fn=<AddmmBackward>)

In [7]:
# Printing a module shows its nested structure, including the block0..block3
# names registered via add_module.
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


内置初始化 8：00

In [10]:
def init_normal(m):
    """Initialise an ``nn.Linear`` layer with small Gaussian weights and zero bias.

    Intended to be passed to ``Module.apply``; modules of any other type are
    left untouched.

    Args:
        m: The module to (possibly) initialise.
    """
    # Only plain nn.Linear layers are initialised here.
    if type(m) is nn.Linear:
        # The trailing underscore marks an in-place function: it overwrites the
        # tensor it is given rather than returning a new value.
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # A layer built with bias=False has m.bias set to None; skip it
        # instead of crashing inside zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
        
net.apply(init_normal)  # walk over all modules inside net and apply init_normal to each of them
# Show the first weight row and the first bias entry of layer 0 after initialisation.
net[0].weight.data[0],net[0].bias.data[0]

(tensor([ 0.0072, -0.0053,  0.0075, -0.0086]), tensor(0.))

In [11]:
def init_constant(m):
    """Set every weight of an ``nn.Linear`` layer to 1 and its bias to 0.

    Demonstration only. Do not initialise a real network this way: when all
    weights are identical, every unit receives the same gradient, so the layer
    behaves like a single neuron replicated N times.

    Args:
        m: The module to (possibly) initialise; non-``nn.Linear`` modules are
            left untouched.
    """
    if type(m) is nn.Linear:
        nn.init.constant_(m.weight, 1)
        # A layer built with bias=False has m.bias set to None; skip it
        # instead of crashing inside zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
        
# Re-initialise the same network with the constant scheme, overwriting the
# values set by the previous cell.
net.apply(init_constant)
# Show the first weight row and the first bias entry of layer 0.
net[0].weight.data[0],net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [14]:
def xavier(m):
    """Apply Xavier-uniform initialisation to the weight of an ``nn.Linear`` layer.

    Motivation: for information to flow well through the network, the
    variance of each layer's output should stay roughly the same.

    Args:
        m: The module to (possibly) initialise. Other module types, and the
            bias of a linear layer, are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
        
# apply() is called on layer 0 only, so the other layers keep the values they
# had before this cell.
net[0].apply(xavier)
print(net[0].weight.data)

tensor([[-0.5640,  0.4117,  0.3097, -0.2064],
        [-0.6772,  0.5243,  0.5349,  0.2458],
        [ 0.6409, -0.4387,  0.2447, -0.0498],
        [ 0.1888,  0.4739, -0.5790,  0.5265],
        [-0.4249, -0.2298, -0.5855,  0.3808],
        [ 0.0414, -0.4393, -0.5209, -0.0644],
        [-0.0896,  0.1875,  0.3477, -0.2308],
        [ 0.0336,  0.2513, -0.5507, -0.2673]])


自定义初始化

In [20]:
def my_init(m):
    """Custom initialiser: sparse, large-magnitude weights for ``nn.Linear`` layers.

    Each weight is drawn from U(-10, 10) and then zeroed if its absolute value
    is below 5, so every surviving weight lies in [-10, -5] or [5, 10].
    Prints the name and shape of the layer's first parameter as a trace.
    Modules of any other type are left untouched.

    Args:
        m: The module to (possibly) initialise.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = next(m.named_parameters())
    print("INIT", (first_name, first_param.shape))
    # Both bounds must be passed. The previous `uniform_(m.weight, -10.10)`
    # was a typo that sampled from U(-10.1, 1.0), so no positive weight could
    # ever survive the magnitude filter below.
    nn.init.uniform_(m.weight, -10, 10)
    # Multiplying by the boolean mask zeroes every entry with |w| < 5.
    m.weight.data *= m.weight.data.abs() >= 5
        
# Run the custom initialiser over the network; it prints one "INIT" line per
# nn.Linear layer it touches.
net.apply(my_init)
# Show the first two weight rows of layer 0.
print(net[0].weight[:2])

INIT ('weight', torch.Size([8, 4]))
INIT ('weight', torch.Size([1, 8]))
tensor([[-8.2505, -9.9846, -0.0000, -9.8021],
        [-0.0000, -6.9165, -9.9538,  0.0000]], grad_fn=<SliceBackward>)


In [23]:
# Parameters can also be overwritten directly through .data.
net[0].weight.data[0,0]=42
# Read the same element back to confirm the write.
net[0].weight.data[0,0]

tensor(42.)

参数绑定

In [24]:
# Parameter tying: the very same nn.Linear object is placed at two positions
# (index 2 and index 4), so both positions use one set of weights.
shared = nn.Linear(8, 8)
net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1),
)
net(X)
# Compare the first weight row at the two positions element by element.
print(net[2].weight.data[0] == net[4].weight.data[0])



tensor([True, True, True, True, True, True, True, True])
