In [2]:
import torch
from torch import nn
from torch.nn import init

In [16]:
### Building network structures
# 1) Build the network by subclassing nn.Module directly:
class MLP(nn.Module):
    """Two-layer perceptron: Linear(784, 256) -> ReLU -> Linear(256, 10)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Sub-modules assigned as attributes are listed in assignment order
        # when the module is printed.
        self.hidden = nn.Linear(784, 256)
        self.act = nn.ReLU()
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        """Run ``x`` through the hidden layer, the activation and the output layer."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

# Push a random batch (5 rows, 784 columns) through a freshly built MLP.
X = torch.rand(5,784)
net = MLP()
# Printing a module lists its registered sub-modules.
print(net)
print(net(X))

MLP(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)
tensor([[ 0.2113, -0.1902,  0.0155, -0.0361,  0.3048, -0.0380, -0.0743, -0.2366,
          0.0536, -0.0602],
        [ 0.1542, -0.1388, -0.1212, -0.0085,  0.1517, -0.1535, -0.1191, -0.2029,
          0.0537, -0.1713],
        [ 0.1749, -0.1326, -0.0131, -0.2024,  0.2266, -0.0469, -0.0444, -0.2532,
          0.0509, -0.0664],
        [ 0.1858, -0.1601,  0.0137, -0.0280,  0.2012, -0.0778, -0.0259, -0.2144,
          0.1152, -0.1440],
        [ 0.1893, -0.1607,  0.0232, -0.0368,  0.2341, -0.0705,  0.0115, -0.1640,
          0.0111, -0.0551]], grad_fn=<AddmmBackward>)


In [17]:
### Building network structures
# 2) Build the same network with nn.Sequential (a Module subclass):

net = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)
X = torch.rand(5, 784)
print(net)
print(net(X))

Sequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)
tensor([[-0.0062, -0.2031, -0.2185, -0.0021,  0.1420, -0.0909,  0.0839, -0.0424,
         -0.1877, -0.1347],
        [-0.1186, -0.1491, -0.0931, -0.1109,  0.2663, -0.1916,  0.2269, -0.1162,
         -0.1705, -0.1301],
        [-0.0909, -0.1485, -0.0136, -0.0706,  0.1453, -0.1295,  0.0940,  0.0174,
         -0.1262, -0.0808],
        [ 0.0209, -0.1949, -0.1067,  0.0553,  0.1143, -0.0746,  0.0643, -0.1399,
          0.0336, -0.0140],
        [-0.0570, -0.1424, -0.1451, -0.1573,  0.0951, -0.1820,  0.0922, -0.0609,
         -0.1194, -0.1428]], grad_fn=<AddmmBackward>)


In [18]:
### Building network structures
# 3) Hold the layers in nn.ModuleDict (a Module subclass):
# it is operated on like a dictionary.

net = nn.ModuleDict({'hidden':nn.Linear(784, 256),
                    'act':nn.ReLU(),
                    })
# A further entry is added by key assignment.
net['output'] = nn.Linear(256, 10)
print(net)
# Entries can be read by key or as attributes.
print(net['act'])
print(net.output)

ModuleDict(
  (act): ReLU()
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)
ReLU()
Linear(in_features=256, out_features=10, bias=True)


In [19]:
### Building network structures
# 4) Hold the layers in nn.ModuleList (a Module subclass):
# it acts as a list, so operations such as append and extend are available.

net = nn.ModuleList([nn.Linear(784, 256), nn.ReLU()])
net.append(nn.Linear(256, 10))
print(net)
# List-style indexing works too; -1 is the layer appended last.
print(net[-1])

ModuleList(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)
Linear(in_features=256, out_features=10, bias=True)


In [40]:
# PyTorch has already initialised the parameters by default.
net = nn.Sequential(
    nn.Linear(4, 3),
    nn.ReLU(),
    nn.Linear(3, 1),
)
X = torch.rand(2, 4)

print(net)
# Reduce the network output to a scalar so it can be back-propagated later.
Y = net(X).sum()

Sequential(
  (0): Linear(in_features=4, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)


In [41]:
# Show the random input batch X and the scalar Y (sum of the network outputs).
print(X,Y)

tensor([[0.6344, 0.9445, 0.7919, 0.0786],
        [0.1679, 0.9494, 0.2991, 0.1006]]) tensor(0.6875, grad_fn=<SumBackward0>)


In [42]:
# The parameters() and named_parameters() methods inherited from Module give access to
# every parameter of the network (returned as an iterator). The latter yields each
# parameter's name in addition to the parameter tensor.
print(type(net.named_parameters()))

for name, param in net.named_parameters():
    print(name, param.size())

# Each param is a tensor exposing .data, .size() and .grad
# Note that the returned names are automatically prefixed with the layer index

<class 'generator'>
0.weight torch.Size([3, 4])
0.bias torch.Size([3])
2.weight torch.Size([1, 3])
2.bias torch.Size([1])


In [43]:
# Access the parameters of a single layer
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))

# Because a single layer is accessed, the names carry no numeric prefix

weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'>
bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'>


In [44]:
class MyModel(nn.Module):
    """Demo module contrasting an nn.Parameter attribute with a plain tensor attribute."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Wrapped in nn.Parameter: listed by named_parameters().
        self.weight1 = nn.Parameter(torch.rand(20, 20))
        # Plain tensor attribute: not listed by named_parameters().
        self.weight2 = torch.rand(20, 20)

    def forward(self, x):
        # No computation is defined for this demo module.
        return None

net0 = MyModel()
print(net0)
# List the names of everything registered as a parameter.
for name, param in net0.named_parameters():
    print(name)

# weight1 was wrapped in nn.Parameter(), so it is in the parameter list
# weight2 is not in the parameter list

MyModel()
weight1


In [45]:
# A Parameter is essentially a tensor and has every tensor attribute: for example its
# values are read through the .data attribute and its gradient through the .grad attribute
weight_0 = list(net[0].parameters())[0]
print(weight_0.data)
print(weight_0.grad)
# Before back-propagation the gradient is None

Y.backward()
print(weight_0.grad)

tensor([[-0.4269, -0.2976, -0.4891, -0.3140],
        [-0.0497,  0.4347, -0.2373, -0.1218],
        [ 0.4373, -0.2428, -0.3731, -0.1160]])
None
tensor([[0.0000, 0.0000, 0.0000, 0.0000],
        [0.2486, 0.5868, 0.3381, 0.0556],
        [0.0000, 0.0000, 0.0000, 0.0000]])


In [46]:
### Initialising model parameters
# Use one of the preset initialisers shipped in PyTorch's init module (torch.nn.init.normal_())
for name, param in net.named_parameters():
    if 'weight' not in name:
        continue
    # Draw the weights from a normal distribution (mean 0, std 0.01)
    init.normal_(param, mean=0, std=0.01)
    print(name, param.data)

0.weight tensor([[-0.0045,  0.0029,  0.0183,  0.0001],
        [ 0.0018, -0.0155, -0.0085,  0.0012],
        [-0.0115,  0.0060, -0.0210,  0.0093]])
2.weight tensor([[ 0.0022,  0.0082, -0.0178]])


In [47]:
# Initialise with a constant (torch.nn.init.constant_())
for name, param in net.named_parameters():
    if 'bias' not in name:
        continue
    # Set every bias entry to the constant 0
    init.constant_(param, val=0)
    print(name, param.data)

0.bias tensor([0., 0., 0.])
2.bias tensor([0.])


In [48]:
# Print each parameter of the network together with its name.
for name, param in net.named_parameters():
    print(name,param)

0.weight Parameter containing:
tensor([[-0.0045,  0.0029,  0.0183,  0.0001],
        [ 0.0018, -0.0155, -0.0085,  0.0012],
        [-0.0115,  0.0060, -0.0210,  0.0093]], requires_grad=True)
0.bias Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
2.weight Parameter containing:
tensor([[ 0.0022,  0.0082, -0.0178]], requires_grad=True)
2.bias Parameter containing:
tensor([0.], requires_grad=True)


In [51]:
# Initialise the weights with a custom method.
# The predefined initialisers in init are simply functions that change a tensor's values
# without recording gradients; this one follows the same pattern.

def init_weight_(tensor):
    """Fill ``tensor`` in place with values that are either 0 or of magnitude in [5, 10].

    Nothing is returned; the tensor passed in is modified directly.
    """
    # Gradients are not recorded during initialisation
    with torch.no_grad():
        # uniform_(x, y) samples every entry uniformly from [x, y]
        tensor.uniform_(-10, 10)
        # Entries with |value| < 5 are multiplied by 0, the others by 1: roughly half
        # end up 0 and the rest stay uniformly spread over [-10, -5] and [5, 10]
        keep_mask = (tensor.abs() >= 5).float()
        tensor.mul_(keep_mask)
        
# Apply the custom initialiser to every weight parameter and show the result.
for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, param.data)

0.weight tensor([[ 5.5880, -7.0385,  0.0000, -6.9496],
        [ 0.0000, -7.9420,  6.6985,  6.5157],
        [-6.0789, -0.0000,  0.0000, -0.0000]])
2.weight tensor([[ 0.0000, -9.4091, -9.9957]])


In [55]:
# Change the bias values through param.data, which leaves the gradients unaffected

for name, param in net.named_parameters():
    if 'bias' in name:
        param.data += 1
        print(name, param.data)

0.bias tensor([1., 1., 1.])
2.bias tensor([1.])


In [59]:
# Sharing model parameters
# When the same Module instance is passed to Sequential more than once, its parameters are shared
linear = nn.Linear(3, 3, bias=False)
net = nn.Sequential(linear, linear)
print(net)
for name, param in net.named_parameters():
    init.constant_(param, 2)
    print(name, param.shape, param.data)


# Both layers of the network are one and the same object
print(net[0] is net[1])

Sequential(
  (0): Linear(in_features=3, out_features=3, bias=False)
  (1): Linear(in_features=3, out_features=3, bias=False)
)
0.weight torch.Size([3, 3]) tensor([[2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.]])
True


In [63]:
# Push an all-ones 5x3 batch through the shared-weight network and sum the output.
X = torch.ones(5, 3)
y = net(X).sum()
print(X,y)

# Both layers are the same object, so the gradient of the shared parameter is accumulated
y.backward()
print(net[0].weight.grad)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]]) tensor(540., grad_fn=<SumBackward0>)
tensor([[60., 60., 60.],
        [60., 60., 60.],
        [60., 60., 60.]])


In [74]:
# Custom layers
# Define a custom layer that has no model parameters

class CentreLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of all input elements from the input."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x):
        mean_value = x.mean()
        return x - mean_value
    
# Centre a small example vector; the cell's last expression is displayed as its output.
layer = CentreLayer()
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float))

tensor([-2., -1.,  0.,  1.,  2.])

In [70]:
# Place the centring layer after a linear layer.
net = nn.Sequential(nn.Linear(8, 128), CentreLayer())

y = net(torch.rand(4, 8))

# Mean of the centred output; the recorded value is about -5e-09, i.e. zero up to rounding.
y.mean().item()

-5.3551048040390015e-09

In [92]:
# Define a custom layer that has model parameters (they should be nn.Parameter objects,
# which are recognised as model parameters automatically).
# The parameters can be managed with ParameterList (append, extend).

class MyListDense(nn.Module):
    """Layer that multiplies its input by a list of 4x4 parameter matrices in turn."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # ParameterList is a list whose elements are all nn.Parameter objects;
        # start with three random-normal 4x4 matrices ...
        random_weights = [nn.Parameter(torch.randn(4, 4)) for _ in range(3)]
        self.params = nn.ParameterList(random_weights)
        # ... and append an all-ones 4x4 matrix.
        self.params.append(nn.Parameter(torch.ones(4, 4)))

    def forward(self, x):
        """Right-multiply ``x`` by every stored matrix, in list order."""
        for weight in self.params:
            x = torch.mm(x, weight)
        return x

# Instantiate the layer; printing it lists the entries of its ParameterList.
net = MyListDense()
print(net)

MyListDense(
  (params): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 4x4]
      (1): Parameter containing: [torch.FloatTensor of size 4x4]
      (2): Parameter containing: [torch.FloatTensor of size 4x4]
      (3): Parameter containing: [torch.FloatTensor of size 4x4]
  )
)


In [93]:
# Define a custom layer that has model parameters (they should be nn.Parameter objects,
# which are recognised as model parameters automatically).
# The parameters can be managed with ParameterDict (update, keys).

class MyDictDense(nn.Module):
    """Layer holding several named weight matrices; one is picked per forward call."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        initial_weights = {
            'weight0': nn.Parameter(torch.randn(4, 4)),
            'weight1': nn.Parameter(torch.randn(4, 1)),
        }
        self.params = nn.ParameterDict(initial_weights)
        # Further entries can be merged in with update(), as with a dict.
        self.params.update({'weight2': nn.Parameter(torch.randn(4, 2))})

    def forward(self, x, choice='weight0'):
        """Multiply ``x`` by the weight stored under the key ``choice``."""
        selected = self.params[choice]
        return torch.mm(x, selected)
    
net = MyDictDense()
print(net)

# Multiply a 1x4 all-ones input by each of the three stored weights in turn.
x = torch.ones(1,4)
print(net(x, 'weight0'))
print(net(x, 'weight1'))
print(net(x, 'weight2'))

MyDictDense(
  (params): ParameterDict(
      (weight0): Parameter containing: [torch.FloatTensor of size 4x4]
      (weight1): Parameter containing: [torch.FloatTensor of size 4x1]
      (weight2): Parameter containing: [torch.FloatTensor of size 4x2]
  )
)
tensor([[0.3357, 2.3771, 2.1859, 0.7698]], grad_fn=<MmBackward>)
tensor([[-3.5406]], grad_fn=<MmBackward>)
tensor([[-0.6335,  0.3732]], grad_fn=<MmBackward>)


In [94]:
# Stack the two custom layers. The dictionary-based layer is the class named
# MyDictDense; no class called MyDenseDict is defined in this notebook, so that
# spelling raises NameError on a fresh kernel.
net = nn.Sequential(MyListDense(), MyDictDense())
print(net)
print(net(x))

Sequential(
  (0): MyListDense(
    (params): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 4x4]
        (1): Parameter containing: [torch.FloatTensor of size 4x4]
        (2): Parameter containing: [torch.FloatTensor of size 4x4]
        (3): Parameter containing: [torch.FloatTensor of size 4x4]
    )
  )
  (1): MyDenseDict(
    (params): ParameterDict(
        (weight0): Parameter containing: [torch.FloatTensor of size 4x4]
        (weight1): Parameter containing: [torch.FloatTensor of size 4x1]
        (weight2): Parameter containing: [torch.FloatTensor of size 4x2]
    )
  )
)
tensor([[-4.8454]], grad_fn=<MmBackward>)


In [95]:
# Reading and storing
# Use save and load to store a tensor and read it back

x = torch.ones(3)
torch.save(x, 'x.pt')

In [97]:
# Read the tensor back from the file 'x.pt'.
x2 = torch.load('x.pt')
x2

tensor([1., 1., 1.])

In [99]:
# Save a list of tensors

y = torch.zeros(4)
torch.save([x,y],'xy_list.pt')
xy = torch.load('xy_list.pt')
xy

[tensor([1., 1., 1.]), tensor([0., 0., 0., 0.])]

In [100]:
# Save a dictionary of tensors

torch.save({'x':x,'y':y},'xy_dict.pt')
xy = torch.load('xy_dict.pt')
xy

{'x': tensor([1., 1., 1.]), 'y': tensor([0., 0., 0., 0.])}

In [106]:
# Reading and writing models

class MLP(nn.Module):
    """Small perceptron: Linear(3, 5) -> ReLU -> Linear(5, 2)."""

    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Linear(3,5)
        self.act = nn.ReLU()
        self.output = nn.Linear(5,2)

    def forward(self ,x):
        """Apply the hidden layer, the activation and the output layer to ``x``."""
        # The layers are attributes of this module, so they have to be reached
        # through `self`. The input must also pass through `hidden` first:
        # `output` is Linear(5, 2) and needs the 5 features that `hidden` produces.
        return self.output(self.act(self.hidden(x)))
    
net = MLP()
net.state_dict()

# state_dict() returns a dictionary mapping parameter names to parameter tensors;
# only layers that hold learnable parameters contribute entries (the ReLU has none)

OrderedDict([('hidden.weight',
              tensor([[ 0.3969,  0.3525, -0.1302],
                      [-0.5180, -0.4990, -0.3561],
                      [ 0.0611,  0.4282,  0.1527],
                      [ 0.3883,  0.3240, -0.0924],
                      [-0.2087, -0.4826, -0.2822]])),
             ('hidden.bias',
              tensor([-0.0319, -0.4559,  0.2143,  0.4169, -0.5517])),
             ('output.weight',
              tensor([[ 0.1568, -0.0559, -0.2843, -0.2305,  0.3241],
                      [-0.2311, -0.1352, -0.0693,  0.0449,  0.4031]])),
             ('output.bias', tensor([ 0.2086, -0.3661]))])

In [109]:
# The optimizer has a state_dict of its own: its 'state' plus the hyper-parameter
# groups ('param_groups').
optimizer = torch.optim.SGD(net.parameters(),lr=0.001, momentum=0.9)
optimizer.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.001,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [0, 1, 2, 3]}]}

In [None]:
# 保存和加载模型

# 保存
# torch.save(model.state_dict(), PATH)

# 加载
# model = themodelclass(*args, **kwargs)
# model.load_state_dict(torch.load(PATH))

In [None]:
# GPU计算

# .cuda()可以将CPU上的tensor转换到GPU上

# 指定设备
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')