# 模型

## 继承Block类构造模型

In [4]:
from mxnet import nd 
from mxnet.gluon import nn

In [5]:
class MLP(nn.Block):
    """Multilayer perceptron: a 256-unit ReLU hidden layer and a 10-unit output layer."""

    def __init__(self, **kwargs):
        """Declare the layers of the model; ``kwargs`` are forwarded to ``nn.Block``."""
        # Let the parent class do its required setup before any layer is attached
        super(MLP, self).__init__(**kwargs)
        # Hidden layer
        self.hidden = nn.Dense(256, activation='relu')
        # Output layer
        self.output = nn.Dense(10)

    def forward(self, x):
        """Compute the model output for input ``x``: hidden layer first, then output layer."""
        hidden_out = self.hidden(x)
        return self.output(hidden_out)

In [9]:
# Instantiate the MLP and call initialize() with its default arguments
net = MLP()
net.initialize()

In [10]:
# Random input of shape (2, 20); calling the network on it runs forward()
X = nd.random.uniform(shape=(2,20))
net(X)



[[ 0.02394443  0.05150647  0.01035163 -0.06435341 -0.05801985  0.064192
   0.04472664 -0.01852541 -0.03237379  0.07389369]
 [ 0.05207362  0.04186264  0.04021508 -0.06558423 -0.02249499  0.0341314
   0.02135914 -0.06898528  0.02329672  0.0033668 ]]
<NDArray 2x10 @cpu(0)>

## Sequential类继承自Block类

In [18]:
class MySequential(nn.Block):
    """Minimal sequential container built directly on ``nn.Block``.

    Child blocks are kept in ``self._children`` and applied one after the
    other, in the order in which they were added.
    """

    def __init__(self, **kwargs):
        super(MySequential, self).__init__(**kwargs)

    def add(self, block, *blocks):
        """Append one or more child blocks to the container.

        Args:
            block: Instance of a ``Block`` subclass to append.
            *blocks: Optional further blocks, appended in the order given.
                This mirrors the multi-argument ``nn.Sequential.add`` call
                used elsewhere in this file; a single-argument call behaves
                exactly as before.
        """
        for child in (block,) + blocks:
            # Each block is stored in _children (an OrderedDict) under its own
            # name; initialize() on the container then initializes every
            # member stored there.
            # NOTE: adding the same instance twice reuses the same key, so the
            # block is stored (and later applied) only once.
            self._children[child.name] = child

    def forward(self, x):
        """Feed ``x`` through every child block and return the final result."""
        # The OrderedDict guarantees iteration in insertion order
        for child in self._children.values():
            x = child(x)
        return x

In [19]:
# Assemble the same two Dense layers as MLP, this time through MySequential
net = MySequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))

# Initialize the parameters and run the input X defined earlier through the network
net.initialize()
net(X)


[[-0.03358278  0.00098312  0.03334405 -0.00663612  0.07881726 -0.01704565
  -0.01302506 -0.05449733  0.04149391  0.00170795]
 [ 0.01879605 -0.04185785  0.02918838 -0.00970372  0.05835275 -0.031299
  -0.00644606 -0.02542868  0.0442826   0.01446365]]
<NDArray 2x10 @cpu(0)>

## 构造复杂的模型

In [35]:
class FancyMLP(nn.Block):
    """Block combining a constant random weight, a reused Dense layer and Python control flow."""

    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)
        # 20x20 uniform random matrix registered as a constant through get_constant
        constant_value = nd.random.uniform(shape=(20, 20))
        self.rand_weight = self.params.get_constant('rand_weight', constant_value)
        # Single Dense layer; forward() applies it twice
        self.Dense = nn.Dense(20, activation='relu')

    def forward(self, x):
        """Return the sum of the rescaled activations as a one-element NDArray."""
        out = self.Dense(x)
        # Multiply by the constant weight, shift by 1, then apply ReLU
        projected = nd.dot(out, self.rand_weight.data())
        out = nd.relu(projected + 1)
        # Second pass through the very same Dense layer
        out = self.Dense(out)
        # Halve in place until the norm is no longer above 1
        while out.norm().asscalar() > 1:
            out /= 2
        # Scale up by 10 when the norm is still above 0.8
        if out.norm().asscalar() > 0.8:
            out *= 10
        return out.sum()


In [36]:
# Build FancyMLP, initialize it and evaluate it on X
net = FancyMLP()
net.initialize()
net(X)


[37.89107]
<NDArray 1 @cpu(0)>

In [39]:
class NestMLP(nn.Block):
    """Block that nests an ``nn.Sequential`` stack and follows it with a Dense layer."""

    def __init__(self, **kwargs):
        super(NestMLP, self).__init__(**kwargs)
        # Inner stack: ReLU Dense layers with 64 and then 32 units
        self.net = nn.Sequential()
        for units in (64, 32):
            self.net.add(nn.Dense(units, activation='relu'))
        # Final 16-unit ReLU Dense layer applied after the inner stack
        self.Dense = nn.Dense(16, activation='relu')

    def forward(self, x):
        """Run ``x`` through the nested stack, then through the final Dense layer."""
        stacked = self.net(x)
        return self.Dense(stacked)

In [41]:
# Custom blocks can be mixed with built-in layers inside nn.Sequential:
# NestMLP, then a 20-unit Dense layer, then FancyMLP
net = nn.Sequential()
net.add(NestMLP(), nn.Dense(20), FancyMLP())
net.initialize()
net(X)


[1.4732156]
<NDArray 1 @cpu(0)>

## 模型参数的共享和访问 

In [44]:
# Two-layer network used by the parameter-access examples below
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))

net.initialize()

# Run one forward pass; per the notes at the end of this file, initialization
# is deferred until the first forward pass
X = nd.random.uniform(shape=(2, 20))
Y = net(X)

In [47]:
# Show the first layer's parameters and the type of the container holding them
net[0].params,type(net[0].params)

(dense30_ (
   Parameter dense30_weight (shape=(256, 20), dtype=float32)
   Parameter dense30_bias (shape=(256,), dtype=float32)
 ), mxnet.gluon.parameter.ParameterDict)

In [48]:
# A parameter can be looked up by name in the params dictionary, or through
# attributes such as weight and bias. The name used here is the one printed
# by the previous cell.
net[0].params['dense30_weight'],net[0].weight

(Parameter dense30_weight (shape=(256, 20), dtype=float32),
 Parameter dense30_weight (shape=(256, 20), dtype=float32))

In [50]:
# Gluon uses the Parameter class, which holds both the parameter values and
# their gradients; they are accessed with the data() and grad() functions
net[0].weight.data()[:1]


[[-0.05052982  0.06649857 -0.01023339 -0.02788465  0.04799969 -0.05894421
   0.04452466  0.03176292 -0.05566207  0.06655938 -0.04810633  0.03949251
  -0.02741218 -0.03541367 -0.05944973 -0.06488146 -0.01054718 -0.01303093
  -0.05493352  0.05288512]]
<NDArray 1x20 @cpu(0)>

In [53]:
# No gradient computation has been run yet, so the gradients are all zero
net[0].weight.grad()[0], net[1].bias.data()[0]

(
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 <NDArray 20 @cpu(0)>, 
 [0.]
 <NDArray 1 @cpu(0)>)

In [54]:
# collect_params() gathers the parameters of all layers, including nested ones
net.collect_params()

sequential6_ (
  Parameter dense30_weight (shape=(256, 20), dtype=float32)
  Parameter dense30_bias (shape=(256,), dtype=float32)
  Parameter dense31_weight (shape=(10, 256), dtype=float32)
  Parameter dense31_bias (shape=(10,), dtype=float32)
)

## 初始化模型参数

In [55]:
from mxnet import init


In [56]:
# The init module offers several ways of initializing weights.
# When a model is initialized again (not for the first time),
# force_reinit must be set to True.
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)

In [57]:
# Initialize with a constant value
net.initialize(init=init.Constant(1), force_reinit=True)

In [59]:
# Inspect row 1 of the first layer's weight after the constant initialization
net[0].weight.data()[1]


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
<NDArray 20 @cpu(0)>

In [60]:
# To initialize one specific parameter, use the Parameter class's own
# initialize function, which works similarly to the one on Block
net[0].weight.initialize(init= init.Normal(sigma=0.02), force_reinit=True)

In [61]:
# Inspect row 1 of the first layer's weight after re-initializing that parameter
net[0].weight.data()[1]


[-0.01331632 -0.01754761 -0.00382996  0.00997399  0.03486121 -0.01873092
 -0.00319623 -0.03353032  0.0006802  -0.02281556 -0.03817548  0.02163221
  0.02128435 -0.00629699  0.04356062  0.01866355 -0.00596921 -0.0097684
 -0.02303199  0.02577024]
<NDArray 20 @cpu(0)>

## 自定义初始化方法

In [66]:
class MyInit(init.Initializer):
    """Custom initializer: uniform samples in [-10, 10], zeroed where the magnitude is below 5."""

    def _init_weight(self, name, data):
        """Fill ``data`` in place and print the parameter name and shape."""
        print('Init', name, data.shape)
        samples = nd.random.uniform(low=-10, high=10, shape=data.shape)
        data[:] = samples
        # Multiplying by the comparison result zeroes entries with |value| < 5
        keep_mask = data.abs() >= 5
        data *= keep_mask
    
# Re-initialize the network with the custom initializer, then inspect
# row 0 of the first layer's weight
net.initialize(MyInit(), force_reinit=True)
net[0].weight.data()[0]

Init dense30_weight (256, 20)
Init dense31_weight (10, 256)



[-9.508459  -0.         0.        -6.8121257 -5.388574   0.
 -0.        -0.         6.0051117  8.644983   9.111368   5.773219
 -0.         9.755169   6.536106   7.747261  -7.920183  -9.8937
  0.        -8.793293 ]
<NDArray 20 @cpu(0)>

In [67]:
# The Parameter class's set_data can be used to overwrite model parameters directly

# Add 1 to every entry of the first layer's weight, then inspect row 0
net[0].weight.set_data(net[0].weight.data()+1)
net[0].weight.data()[0]


[-8.508459   1.         1.        -5.8121257 -4.388574   1.
  1.         1.         7.0051117  9.644983  10.111368   6.773219
  1.        10.755169   7.536106   8.747261  -6.920183  -8.8937
  1.        -7.793293 ]
<NDArray 20 @cpu(0)>

## 共享参数

In [68]:
# Besides calling the same layer several times inside a model definition,
# sharing can also be specified when layers are added.
# Below, the second and third layers share parameters and gradients; the
# gradients of both layers are accumulated on the parameters in shared.params.

net = nn.Sequential()

shared = nn.Dense(8, activation='relu')

# The third layer is built with params=shared.params so that it reuses the
# parameters of the second layer
net.add(nn.Dense(8,activation='relu'),
        shared,
        nn.Dense(8, activation='relu', params=shared.params),
        nn.Dense(10))
net.initialize()

net(X)


[[-6.1061524e-05 -2.4055375e-04  5.6250559e-05 -4.4983943e-05
  -1.3011644e-04 -1.2308279e-05  2.3051508e-04  3.1897693e-04
  -1.9086878e-04  1.8000347e-04]
 [-3.1810279e-05 -1.5045062e-04  1.6630591e-05 -4.1270498e-05
  -1.0696284e-04  5.4987431e-06  1.3510580e-04  2.2468806e-04
  -1.4344636e-04  1.2240042e-04]]
<NDArray 2x10 @cpu(0)>

In [70]:
# Element-wise comparison of row 0 of the second and third layers' weights
net[1].weight.data()[0] == net[2].weight.data()[0]


[1. 1. 1. 1. 1. 1. 1. 1.]
<NDArray 8 @cpu(0)>

## 延后初始化

In [72]:
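# After a model is defined, its parameters cannot be initialized right away,
# even if an initialization method has been specified, because the shape of the
# input is not yet known. Initialization is nevertheless guaranteed to happen
# before the forward pass; this is called deferred initialization.

# To avoid deferred initialization, specify each layer's input size when
# defining the model, e.g. nn.Dense(256, in_units=20), because the set_data and
# data functions cannot be used while initialization is still deferred.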