# Gluon基础教程

## 网络的创建、初始化、保存与加载 

In [2]:
from mxnet import nd
from mxnet.gluon import nn

In [11]:
# Build a two-layer MLP: a 256-unit hidden layer with ReLU, then a 10-unit output layer.
net = nn.Sequential()
with net.name_scope():
    # The layers are instantiated inside the container's name scope, in order.
    for layer in (nn.Dense(256, activation="relu"), nn.Dense(10)):
        net.add(layer)
print(net)

Sequential(
  (0): Dense(256, Activation(relu))
  (1): Dense(10, linear)
)


## 如何理解nn.Block和nn.Sequential

事实上，nn.Sequential是nn.Block的简单形式。我们先来看下如何使用nn.Block来实现同样的网络。

In [56]:
class MLP(nn.Block):
    """Two-layer perceptron written as an explicit nn.Block subclass."""

    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        # Child layers are created inside this block's name scope.
        with self.name_scope():
            self.dense0 = nn.Dense(256)
            self.dense1 = nn.Dense(10)

    def forward(self, x):
        """Apply dense0, then ReLU, then dense1 to ``x``."""
        hidden = nd.relu(self.dense0(x))
        return self.dense1(hidden)

一个nn.Block就是一个计算模块，至少包含两个函数
- `__init__`：创建参数。这里没有显式地定义参数，而是通过定义dense0和dense1这两个层来间接定义参数
- `forward()`：定义网络的计算

在gluon里，nn.Block是一个一般化的部件。整个神经网络可以是一个nn.Block，单个层也是一个nn.Block。我们可以（近似）无限地嵌套nn.Block来构建新的nn.Block。

nn.Block主要提供以下功能：

- 存储参数
- 描述forward如何执行
- 自动求导

nn.Sequential是一个nn.Block容器，它通过add来添加nn.Block。它会自动生成forward()函数，即按添加顺序把加进来的nn.Block逐一运行。

一个简单的实现是这样的：

In [57]:
class Sequential(nn.Block):
    """Minimal container that runs the blocks added to it one after another."""

    def __init__(self, **kwargs):
        super(Sequential, self).__init__(**kwargs)

    def add(self, block):
        """Append ``block`` to this container's children."""
        children = self._children
        children.append(block)

    def forward(self, x):
        """Feed ``x`` through every child block in insertion order."""
        out = x
        for child in self._children:
            out = child(out)
        return out

## 模型参数

In [12]:
# Initialize the network's parameters, then run one forward pass on random input.
net.initialize()
# Use the nd.random namespace, consistent with the nd.random.uniform calls
# elsewhere in this notebook (nd.random_normal is the legacy alias).
x = nd.random.normal(shape=(2,32))
net(x)


[[-0.03452701  0.00054905 -0.01634025  0.15740812 -0.00887376  0.05347699
  -0.05174917 -0.04148688  0.06176401 -0.0059028 ]
 [-0.05516256 -0.08472089  0.01713695  0.1140467  -0.02684359 -0.12606426
  -0.0157811   0.11626006 -0.08061735  0.04699893]]
<NDArray 2x10 @cpu(0)>

In [22]:
print(net[0].name) # name of the layer
w = net[0].weight # both w and b are Parameter objects
b = net[0].bias
w.data(),w.grad() # access the value and the gradient

sequential2_dense0


(
 [[-0.01599531 -0.05768581  0.04749851 ...,  0.00408183 -0.05727024
    0.04672503]
  [-0.03006572 -0.02715722 -0.03812539 ...,  0.01958384  0.06715145
    0.02174592]
  [-0.05624504  0.03595566  0.04757827 ...,  0.05127243 -0.00077434
   -0.04942027]
  ..., 
  [ 0.06597313  0.05342786 -0.03824011 ..., -0.04555389  0.05498707
   -0.04340196]
  [-0.05670377  0.05629475  0.00370745 ...,  0.03403596  0.06027343
    0.00654084]
  [ 0.01427387  0.01173636 -0.02694394 ...,  0.01538847  0.00350107
   -0.04435379]]
 <NDArray 256x32 @cpu(0)>, 
 [[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]
 <NDArray 256x32 @cpu(0)>)

In [23]:
# Print every parameter of the network together with its name, shape and dtype.
print(net.collect_params())

sequential2_ (
  Parameter sequential2_dense0_weight (shape=(256, 32), dtype=<class 'numpy.float32'>)
  Parameter sequential2_dense0_bias (shape=(256,), dtype=<class 'numpy.float32'>)
  Parameter sequential2_dense1_weight (shape=(10, 256), dtype=<class 'numpy.float32'>)
  Parameter sequential2_dense1_bias (shape=(10,), dtype=<class 'numpy.float32'>)
)


## 网络参数的保存与加载

In [24]:
# Write the network's parameters to a file on disk.
filename = "mlp.params"
net.save_params(filename)

In [27]:
import mxnet as mx
# Load the parameters from `filename` back into the network on the CPU context.
net.load_params(filename, mx.cpu())

In [29]:
# Save a single NDArray, a list of NDArrays, and a dict of NDArrays.
# Every call writes to the same `filename`, so each save overwrites the
# previous one and only the dict written last remains in the file.
x = nd.ones(3)
y = nd.zeros(4)
for payload in (x, [x, y], {'x': x, 'y': y}):
    nd.save(filename, payload)

In [30]:
# Read back whatever was written to `filename` by the most recent nd.save call.
print(nd.load(filename))

{'x': 
[ 1.  1.  1.]
<NDArray 3 @cpu(0)>, 'y': 
[ 0.  0.  0.  0.]
<NDArray 4 @cpu(0)>}


## 自定义gluon中的层

In [31]:
class CenteredLayer(nn.Block):
    """Layer without parameters that subtracts the mean of its input."""

    def __init__(self, **kwargs):
        super(CenteredLayer, self).__init__(**kwargs)

    def forward(self, x):
        """Return ``x`` shifted by the value of ``x.mean()``."""
        mean = x.mean()
        return x - mean

In [32]:
# Use the CenteredLayer defined above as the last layer of a network.
net = nn.Sequential()
with net.name_scope():
    for layer in (nn.Dense(128), nn.Dense(10), CenteredLayer()):
        net.add(layer)
print(net)

Sequential(
  (0): Dense(128, linear)
  (1): Dense(10, linear)
  (2): CenteredLayer(
  
  )
)


In [33]:
net.initialize()
y = net(nd.random.uniform(shape=(4,8)))
# The last layer is CenteredLayer, so the mean of the output should be close to zero.
y.mean()


[ -3.25962896e-10]
<NDArray 1 @cpu(0)>

In [34]:
from mxnet import gluon

In [35]:
# Create a standalone Parameter named "my_params" with shape (3, 3).
my_param = gluon.Parameter("my_params", shape=(3,3))

In [37]:
my_param.initialize()
# Show the parameter's value together with its gradient.
my_param.data(),my_param.grad()

(
 [[-0.01241264 -0.06773117  0.06439362]
  [-0.04160211  0.06035588 -0.04256937]
  [-0.05521125 -0.01001626  0.03380133]]
 <NDArray 3x3 @cpu(0)>, 
 [[ 0.  0.  0.]
  [ 0.  0.  0.]
  [ 0.  0.  0.]]
 <NDArray 3x3 @cpu(0)>)

## 有参数的自定义层

In [43]:
class MyDense(nn.Block):
    """Fully connected layer with a built-in ReLU activation.

    Parameters
    ----------
    units : int
        Number of output units.
    in_units : int
        Number of input features.
    **kwargs
        Forwarded to ``nn.Block`` (for example ``prefix``).
    """

    def __init__(self, units, in_units, **kwargs):
        super(MyDense, self).__init__(**kwargs)
        with self.name_scope():
            # Obtain the layer's parameters from this block's parameter dict.
            self.weight = self.params.get('weight', shape=(in_units, units))
            self.bias = self.params.get('bias', shape=(units,))

    def forward(self, x):
        """Return ``relu(dot(x, weight) + bias)``."""
        # Request the parameter copies that live on the input's context, so the
        # layer also works when its parameters are initialized on several devices.
        ctx = x.context
        linear = nd.dot(x, self.weight.data(ctx)) + self.bias.data(ctx)
        return nd.relu(linear)

In [44]:
# Instantiate the layer with an explicit prefix and list its parameters.
dense = MyDense(5, in_units=10, prefix='o_my_dense_')
dense.params

o_my_dense_ (
  Parameter o_my_dense_weight (shape=(10, 5), dtype=<class 'numpy.float32'>)
  Parameter o_my_dense_bias (shape=(5,), dtype=<class 'numpy.float32'>)
)

In [45]:
dense.initialize()
# Forward a (2, 10) input; the second dimension matches in_units=10.
dense(nd.random.uniform(shape=(2,10)))


[[ 0.17594399  0.06958125  0.          0.          0.10372791]
 [ 0.20519701  0.          0.0898647   0.02732412  0.13050345]]
<NDArray 2x5 @cpu(0)>

In [46]:
# Stack two MyDense layers: 64 -> 32 -> 2.
net = nn.Sequential()
with net.name_scope():
    for units, in_units in ((32, 64), (2, 32)):
        net.add(MyDense(units, in_units=in_units))
net.initialize()
batch = nd.random.uniform(shape=(2,64))
net(batch)


[[ 0.          0.04920261]
 [ 0.          0.0471992 ]]
<NDArray 2x2 @cpu(0)>

In [47]:
# Name of the first layer in the network.
net[0].name

'sequential5_mydense0'

In [49]:
# Weight values of the first layer.
net[0].weight.data()


[[ 0.0598914   0.0138801   0.05596199 ..., -0.00225972  0.01277342
   0.06014002]
 [ 0.05154703 -0.04380481 -0.06335194 ..., -0.06190795  0.04354891
  -0.04029946]
 [-0.01228636  0.01871894  0.0484838  ..., -0.01373136  0.06271163
  -0.06682531]
 ..., 
 [-0.06251022 -0.03662002  0.03110864 ...,  0.03333841 -0.02996539
   0.02106849]
 [-0.00096778  0.02576673 -0.06881564 ...,  0.02622904  0.00440957
   0.02369227]
 [ 0.02678076  0.03483683 -0.04148536 ..., -0.02515849 -0.01115513
  -0.01808989]]
<NDArray 64x32 @cpu(0)>

## USE GPU

In [50]:
！nvidia-smi

SyntaxError: invalid character in identifier (<ipython-input-50-dc6b824163ab>, line 1)

In [51]:
# Create data directly on the GPU
a = nd.array([1,2,3],ctx=mx.gpu())
b = nd.zeros(shape=(3,2),ctx=mx.gpu())
c = nd.random.uniform(shape=(2,3), ctx=mx.gpu())
(a,b,c)

(
 [ 1.  2.  3.]
 <NDArray 3 @gpu(0)>, 
 [[ 0.  0.]
  [ 0.  0.]
  [ 0.  0.]]
 <NDArray 3x2 @gpu(0)>, 
 [[ 0.32977498  0.43025011  0.70026755]
  [ 0.77781075  0.29912937  0.39169419]]
 <NDArray 2x3 @gpu(0)>)

In [52]:
# Use a different GPU (device index 2)
nd.array([1,2,3], ctx=mx.gpu(2))


[ 1.  2.  3.]
<NDArray 3 @gpu(2)>

In [54]:
# Copy data between the CPU and the GPU

# Use the nd.random namespace, consistent with the nd.random.uniform calls
# elsewhere in this notebook (nd.random_normal is the legacy alias).
x = nd.random.normal(shape=(3,3))
y = x.copyto(mx.gpu())
z = x.as_in_context(mx.gpu())
(y,z)

(
 [[ 0.56268167 -0.44618151  1.1527468 ]
  [ 0.84410983 -0.36372346 -0.10490948]
  [ 1.83048832  1.16871405 -0.76237744]]
 <NDArray 3x3 @gpu(0)>, 
 [[ 0.56268167 -0.44618151  1.1527468 ]
  [ 0.84410983 -0.36372346 -0.10490948]
  [ 1.83048832  1.16871405 -0.76237744]]
 <NDArray 3x3 @gpu(0)>)

这两个函数的主要区别是：如果源和目标的context一致，as_in_context不会复制数据，而copyto总是会新建内存。

## 在GPU上训练模型

1. 初始化网络时指定设备：`net.initialize(ctx=mx.gpu())`
2. 训练时data和label都需要放在gpu上（例如使用`as_in_context(mx.gpu())`）