# Model Parameters
How to access, initialize, and share model parameters.

In [2]:
from mxnet import init, nd
from mxnet.gluon import nn 

# Build a multilayer perceptron: a 256-unit ReLU hidden layer followed by a
# 10-unit output layer.
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'),
        nn.Dense(10))
net.initialize()  # no initializer is passed, so the default method is used

# Run a random batch (2 examples, 20 features each) through the network.
X = nd.random.uniform(shape=(2, 20))
Y = net(X)

In [3]:
# Show the random input batch X and the network output Y = net(X).
X, Y

(
 [[0.5488135  0.5928446  0.71518934 0.84426576 0.60276335 0.8579456
   0.5448832  0.8472517  0.4236548  0.6235637  0.6458941  0.3843817
   0.4375872  0.2975346  0.891773   0.05671298 0.96366274 0.2726563
   0.3834415  0.47766513]
  [0.79172504 0.8121687  0.5288949  0.47997716 0.56804454 0.3927848
   0.92559665 0.83607876 0.07103606 0.33739617 0.08712929 0.6481719
   0.0202184  0.36824155 0.83261985 0.95715517 0.77815676 0.14035077
   0.87001216 0.87008727]]
 <NDArray 2x20 @cpu(0)>, 
 [[ 0.09543004  0.04614332 -0.00286655 -0.07790346 -0.05130241  0.02942038
    0.08696645 -0.0190793  -0.04122177  0.05088576]
  [ 0.0769287   0.03099706  0.00856576 -0.044672   -0.06926838  0.09132431
    0.06786592 -0.06187843 -0.03436674  0.04234696]]
 <NDArray 2x10 @cpu(0)>)

# Access Model Parameters

In [4]:
# net[0] is the first layer that was added to the Sequential container.
# Show its parameter collection together with the collection's type.
net[0].params, type(net[0].params)

(dense0_ (
   Parameter dense0_weight (shape=(256, 20), dtype=float32)
   Parameter dense0_bias (shape=(256,), dtype=float32)
 ), mxnet.gluon.parameter.ParameterDict)

In [5]:
# The weight and bias attributes give direct access to each Parameter.
net[0].weight, net[0].bias

(Parameter dense0_weight (shape=(256, 20), dtype=float32),
 Parameter dense0_bias (shape=(256,), dtype=float32))

In [6]:
# Calling data() on a Parameter returns its current values as an NDArray.
# No training has been run at this point, so these are the initial values.
net[0].weight.data(), net[0].bias.data()

(
 [[ 0.06700657 -0.00369488  0.0418822  ... -0.05517294 -0.01194733
   -0.00369594]
  [-0.03296221 -0.04391347  0.03839272 ...  0.05636378  0.02545484
   -0.007007  ]
  [-0.0196689   0.01582889 -0.00881553 ...  0.01509629 -0.01908049
   -0.02449339]
  ...
  [ 0.00010955  0.0439323  -0.04911506 ...  0.06975312  0.0449558
   -0.03283203]
  [ 0.04106557  0.05671307 -0.00066976 ...  0.06387014 -0.01292654
    0.00974177]
  [ 0.00297424 -0.0281784  -0.06881659 ... -0.04047417  0.00457048
    0.05696651]]
 <NDArray 256x20 @cpu(0)>, 
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.

In [7]:
# Calling grad() on a Parameter returns its gradient as an NDArray.
# No backward pass has been run yet, so every element is still zero.
net[0].weight.grad()


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 256x20 @cpu(0)>

In [8]:
# Bias values of net[1], the second layer that was added.
net[1].bias.data()


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 10 @cpu(0)>

In [9]:
# collect_params() gathers all parameters of this multilayer perceptron.
net.collect_params()

sequential0_ (
  Parameter dense0_weight (shape=(256, 20), dtype=float32)
  Parameter dense0_bias (shape=(256,), dtype=float32)
  Parameter dense1_weight (shape=(10, 256), dtype=float32)
  Parameter dense1_bias (shape=(10,), dtype=float32)
)

In [10]:
# Select only the weight parameters. Their names generally end with 'weight'
# (for example dense0_weight and dense1_weight), so the regular expression
# '.*weight' matches all of them.
net.collect_params('.*weight')

sequential0_ (
  Parameter dense0_weight (shape=(256, 20), dtype=float32)
  Parameter dense1_weight (shape=(10, 256), dtype=float32)
)

# Share Model Parameters

In [14]:
net = nn.Sequential()
# Create the layer whose parameters will be reused before adding any layers.
shared = nn.Dense(8, activation='relu')

net.add(nn.Dense(8, activation='relu'))
net.add(shared)
# Passing params=shared.params makes this layer reuse the parameters of `shared`.
net.add(nn.Dense(8, activation='relu', params=shared.params))
net.add(nn.Dense(10))

net.initialize()

X = nd.random.uniform(shape=(2, 20))
Y = net(X)

# Element-wise comparison of the second and third layers' weights: every entry
# is 1 because the two layers share the same weight parameter. The bias is
# shared in the same way.
net[1].weight.data() == net[2].weight.data()


[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 8x8 @cpu(0)>