In [1]:
import numpy as np
# Print numpy arrays with 3 digits of precision so the vectors compared below stay short.
np.set_printoptions(precision=3)

In [2]:
import mxnet as mx
from mxnet import autograd, nd
from mxnet.gluon import nn, rnn

## Understanding Vanilla RNN

In [5]:
# Single-layer vanilla RNN: 5 input features, 10 hidden units, 'NTC' layout.
n_hidden_state = 10
embedding_input = 5
model = rnn.RNN(
    n_hidden_state,
    1,
    layout='NTC',
    input_size=embedding_input,
    prefix='mdl_',
)
model.collect_params().initialize(mx.init.Xavier(), ctx=mx.cpu())
# Starting hidden state for a batch of 16 sequences.
initial_state = model.begin_state(batch_size=16)

### Hidden state size

In [6]:
# Show the shape of each layer-0 parameter, one per line.
for param_name in (
    'mdl_l0_h2h_weight',
    'mdl_l0_h2h_bias',
    'mdl_l0_i2h_weight',
    'mdl_l0_i2h_bias',
):
    print(model.params[param_name].data().shape)

(10, 10)
(10,)
(10, 5)
(10,)


### $T=1$

In [5]:
# One time-step per sequence: input shape is (batch_size, time_step, embedding_input).
batch_size, time_step = 16, 1
dat = nd.random.normal(shape=(batch_size, time_step, embedding_input))
out, state = model(dat, initial_state)

In [6]:
# Element-wise comparison for sample 0: output at the only time step vs. the returned state.
out[0][0] == state[0][0][0]


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
<NDArray 10 @cpu(0)>

In [7]:
# Pull the layer-0 hidden-to-hidden and input-to-hidden parameters out as NDArrays.
h2h_weight, h2h_bias, i2h_weight, i2h_bias = (
    model.params[key].data()
    for key in (
        'mdl_l0_h2h_weight',
        'mdl_l0_h2h_bias',
        'mdl_l0_i2h_weight',
        'mdl_l0_i2h_bias',
    )
)

In [8]:
# Manual RNN step for sample 0: relu(x W_ih^T + b_ih + W_hh h_0 + b_hh).
pre_activation = (
    nd.dot(dat[0][0], i2h_weight, transpose_b=True) + i2h_bias
    + nd.dot(h2h_weight, initial_state[0][0][0]) + h2h_bias
)
res = nd.relu(pre_activation)

In [9]:
# Manual result, layer output and returned state for sample 0 -- all three should agree.
for vec in (res, out[0][0], state[0][0][0]):
    print(vec.asnumpy())

[0.652 0.103 0.    1.276 0.682 0.77  0.788 0.994 0.    0.764]
[0.652 0.103 0.    1.276 0.682 0.77  0.788 0.994 0.    0.764]
[0.652 0.103 0.    1.276 0.682 0.77  0.788 0.994 0.    0.764]


### $T=2$

In [10]:
# Two time-steps per sequence: input shape is (batch_size, time_step, embedding_input).
batch_size = 16
time_step = 2
dat = nd.random.normal(shape =(batch_size, time_step, embedding_input))
# `out` holds the output at every time step; `state` holds the returned hidden state.
out, state = model(dat, initial_state)

> NOTE: The returned state is the same as the output at the last time step.

In [11]:
# Sample 0: output at the last time step, then the returned state.
for vec in (out[0][1], state[0][0][0]):
    print(vec.asnumpy())

[0.    0.    0.372 0.    0.    0.    0.    0.    0.383 0.   ]
[0.    0.    0.372 0.    0.    0.    0.    0.    0.383 0.   ]


In [12]:
# Read the layer-0 parameters again for the two-step manual calculation.
h2h_weight, h2h_bias, i2h_weight, i2h_bias = (
    model.params[key].data()
    for key in (
        'mdl_l0_h2h_weight',
        'mdl_l0_h2h_bias',
        'mdl_l0_i2h_weight',
        'mdl_l0_i2h_bias',
    )
)

In [13]:
# Step 1 for sample 0: uses the first input vector and the initial hidden state.
pre_activation_t1 = (
    nd.dot(dat[0][0], i2h_weight, transpose_b=True) + i2h_bias
    + nd.dot(h2h_weight, initial_state[0][0][0]) + h2h_bias
)
out_t1 = nd.relu(pre_activation_t1)

In [14]:
# Manual step-1 result vs. the layer's output at step 1 (sample 0).
for vec in (out_t1, out[0][0]):
    print(vec.asnumpy())

[0.947 0.522 0.032 0.    0.    0.    0.    0.463 1.607 0.   ]
[0.947 0.522 0.032 0.    0.    0.    0.    0.463 1.607 0.   ]


In [15]:
# Step 2 for sample 0: uses the second input vector and the step-1 hidden state.
pre_activation_t2 = (
    nd.dot(dat[0][1], i2h_weight, transpose_b=True) + i2h_bias
    + nd.dot(h2h_weight, out_t1) + h2h_bias
)
out_t2 = nd.relu(pre_activation_t2)

In [16]:
# Printed in order: manual calculation, returned state, output at the last time step.
for vec in (out_t2, state[0][0][0], out[0][1]):
    print(vec.asnumpy())

[0.    0.    0.372 0.    0.    0.    0.    0.    0.383 0.   ]
[0.    0.    0.372 0.    0.    0.    0.    0.    0.383 0.   ]
[0.    0.    0.372 0.    0.    0.    0.    0.    0.383 0.   ]


## Bidirectional RNN

In [17]:
# Same single-layer RNN as before, now with a second (reverse) direction.
n_hidden_state = 10
embedding_input = 5
model = rnn.RNN(
    n_hidden_state,
    1,
    layout='NTC',
    input_size=embedding_input,
    bidirectional=True,
    prefix='mdl_',
)
model.collect_params().initialize(mx.init.Xavier(), ctx=mx.cpu())
# list of length 1 with shape (2, 16, 10)
initial_state = model.begin_state(batch_size=16)

In [18]:
# List the parameters: the bidirectional model has an 'l0' set and an 'r0' set.
model.params

mdl_ (
  Parameter mdl_l0_i2h_weight (shape=(10, 5), dtype=<class 'numpy.float32'>)
  Parameter mdl_l0_h2h_weight (shape=(10, 10), dtype=<class 'numpy.float32'>)
  Parameter mdl_l0_i2h_bias (shape=(10,), dtype=<class 'numpy.float32'>)
  Parameter mdl_l0_h2h_bias (shape=(10,), dtype=<class 'numpy.float32'>)
  Parameter mdl_r0_i2h_weight (shape=(10, 5), dtype=<class 'numpy.float32'>)
  Parameter mdl_r0_h2h_weight (shape=(10, 10), dtype=<class 'numpy.float32'>)
  Parameter mdl_r0_i2h_bias (shape=(10,), dtype=<class 'numpy.float32'>)
  Parameter mdl_r0_h2h_bias (shape=(10,), dtype=<class 'numpy.float32'>)
)

In [19]:
# Two time-steps per sequence: input shape is (batch_size, time_step, embedding_input).
batch_size = 16
time_step = 2
dat = nd.random.normal(shape =(batch_size, time_step, embedding_input))
# Run the bidirectional model over both time steps.
out, state = model(dat, initial_state)

In [20]:
# Sample 0 at t=1: the left and right hidden states concatenated (10 + 10 = 20 values).
out[0][0]


[0.231 0.35  0.    0.    0.    0.482 0.    0.    0.755 0.365 0.    0.293
 0.266 0.    0.    0.375 0.    0.    0.079 0.088]
<NDArray 20 @cpu(0)>

In [21]:
# Parameter listing again, as a reference for the names used in the manual calculation.
model.params

mdl_ (
  Parameter mdl_l0_i2h_weight (shape=(10, 5), dtype=<class 'numpy.float32'>)
  Parameter mdl_l0_h2h_weight (shape=(10, 10), dtype=<class 'numpy.float32'>)
  Parameter mdl_l0_i2h_bias (shape=(10,), dtype=<class 'numpy.float32'>)
  Parameter mdl_l0_h2h_bias (shape=(10,), dtype=<class 'numpy.float32'>)
  Parameter mdl_r0_i2h_weight (shape=(10, 5), dtype=<class 'numpy.float32'>)
  Parameter mdl_r0_h2h_weight (shape=(10, 10), dtype=<class 'numpy.float32'>)
  Parameter mdl_r0_i2h_bias (shape=(10,), dtype=<class 'numpy.float32'>)
  Parameter mdl_r0_h2h_bias (shape=(10,), dtype=<class 'numpy.float32'>)
)

In [22]:
# Parameters of the left ('l0') direction.
h2h_l_weight, h2h_l_bias, i2h_l_weight, i2h_l_bias = (
    model.params[key].data()
    for key in (
        'mdl_l0_h2h_weight',
        'mdl_l0_h2h_bias',
        'mdl_l0_i2h_weight',
        'mdl_l0_i2h_bias',
    )
)
# Parameters of the right ('r0') direction.
h2h_r_weight, h2h_r_bias, i2h_r_weight, i2h_r_bias = (
    model.params[key].data()
    for key in (
        'mdl_r0_h2h_weight',
        'mdl_r0_h2h_bias',
        'mdl_r0_i2h_weight',
        'mdl_r0_i2h_bias',
    )
)

### $T = 1$

In [23]:
# Left direction, step 1 for sample 0: first input vector plus the initial left state.
pre_activation_l = (
    nd.dot(dat[0][0], i2h_l_weight, transpose_b=True) + i2h_l_bias
    + nd.dot(h2h_l_weight, initial_state[0][0][0]) + h2h_l_bias
)
out_t1_l = nd.relu(pre_activation_l)
print(out_t1_l)


[0.231 0.35  0.    0.    0.    0.482 0.    0.    0.755 0.365]
<NDArray 10 @cpu(0)>


In [24]:

# Manual right-direction pass for sample 0. The right RNN reads the sequence
# in reverse: t=2 is computed first from the initial state, then t=1 is
# computed from the t=2 result.
# The initial state has shape (2, 16, 10); index [1][0] selects the second
# (right) direction for sample 0, matching how `state[0][1][0]` is read later.
# The earlier indexing [0][1] selected the left direction of sample 1 and
# presumably only gave the same numbers because the initial state is all
# zeros -- TODO confirm begin_state's default.
out_t2_r = nd.relu(nd.dot(dat[0][1], i2h_r_weight, transpose_b = True) + i2h_r_bias \
      + nd.dot(h2h_r_weight, initial_state[0][1][0]) + h2h_r_bias)
out_t1_r = nd.relu(nd.dot(dat[0][0], i2h_r_weight, transpose_b = True) + i2h_r_bias \
      + nd.dot(h2h_r_weight, out_t2_r) + h2h_r_bias)
print(out_t2_r)
print(out_t1_r)


[0.416 1.001 0.163 0.    0.299 0.    0.    0.163 0.    0.   ]
<NDArray 10 @cpu(0)>

[0.    0.293 0.266 0.    0.    0.375 0.    0.    0.079 0.088]
<NDArray 10 @cpu(0)>


In [25]:
# Layer output at t=1 vs. the manual left/right results joined end to end (sample 0).
manual_t1 = nd.concat(out_t1_l, out_t1_r, dim=0)
for vec in (out[0][0], manual_t1):
    print(vec.asnumpy())

[0.231 0.35  0.    0.    0.    0.482 0.    0.    0.755 0.365 0.    0.293
 0.266 0.    0.    0.375 0.    0.    0.079 0.088]
[0.231 0.35  0.    0.    0.    0.482 0.    0.    0.755 0.365 0.    0.293
 0.266 0.    0.    0.375 0.    0.    0.079 0.088]


### $T = 2$

In [26]:
# Left direction, step 2 for sample 0: second input vector plus the left state from step 1.
pre_activation_l = (
    nd.dot(dat[0][1], i2h_l_weight, transpose_b=True) + i2h_l_bias
    + nd.dot(h2h_l_weight, out_t1_l) + h2h_l_bias
)
out_t2_l = nd.relu(pre_activation_l)
print(out_t2_l)


[0.    0.    0.388 0.    0.926 0.106 0.671 0.    0.    0.   ]
<NDArray 10 @cpu(0)>


In [27]:
# Layer output at t=2 vs. the manual left/right results joined end to end (sample 0).
manual_t2 = nd.concat(out_t2_l, out_t2_r, dim=0)
for vec in (out[0][1], manual_t2):
    print(vec.asnumpy())

[0.    0.    0.388 0.    0.926 0.106 0.671 0.    0.    0.    0.416 1.001
 0.163 0.    0.299 0.    0.    0.163 0.    0.   ]
[0.    0.    0.388 0.    0.926 0.106 0.671 0.    0.    0.    0.416 1.001
 0.163 0.    0.299 0.    0.    0.163 0.    0.   ]


> NOTE: Hidden state
>    * The final hidden state for the left (forward) RNN is the one at $t = 2$
>    * The final hidden state for the right (backward) RNN is the one at $t = 1$, because it reads the sequence in reverse

In [28]:
# Manual final states (left at t=2, right at t=1) vs. the returned state for sample 0.
manual_final = nd.concat(out_t2_l, out_t1_r, dim=0)
returned_final = nd.concat(state[0][0][0], state[0][1][0], dim=0)
for vec in (manual_final, returned_final):
    print(vec.asnumpy())

[0.    0.    0.388 0.    0.926 0.106 0.671 0.    0.    0.    0.    0.293
 0.266 0.    0.    0.375 0.    0.    0.079 0.088]
[0.    0.    0.388 0.    0.926 0.106 0.671 0.    0.    0.    0.    0.293
 0.266 0.    0.    0.375 0.    0.    0.079 0.088]


## Stacking

In [4]:
# Two stacked unidirectional RNN layers with 10 hidden units each.
n_hidden_state = 10
embedding_input = 5
model = rnn.RNN(
    n_hidden_state,
    2,
    layout='NTC',
    input_size=embedding_input,
    bidirectional=False,
    prefix='mdl_',
)
model.collect_params().initialize(mx.init.Xavier(), ctx=mx.cpu())
# list of length 1 with shape (2, 16, 10)
initial_state = model.begin_state(batch_size=16)

In [15]:
# Parameters of the first layer ('l0').
i2h_weight_0, h2h_weight_0, i2h_bias_0, h2h_bias_0 = (
    model.params[key].data()
    for key in (
        'mdl_l0_i2h_weight',
        'mdl_l0_h2h_weight',
        'mdl_l0_i2h_bias',
        'mdl_l0_h2h_bias',
    )
)
# Parameters of the second layer ('l1').
i2h_weight_1, h2h_weight_1, i2h_bias_1, h2h_bias_1 = (
    model.params[key].data()
    for key in (
        'mdl_l1_i2h_weight',
        'mdl_l1_h2h_weight',
        'mdl_l1_i2h_bias',
        'mdl_l1_h2h_bias',
    )
)

In [16]:
# Two time-steps per sequence: input shape is (batch_size, time_step, embedding_input).
batch_size = 16
time_step = 2
dat = nd.random.normal(shape =(batch_size, time_step, embedding_input))
# Run the two-layer model over both time steps.
out, state = model(dat, initial_state)

In [50]:
# Sample 0: layer-1 output at time 1, then at time 2.
for step in (0, 1):
    print(out[0][step].asnumpy())

[0.061 0.    0.103 0.238 0.    0.    0.    0.    0.    0.   ]
[0.258 0.086 0.    0.372 0.    0.    0.002 0.212 0.    0.   ]


In [51]:
# Sample 0: returned state of layer 0, then of layer 1 (both at time 2).
for layer in (0, 1):
    print(state[0][layer][0].asnumpy())

[0.    0.008 0.638 0.469 0.171 0.583 0.46  0.141 0.36  0.5  ]
[0.258 0.086 0.    0.372 0.    0.    0.002 0.212 0.    0.   ]


### $T=1$

In [52]:
# Time 1 for sample 0. Layer 0 consumes the raw input; layer 1 consumes
# layer 0's output from the same step. Each layer starts from its own initial state.
layer0_pre = (
    nd.dot(dat[0][0], i2h_weight_0, transpose_b=True) + i2h_bias_0
    + nd.dot(h2h_weight_0, initial_state[0][0][0]) + h2h_bias_0
)
out_0_t1 = nd.relu(layer0_pre)
layer1_pre = (
    nd.dot(out_0_t1, i2h_weight_1, transpose_b=True) + i2h_bias_1
    + nd.dot(h2h_weight_1, initial_state[0][1][0]) + h2h_bias_1
)
out_1_t1 = nd.relu(layer1_pre)

for vec in (out_0_t1, out_1_t1):
    print(vec.asnumpy())

[0.    0.    0.143 0.    0.045 0.    0.179 0.744 0.    0.   ]
[0.061 0.    0.103 0.238 0.    0.    0.    0.    0.    0.   ]


### $T=2$

In [53]:
# Time 2 for sample 0. Each layer now uses its own hidden state from time 1.
layer0_pre = (
    nd.dot(dat[0][1], i2h_weight_0, transpose_b=True) + i2h_bias_0
    + nd.dot(h2h_weight_0, out_0_t1) + h2h_bias_0
)
out_0_t2 = nd.relu(layer0_pre)
layer1_pre = (
    nd.dot(out_0_t2, i2h_weight_1, transpose_b=True) + i2h_bias_1
    + nd.dot(h2h_weight_1, out_1_t1) + h2h_bias_1
)
out_1_t2 = nd.relu(layer1_pre)

for vec in (out_0_t2, out_1_t2):
    print(vec.asnumpy())

[0.    0.008 0.638 0.469 0.171 0.583 0.46  0.141 0.36  0.5  ]
[0.258 0.086 0.    0.372 0.    0.    0.002 0.212 0.    0.   ]
