In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

Things you should automatically know and have memorized
- N = number of samples
- T = sequence length
- D = number of input features
- M = number of hidden units
- K = number of output units

In [2]:
# Make some data
# Fixed seed so that Restart & Run All regenerates exactly the same X
SEED = 0
np.random.seed(SEED)

N = 1   # number of samples
T = 10  # sequence length
D = 3   # number of input features
M = 5   # number of hidden units
K = 2   # number of output units

# one batch of N sequences, each with T steps of D standard-normal features
X = np.random.randn(N, T, D)

In [3]:
# Inspect the generated input: N = 1 sample of T = 10 steps with D = 3 features
X

array([[[-0.2145126 , -0.82924034,  0.71837623],
        [ 0.20484409,  1.4874654 ,  0.41106831],
        [-1.43036434, -1.49053851, -0.10232902],
        [ 0.70392115, -1.01964812,  0.23025271],
        [-0.86756248,  1.41292937, -0.58304221],
        [ 1.07832312,  0.10191409, -0.13672317],
        [ 0.93970527, -1.52924849,  0.25860559],
        [-1.04300131,  0.95800718,  0.96767659],
        [ 0.27456985, -0.43342495,  0.81013363],
        [-0.30134274,  0.27743486, -0.02347497]]])

In [4]:
# Make an RNN
class SimpleRNN(nn.Module):
  """Single-layer tanh RNN followed by a linear layer applied at every time step.

  Args:
    n_inputs: number of input features (D).
    n_hidden: number of hidden units (M).
    n_outputs: number of output units (K).
  """

  def __init__(self, n_inputs, n_hidden, n_outputs):
    super().__init__()
    self.D = n_inputs
    self.M = n_hidden
    self.K = n_outputs
    self.rnn = nn.RNN(
        input_size=self.D,
        hidden_size=self.M,
        nonlinearity='tanh',
        batch_first=True)
    self.fc = nn.Linear(self.M, self.K)

  def forward(self, X):
    """Map a batch of sequences to one output vector per time step.

    Args:
      X: tensor of shape (N, T, D); the batch dimension comes first because
        the RNN is built with batch_first=True.

    Returns:
      Tensor of shape (N, T, K).
    """
    # initial hidden state: zeros of shape (num_layers, N, M).
    # Built with X's dtype and device so the model keeps working after
    # model.double() or model.to(device) instead of failing on a
    # float32 / CPU h0.
    h0 = torch.zeros(
        self.rnn.num_layers, X.size(0), self.M,
        dtype=X.dtype, device=X.device)

    # get RNN unit output: h(t) for every time step, shape (N, T, M)
    out, _ = self.rnn(X, h0)

    # apply the output layer at every time step -> (N, T, K)
    # (to keep only the final step h(T), use self.fc(out[:, -1, :]) instead)
    out = self.fc(out)
    return out

In [5]:
# Instantiate the model
# Seed torch first: the layer weights are randomly initialised, so without a
# seed every kernel restart yields different parameters and outputs.
torch.manual_seed(0)
model = SimpleRNN(n_inputs=D, n_hidden=M, n_outputs=K)

In [6]:
# Run the forward pass.
# X comes out of NumPy as float64, so cast it to float32 before feeding the model.
inputs = torch.from_numpy(X).float()
out = model(inputs)
out

tensor([[[-0.2239,  0.0630],
         [ 0.3475,  0.3545],
         [ 0.0122,  0.2874],
         [-0.2066, -0.0498],
         [ 0.3012,  0.3138],
         [-0.0663, -0.0452],
         [-0.0854, -0.1220],
         [ 0.2727,  0.4448],
         [ 0.1028,  0.2486],
         [ 0.2602,  0.3204]]], grad_fn=<AddBackward0>)

In [7]:
# The linear layer is applied at every time step, so the shape is (N, T, K)
out.shape

torch.Size([1, 10, 2])

In [8]:
# Save for later: detach from the autograd graph and convert to a NumPy array
# so the hand-computed result can be compared against it
Yhats_torch = out.detach().numpy()

In [9]:
# Fetch the RNN parameters by attribute name instead of by position
W_xh = model.rnn.weight_ih_l0  # input-to-hidden weights
W_hh = model.rnn.weight_hh_l0  # hidden-to-hidden weights
b_xh = model.rnn.bias_ih_l0    # input-to-hidden bias
b_hh = model.rnn.bias_hh_l0    # hidden-to-hidden bias

In [10]:
# Input-to-hidden weight matrix; expect (M, D) = (5, 3)
W_xh.shape

torch.Size([5, 3])

In [11]:
# At this point W_xh is still a torch Parameter (it is converted to NumPy below)
W_xh

Parameter containing:
tensor([[-0.3133, -0.1370,  0.1228],
        [-0.4295,  0.3651, -0.1348],
        [ 0.0151, -0.0023, -0.1058],
        [-0.1520,  0.0842, -0.4323],
        [ 0.2072,  0.2821,  0.2544]], requires_grad=True)

In [12]:
# Convert the input-to-hidden weights to a NumPy array.
# Read the value from the module rather than rebinding W_xh from itself, so
# the cell gives the same result however often it is re-run (calling
# `.data.numpy()` on a W_xh that is already an ndarray raises AttributeError).
W_xh = model.rnn.weight_ih_l0.detach().numpy()
W_xh

array([[-0.31327462, -0.13696897,  0.12282461],
       [-0.42951876,  0.3650694 , -0.13484144],
       [ 0.01510549, -0.00233454, -0.10581511],
       [-0.15204582,  0.08415651, -0.43229157],
       [ 0.20716923,  0.28213263,  0.25436997]], dtype=float32)

In [13]:
# Convert the remaining RNN parameters to NumPy arrays.
# Each value is read from the module (not from the variable being rebound),
# so re-running this cell is safe.
b_xh = model.rnn.bias_ih_l0.detach().numpy()
W_hh = model.rnn.weight_hh_l0.detach().numpy()
b_hh = model.rnn.bias_hh_l0.detach().numpy()

In [14]:
# Did we do it right? Expect (M, D), (M,), (M, M), (M,) with M = 5, D = 3
W_xh.shape, b_xh.shape, W_hh.shape, b_hh.shape

((5, 3), (5,), (5, 5), (5,))

In [15]:
# Now get the FC layer weight matrix and bias vector, by attribute name
Wo, bo = model.fc.weight, model.fc.bias

In [16]:
# Convert the FC parameters to NumPy arrays.
# Each value is read from the module (not from the variable being rebound),
# so re-running this cell is safe.
Wo = model.fc.weight.detach().numpy()
bo = model.fc.bias.detach().numpy()
Wo.shape, bo.shape

((2, 5), (2,))

In [17]:
# Replicate the PyTorch forward pass by hand with NumPy
x = X[0] # the one and only sample: T rows of D features
Yhats = np.zeros((T, K)) # one output row per time step
h_last = np.zeros(M) # h(0) is all zeros, like the h0 passed to the RNN

for t, x_t in enumerate(x):
  # recurrent update: h(t) = tanh(W_xh x(t) + b_xh + W_hh h(t-1) + b_hh)
  a = x_t.dot(W_xh.T) + b_xh
  a = a + h_last.dot(W_hh.T) + b_hh
  h = np.tanh(a)

  # the output layer is applied at every step and stored in row t
  y = h.dot(Wo.T) + bo
  Yhats[t] = y

  # important: carry h forward as the previous state for the next step
  h_last = h

# print the outputs for all time steps
print(Yhats)

[[-0.22385887  0.06295842]
 [ 0.347535    0.35454032]
 [ 0.01222177  0.28742997]
 [-0.20658184 -0.04980624]
 [ 0.30122686  0.31377456]
 [-0.06628435 -0.04518584]
 [-0.08540991 -0.1220065 ]
 [ 0.27268155  0.44480194]
 [ 0.10280821  0.24861215]
 [ 0.2602204   0.32042775]]


In [18]:
# Check: the NumPy re-implementation should agree with the PyTorch output
# to within np.allclose's default tolerances
np.allclose(Yhats, Yhats_torch)

True

In [None]:
# Bonus exercise: calculate the output for multiple samples at once (N > 1)