In [1]:
import warnings

import numpy as np

In [5]:
class SimpleRNN:
    """Forward-only vanilla recurrent layer.

    Each timestep computes ``h_t = activation(x_t @ Wx + h_{t-1} @ Wh + b)``.

    Attributes
    ----------
    Wx : ndarray, shape (n_features, n_nodes)
        Input-to-hidden weights.
    Wh : ndarray, shape (n_nodes, n_nodes)
        Hidden-to-hidden weights.
    b : ndarray, shape (n_nodes,)
        Bias added to every pre-activation.
    h_list : list of ndarray
        Hidden states of the most recent ``forward`` call, one per timestep.
    """

    def __init__(self, n_features, n_nodes, activation=np.tanh):
        """Store the layer sizes and draw the initial parameters.

        Parameters
        ----------
        n_features : int
            Length of each input vector.
        n_nodes : int
            Length of the hidden state.
        activation : callable, optional
            Function applied to the pre-activation; defaults to ``np.tanh``.
        """
        self.n_features, self.n_nodes = n_features, n_nodes
        self.activation = activation

        # Weights start as standard-normal draws scaled by 0.01; bias starts at zero.
        init_scale = 0.01
        self.Wx = init_scale * np.random.randn(n_features, n_nodes)
        self.Wh = init_scale * np.random.randn(n_nodes, n_nodes)
        self.b = np.zeros(n_nodes)

    def forward(self, x, h0=None):
        """Run the recurrence over every timestep of ``x``.

        Parameters
        ----------
        x : ndarray, shape (batch_size, n_sequences, n_features)
            Input sequences.
        h0 : ndarray, shape (batch_size, n_nodes), optional
            Initial hidden state; all zeros when omitted.

        Returns
        -------
        tuple
            ``(h_last, h_list)``: the hidden state after the final timestep and
            the list of hidden states for all timesteps (also kept on
            ``self.h_list``).
        """
        n_batch, n_steps, _ = x.shape

        # Start from the caller's state if given, otherwise from zeros.
        state = h0 if h0 is not None else np.zeros((n_batch, self.n_nodes))

        # One entry per timestep, exposed on the instance as well.
        history = self.h_list = []

        for step in range(n_steps):
            inputs = x[:, step, :]  # (batch_size, n_features)
            pre_activation = inputs @ self.Wx + state @ self.Wh + self.b
            state = self.activation(pre_activation)
            history.append(state)

        return state, history


In [7]:
# Hand-picked toy inputs and weights so the forward pass can be checked by hand.
x = np.array([[[1, 2], [2, 3], [3, 4]]]) / 100  # (batch_size, n_sequences, n_features)
w_x = np.array([[1, 3, 5, 7], [3, 5, 7, 8]]) / 100  # (n_features, n_nodes)
w_h = np.array(
    [
        [1, 3, 5, 7],
        [2, 4, 6, 8],
        [3, 5, 7, 8],
        [4, 6, 8, 10],
    ]
) / 100  # (n_nodes, n_nodes)

# Dimensions are read back from the arrays instead of being typed in again.
batch_size, n_sequences, n_features = x.shape  # 1, 3, 2
n_nodes = w_x.shape[1]  # 4

b = np.array([1, 1, 1, 1])  # (n_nodes,)
h0 = np.zeros((batch_size, n_nodes))  # (batch_size, n_nodes)

In [8]:
# No free function named `forward` is defined in the visible cells, so the bare
# call only worked through leftover kernel state and fails on a fresh kernel.
# Run the same check through the class with the hand-picked parameters instead.
rnn_check = SimpleRNN(n_features, n_nodes)
rnn_check.Wx, rnn_check.Wh, rnn_check.b = w_x, w_h, b
rnn_check.forward(x, h0)[0]

array([[0.79494228, 0.81839002, 0.83939649, 0.85584174]])

In [9]:
# Build the layer, then replace its random parameters with the hand-picked ones.
rnn = SimpleRNN(n_features, n_nodes)
rnn.Wx, rnn.Wh, rnn.b = w_x, w_h, b

# Forward pass from the explicit zero initial state.
h_final, h_list = rnn.forward(x, h0)
print("Final hidden state:\n", h_final)

Final hidden state:
 [[0.79494228 0.81839002 0.83939649 0.85584174]]


In [10]:
class SimpleRNN:
    """Vanilla recurrent layer with a forward pass and backpropagation through time.

    Each timestep computes ``h_t = activation(x_t @ Wx + h_{t-1} @ Wh + b)``.

    Parameters
    ----------
    n_features : int
        Length of each input vector.
    n_nodes : int
        Length of the hidden state.
    activation : callable, optional
        Element-wise function applied to the pre-activation; defaults to
        ``np.tanh``.
    activation_grad : callable, optional
        Element-wise derivative of ``activation``, evaluated at the
        pre-activation ``a_t``. When omitted, ``backward`` uses the tanh
        derivative ``1 - tanh(a_t) ** 2`` and warns if ``activation`` is not
        ``np.tanh``.

    Attributes
    ----------
    Wx : ndarray, shape (n_features, n_nodes)
        Input-to-hidden weights.
    Wh : ndarray, shape (n_nodes, n_nodes)
        Hidden-to-hidden weights.
    b : ndarray, shape (n_nodes,)
        Bias added to every pre-activation.
    """

    def __init__(self, n_features, n_nodes, activation=np.tanh, activation_grad=None):
        self.n_features = n_features
        self.n_nodes = n_nodes
        self.activation = activation
        self.activation_grad = activation_grad

        # Parameters: small random weights, zero bias.
        self.Wx = np.random.randn(n_features, n_nodes) * 0.01
        self.Wh = np.random.randn(n_nodes, n_nodes) * 0.01
        self.b = np.zeros(n_nodes)

    def forward(self, x, h0=None):
        """Run the recurrence over every timestep and cache what ``backward`` needs.

        Parameters
        ----------
        x : ndarray, shape (batch_size, n_sequences, n_features)
            Input sequences.
        h0 : ndarray, shape (batch_size, n_nodes), optional
            Initial hidden state; all zeros when omitted.

        Returns
        -------
        tuple
            ``(h_last, h_all)``: the hidden state after the final timestep and
            the list of hidden states for timesteps ``0 .. n_sequences - 1``.
        """
        batch_size, n_sequences, _ = x.shape

        if h0 is None:
            h_t = np.zeros((batch_size, self.n_nodes))
        else:
            h_t = h0

        # h_list[0] holds the initial state, so h_list[t] is the state that
        # *entered* timestep t; a_list[t] is the pre-activation of timestep t.
        self.h_list = [h_t]
        self.a_list = []
        self.x = x

        for t in range(n_sequences):
            x_t = x[:, t, :]  # (batch_size, n_features)

            a_t = x_t @ self.Wx + h_t @ self.Wh + self.b
            h_t = self.activation(a_t)

            self.a_list.append(a_t)
            self.h_list.append(h_t)

        return h_t, self.h_list[1:]  # last hidden + all h_t

    @staticmethod
    def _tanh_grad(a):
        """Derivative of tanh evaluated at the pre-activation ``a``."""
        return 1 - np.tanh(a) ** 2

    @staticmethod
    def _float_zeros_like(arr):
        """Zero buffer shaped like ``arr`` that can always hold real-valued gradients.

        Float and complex dtypes are kept; any other dtype (e.g. an integer
        bias) is promoted to float64 so in-place accumulation cannot fail on a
        dtype cast.
        """
        arr = np.asarray(arr)
        dtype = arr.dtype if np.issubdtype(arr.dtype, np.inexact) else np.float64
        return np.zeros(arr.shape, dtype=dtype)

    def _activation_derivative(self):
        """Pick the derivative function ``backward`` multiplies into the gradient."""
        grad_fn = getattr(self, "activation_grad", None)
        if grad_fn is not None:
            return grad_fn
        if self.activation is not np.tanh:
            # Keep the historical behaviour, but do not stay silent about it.
            warnings.warn(
                "SimpleRNN.backward: no activation_grad was supplied for a custom "
                "activation; falling back to the tanh derivative.",
                RuntimeWarning,
                stacklevel=3,
            )
        return self._tanh_grad

    def backward(self, dh_last, learning_rate=0.01):
        """Backpropagate through time and apply one gradient-descent step.

        Must be called after ``forward``, whose cached inputs, pre-activations
        and hidden states it reads.

        Parameters
        ----------
        dh_last : ndarray, shape (batch_size, n_nodes)
            Gradient of the loss with respect to the final hidden state.
        learning_rate : float, optional
            Step size of the parameter update.

        Returns
        -------
        tuple
            ``(dx, dh0, dWx, dWh, db)``: gradient with respect to the inputs,
            gradient with respect to the initial hidden state, and the
            batch-averaged parameter gradients that were applied.

        Raises
        ------
        RuntimeError
            If ``forward`` has not been called on this instance yet.
        """
        if not hasattr(self, "a_list"):
            raise RuntimeError("backward() requires a prior call to forward()")

        batch_size, n_sequences, _ = self.x.shape
        grad_fn = self._activation_derivative()

        # Gradient buffers are floating point even when a parameter or the
        # input was supplied as an integer array.
        dWx = self._float_zeros_like(self.Wx)
        dWh = self._float_zeros_like(self.Wh)
        db = self._float_zeros_like(self.b)

        dx = self._float_zeros_like(self.x)
        dh_next = dh_last  # starts from loss gradient at last timestep

        # Looping backwards through time
        for t in reversed(range(n_sequences)):
            a_t = self.a_list[t]
            h_prev = self.h_list[t]  # state that entered timestep t
            x_t = self.x[:, t, :]

            # Gradient wrt pre-activation
            da_t = grad_fn(a_t) * dh_next

            # Accumulate grads
            dWx += x_t.T @ da_t
            dWh += h_prev.T @ da_t
            db += da_t.sum(axis=0)

            # Gradients to pass backward
            dx[:, t, :] = da_t @ self.Wx.T
            dh_next = da_t @ self.Wh.T

        # Normalize by batch size
        dWx /= batch_size
        dWh /= batch_size
        db /= batch_size

        # Update params by rebinding rather than in place: arrays the caller
        # assigned to Wx/Wh/b are left untouched, and an integer-typed
        # parameter is promoted to float instead of raising a casting error.
        self.Wx = self.Wx - learning_rate * dWx
        self.Wh = self.Wh - learning_rate * dWh
        self.b = self.b - learning_rate * db

        return dx, dh_next, dWx, dWh, db

In [None]:
# No free function named `forward` is defined in the visible cells, so the bare
# call fails on a fresh kernel. Exercise the class defined in the cell above
# with the hand-picked parameters instead.
rnn_bptt = SimpleRNN(n_features, n_nodes)
rnn_bptt.Wx, rnn_bptt.Wh, rnn_bptt.b = w_x, w_h, b
rnn_bptt.forward(x, h0)[0]

