In [None]:
import numpy as np

# Seed the global NumPy RNG first so the weight draws below are repeatable.
np.random.seed(1)

# Model and optimisation settings
vocab_size = 5        # token ids 0..4
hidden_size = 10      # width of the recurrent hidden state
learning_rate = 0.1   # step size of the gradient-descent update
epochs = 200          # number of passes over the training sequence

# Training data: each target is the input token plus one
# (inputs 0, 1, 2, 3 -> targets 1, 2, 3, 4).
X = list(range(4))
Y = [token + 1 for token in X]

# One-hot encoding
def one_hot(idx, size):
    """Return a ``(size, 1)`` float column vector that is 1 at row ``idx`` and 0 elsewhere."""
    column = np.zeros(shape=(size, 1))
    column[idx, 0] = 1
    return column

# Initialize parameters.
# Weight matrices get small Gaussian values (standard normal scaled by 0.01);
# the draw order Wxh -> Whh -> Why is kept so the seeded values are unchanged.
Wxh = 0.01 * np.random.standard_normal(size=(hidden_size, vocab_size))    # input  -> hidden
Whh = 0.01 * np.random.standard_normal(size=(hidden_size, hidden_size))   # hidden -> hidden
Why = 0.01 * np.random.standard_normal(size=(vocab_size, hidden_size))    # hidden -> output
# Biases start at zero, stored as column vectors.
bh = np.zeros(shape=(hidden_size, 1))
by = np.zeros(shape=(vocab_size, 1))

# Training loop: each epoch runs one forward pass over the whole sequence,
# backpropagates through time, and applies one plain gradient-descent step.
for epoch in range(epochs):
    # Every epoch starts from a zero hidden state.
    h = np.zeros((hidden_size, 1))
    loss = 0

    # Store per-time-step values for backprop:
    # inputs, hidden states, logits, and softmax probabilities.
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = h  # initial state; read as hs[t-1] when t == 0

    # Forward pass
    for t, token in enumerate(X):
        xs[t] = one_hot(token, vocab_size)
        h = np.tanh(Wxh @ xs[t] + Whh @ hs[t-1] + bh)
        y = Why @ h + by
        # Numerically stable softmax: subtracting the max logit leaves the
        # probabilities unchanged but keeps np.exp from overflowing to inf
        # (which would turn p, the loss, and every gradient into NaN).
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        loss -= np.log(p[Y[t], 0])   # cross entropy of the target token
        hs[t], ys[t], ps[t] = h, y, p

    # Backpropagation through time
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dh_next = np.zeros_like(h)  # gradient flowing into h from the following step

    for t in reversed(range(len(X))):
        # Gradient of the cross-entropy loss w.r.t. the logits y:
        # softmax probabilities minus the one-hot target.
        dy = np.copy(ps[t])
        dy[Y[t]] -= 1
        dWhy += dy @ hs[t].T
        dby += dy
        # Hidden state receives gradient from the output layer and from step t+1.
        dh = Why.T @ dy + dh_next
        # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2, and hs[t] = tanh(z).
        dh_raw = (1 - hs[t] * hs[t]) * dh
        dbh += dh_raw
        dWxh += dh_raw @ xs[t].T
        dWhh += dh_raw @ hs[t-1].T
        dh_next = Whh.T @ dh_raw

    # Update weights. The in-place `-=` mutates the module-level arrays;
    # rebinding (`param = param - ...`) would only change the loop variable.
    for param, dparam in zip([Wxh, Whh, Why, bh, by],
                             [dWxh, dWhh, dWhy, dbh, dby]):
        param -= learning_rate * dparam

    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# Test prediction: feed the training inputs through the trained network one
# step at a time and print the most probable output token at each step.
print("\nTesting prediction:")
h = np.zeros((hidden_size, 1))  # start from a zero hidden state, as in training
for t, token in enumerate(X):
    x = one_hot(token, vocab_size)
    h = np.tanh(Wxh @ x + Whh @ h + bh)
    y = Why @ h + by
    # Numerically stable softmax: shifting by the max logit avoids overflow in
    # np.exp; with NaN probabilities argmax would silently report index 0.
    exp_y = np.exp(y - np.max(y))
    p = exp_y / np.sum(exp_y)
    pred = np.argmax(p)
    print(f"Input {token} → Predicted {pred}, True {Y[t]}")


Epoch 0, Loss: 6.4376
Epoch 50, Loss: 0.6656
Epoch 100, Loss: 0.0741
Epoch 150, Loss: 0.0370

Testing prediction:
Input 0 → Predicted 1, True 1
Input 1 → Predicted 2, True 2
Input 2 → Predicted 3, True 3
Input 3 → Predicted 4, True 4
