In [1]:
# -----------------------------------------
# BASIC RNN FROM SCRATCH (WITH EXPLANATIONS)
# -----------------------------------------

# We ONLY use numpy for math and matrices.
import numpy as np

# Makes results repeatable (same random numbers each run)
np.random.seed(42)


# ==========================
# 1) DATA PREPARATION
# ==========================

# Our tiny training text
text = "hello"

# Find all unique characters
chars = sorted(list(set(text)))

# Number of different characters
vocab_size = len(chars)

# Map each character -> number (index)
char_to_idx = {ch: i for i, ch in enumerate(chars)}

# Map number -> character (for later, when we generate)
idx_to_char = {i: ch for ch, i in char_to_idx.items()}

# Convert text "hello" -> [h,e,l,l,o] -> [0,1,2,2,3] (example)
data = [char_to_idx[c] for c in text]


# ==========================
# 2) MODEL SETTINGS
# ==========================

# Size of the "memory" inside RNN
hidden_size = 16

# How fast weights change during learning
learning_rate = 0.1


# ==========================
# 3) MODEL WEIGHTS
# ==========================
# These are the values the network will learn.

# input -> hidden matrix
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01

# previous hidden -> new hidden (memory loop)
Whh = np.random.randn(hidden_size, hidden_size) * 0.01

# hidden -> output matrix
Why = np.random.randn(vocab_size, hidden_size) * 0.01

# bias for hidden layer
bh = np.zeros((hidden_size, 1))

# bias for output layer
by = np.zeros((vocab_size, 1))


# ==========================
# 4) HELPER FUNCTIONS
# ==========================

# Turns raw scores -> probabilities (softmax)
def softmax(x):
    e = np.exp(x - np.max(x))  # numerical stability trick
    return e / np.sum(e)

# Create a one-hot vector (like [0,0,1,0])
def one_hot(idx):
    x = np.zeros((vocab_size, 1))
    x[idx] = 1
    return x


# ==========================
# 5) FORWARD + BACKWARD PASS (TRAINING STEP)
# ==========================
# This function:
# 1) runs RNN forward
# 2) calculates loss
# 3) runs backward pass (BPTT)
# 4) returns gradients

def loss_and_grads(inputs, targets, h_prev):

    # Dictionaries to store values for backprop
    xs, hs, ys, ps = {}, {}, {}, {}

    # Previous hidden state (memory)
    hs[-1] = np.copy(h_prev)

    loss = 0

    # ---------- FORWARD ----------
    for t in range(len(inputs)):

        # Convert character id -> one-hot vector
        xs[t] = one_hot(inputs[t])

        # RNN formula:
        # h_t = tanh(Wxh*x + Whh*h_prev + b)
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)

        # Raw output scores
        ys[t] = np.dot(Why, hs[t]) + by

        # Convert to probabilities
        ps[t] = softmax(ys[t])

        # Cross-entropy loss
        loss += -np.log(ps[t][targets[t], 0])

    # ---------- BACKWARD ----------
    # Initialize all gradients as zero
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh  = np.zeros_like(bh)
    dby  = np.zeros_like(by)

    # Gradient flowing backward through time
    dh_next = np.zeros_like(hs[0])

    # Loop backwards through time
    for t in reversed(range(len(inputs))):

        # Start from probability output
        dy = np.copy(ps[t])

        # Subtract 1 from correct class (softmax derivative)
        dy[targets[t]] -= 1

        # Gradients for Why + output bias
        dWhy += np.dot(dy, hs[t].T)
        dby  += dy

        # Backprop through hidden layer
        dh = np.dot(Why.T, dy) + dh_next

        # Derivative of tanh
        dh_raw = (1 - hs[t] * hs[t]) * dh

        # Gradients wrt parameters
        dbh  += dh_raw
        dWxh += np.dot(dh_raw, xs[t].T)
        dWhh += np.dot(dh_raw, hs[t-1].T)

        # Pass gradient backward in time
        dh_next = np.dot(Whh.T, dh_raw)

    # Clip gradients (prevents exploding gradients)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)

    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]


# ==========================
# 6) TRAIN LOOP
# ==========================

h_prev = np.zeros((hidden_size, 1))  # initial memory = zeros

for epoch in range(2000):

    # Inputs: h,e,l,l
    inputs  = data[:-1]

    # Targets: e,l,l,o
    targets = data[1:]

    # Run one training step
    loss, dWxh, dWhh, dWhy, dbh, dby, h_prev = loss_and_grads(inputs, targets, h_prev)

    # Update each parameter using gradient descent
    Wxh -= learning_rate * dWxh
    Whh -= learning_rate * dWhh
    Why -= learning_rate * dWhy
    bh  -= learning_rate * dbh
    by  -= learning_rate * dby

    # Show training progress
    if epoch % 200 == 0:
        print(f"epoch {epoch}, loss = {loss:.4f}")


# ==========================
# 7) GENERATE TEXT
# ==========================
# Now letâ€™s see what the model learned.

def sample(seed_idx, length=5):

    x = one_hot(seed_idx)
    h = np.zeros((hidden_size, 1))
    output = []

    for t in range(length):

        # Forward pass through RNN
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = softmax(y)

        # Pick next char based on probabilities
        idx = np.random.choice(range(vocab_size), p=p.ravel())

        x = one_hot(idx)
        output.append(idx_to_char[idx])

    return "".join(output)


print("\nGenerated text:")
print(sample(char_to_idx['h'], 5))


epoch 0, loss = 5.5459
epoch 200, loss = 0.0132
epoch 400, loss = 0.0057
epoch 600, loss = 0.0036
epoch 800, loss = 0.0026
epoch 1000, loss = 0.0021
epoch 1200, loss = 0.0017
epoch 1400, loss = 0.0015
epoch 1600, loss = 0.0013
epoch 1800, loss = 0.0011

Generated text:
elloe


In [None]:
from google.colab import drive
drive.mount('/content/drive')