## Task 3:

Implement neural embeddings – either the hardcoded version (NumPy only) or the softer version (using PyTorch).

Watch RAM during training, especially for higher batch sizes (>=32)

This is a neural n-gram

### Hardcode
Hardcode version:
* No PyTorch or other ML libraries, only NumPy (this applies at least to the neural embeddings;
PyTorch etc. may be used for the GPT implementation)
    * Can use Counter and defaultdict from Collections
    * We may, but do not have to, hardcode the optimiser (Adam is allowed;
at least SGD is required)
* Measure perplexity
* Implement early stopping with patience (stop when the validation error/loss diverges
from the training error, to avoid overfitting to the training set)
    * Do not need to optimise for patience, but can
    * Save the top-k model checkpoints (as many as reasonably fit on your disk);
each checkpoint file can be named after its validation score and iteration

We want a neural embedding with conditional generation.


### Top-k sampling

In [None]:
import os, numpy as np, time
from collections import deque
from src.bpe import BPETokenizer
from src.neural_ngram import NeuralNGramHard

# ---------------------- Dataset utilities ----------------------
def tokenize_with_bpe(path, tokenizer: BPETokenizer):
    """Read the UTF-8 text file at ``path`` and encode it with ``tokenizer``.

    Args:
        path: Location of the text file to read.
        tokenizer: Object whose ``encode_text`` method is applied to the
            complete file contents.

    Returns:
        Whatever ``tokenizer.encode_text`` returns for the file's text
        (presumably a sequence of token ids -- confirm against
        ``BPETokenizer``).
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    token_ids = tokenizer.encode_text(raw_text)
    return token_ids


def make_ngram_batches(ids, n, batch_size):
    """Yield shuffled mini-batches of n-gram contexts and their targets.

    Every position ``i`` with a full window defines one example: the context
    is ``ids[i:i + n - 1]`` and the target is ``ids[i + n - 1]``. Examples
    are visited in a fresh random order (via ``np.random.shuffle``, i.e. the
    global NumPy RNG) each time the generator is created.

    Args:
        ids: Sequence (list or 1-D array) of token ids.
        n: Order of the n-gram; each context holds ``n - 1`` tokens.
        batch_size: Maximum number of examples per batch; the last batch may
            be smaller.

    Yields:
        Tuples ``(X, Y)`` where ``X`` has shape ``(batch, n - 1)`` and ``Y``
        has shape ``(batch,)``.

    Raises:
        ValueError: If ``n`` or ``batch_size`` is smaller than 1 (raised when
            the generator is first advanced).
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Convert once so every batch is a single vectorised gather instead of
    # a Python-level slice + stack per example.
    tokens = np.asarray(ids)
    context_len = n - 1
    num_examples = len(tokens) - context_len
    if num_examples <= 0:
        # Sequence is too short to contain even one full n-gram.
        return

    order = np.arange(num_examples)
    np.random.shuffle(order)
    offsets = np.arange(context_len)

    for start in range(0, num_examples, batch_size):
        batch = order[start:start + batch_size]
        # Broadcasting (batch, 1) + (n - 1,) gives the index of every context
        # token; fancy indexing returns a fresh (batch, n - 1) array.
        X = tokens[batch[:, None] + offsets]
        Y = tokens[batch + context_len]
        yield X, Y

# ---------------------- Training loop ----------------------
def train_ngram_model(train_ids, val_ids, merges, id_to_token, token_to_id,
                      n=5, embd=32, hidden=128, batch_size=32,
                      lr=0.3, epochs=20, patience=3, top_k=3):
    """Train a ``NeuralNGramHard`` model with early stopping and checkpoints.

    Each epoch runs ``model.train_batch`` over shuffled mini-batches from
    ``make_ngram_batches`` and then evaluates ``model.eval_loss`` on the
    validation ids. Whenever the validation loss improves on the best value
    seen so far, a checkpoint is written under ``checkpoints/``; training
    stops once ``patience`` consecutive epochs bring no improvement.

    Args:
        train_ids: Token ids of the training split.
        val_ids: Token ids of the validation split.
        merges: Unused; kept so existing callers keep working.
        id_to_token: Id-to-token table; only its length is used (vocab size).
        token_to_id: Unused; kept so existing callers keep working.
        n: Order of the n-gram model.
        embd: Embedding size passed to ``NeuralNGramHard``.
        hidden: Hidden size passed to ``NeuralNGramHard``.
        batch_size: Mini-batch size.
        lr: Learning rate passed to ``model.train_batch``.
        epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated
            before stopping.
        top_k: Maximum number of checkpoint files kept on disk. ``None``
            keeps all of them; ``0`` or less disables checkpointing.

    Returns:
        The model in its state after the last executed epoch.
        NOTE(review): this is not necessarily the best-validation state;
        reload the most recent checkpoint if the best weights are needed.
    """
    model = NeuralNGramHard(vocab_size=len(id_to_token), n=n,
                            embd=embd, hidden=hidden)

    checkpointing = top_k is None or top_k > 0
    if checkpointing:
        os.makedirs("checkpoints", exist_ok=True)

    # The validation data never changes, so convert it once up front.
    val_array = np.array(val_ids)

    best_val = float("inf")
    patience_counter = 0
    # Unbounded on purpose: a ``maxlen`` deque would silently forget evicted
    # paths and leave their files behind, so eviction is done by hand below.
    saved_ckpts = deque()

    for epoch in range(1, epochs+1):
        losses = [model.train_batch(X, Y, lr=lr)
                  for X, Y in make_ngram_batches(train_ids, n, batch_size)]
        # An empty loss list (training data shorter than one n-gram) would
        # make np.mean emit a RuntimeWarning; report NaN explicitly instead.
        train_loss = np.mean(losses) if losses else float("nan")
        val_loss = model.eval_loss(val_array, n=n)

        print(f"Epoch {epoch}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")

        if val_loss < best_val:
            best_val = val_loss
            patience_counter = 0
            if checkpointing:
                ckpt_path = f"checkpoints/epoch{epoch}_valloss{val_loss:.4f}.npz"
                model.save(ckpt_path)
                saved_ckpts.append(ckpt_path)
                print(f"  [Saved checkpoint: {ckpt_path}]")
                # Checkpoints are only written on improvement, so the oldest
                # entry is always the worst one: drop it once over budget.
                while top_k is not None and len(saved_ckpts) > top_k:
                    stale_path = saved_ckpts.popleft()
                    try:
                        os.remove(stale_path)
                    except FileNotFoundError:
                        # Already gone (e.g. removed by hand); nothing to do.
                        pass
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    return model

# ---------------------- Perplexity ----------------------
def perplexity(model, ids, n):
    """Return the exponential of the model's evaluation loss on ``ids``.

    Args:
        model: Object exposing ``eval_loss(array, n=...)``.
        ids: Token ids to evaluate on; converted to a NumPy array first.
        n: Order of the n-gram, forwarded to ``eval_loss``.

    Returns:
        ``np.exp`` of the loss. This is a perplexity provided ``eval_loss``
        returns a mean negative log-likelihood in nats -- TODO confirm.
    """
    token_array = np.array(ids)
    mean_loss = model.eval_loss(token_array, n=n)
    return np.exp(mean_loss)

# ---------------------- Run ----------------------
if __name__ == "__main__":
    # Restore the previously trained BPE tokenizer from its JSON artifact.
    tok = BPETokenizer.load_json("artifacts/bpe_tokenizer.json")

    # Encode the corpus splits, in train / validation / test order.
    train_ids, val_ids, test_ids = [
        tokenize_with_bpe(corpus_path, tok)
        for corpus_path in (
            "Corpus/shakespeare_clean_train.txt",
            "Corpus/shakespeare_clean_valid.txt",
            "Corpus/shakespeare_clean_test.txt",
        )
    ]

    # Fit the neural 5-gram model; remaining hyper-parameters use defaults.
    model = train_ngram_model(
        train_ids,
        val_ids,
        merges=tok.merges,
        id_to_token=tok.id_to_token,
        token_to_id=tok.token_to_id,
        n=5,
        embd=32,
    )

    # Report perplexity on the held-out test split.
    ppl = perplexity(model, test_ids, n=5)
    print(f"Test Perplexity = {ppl:.2f}")



Epoch 1: train=4.0273, val=3.4477
  [Saved checkpoint: checkpoints/epoch1_valloss3.4477.npz]
Epoch 2: train=3.2619, val=3.1913
  [Saved checkpoint: checkpoints/epoch2_valloss3.1913.npz]
Epoch 3: train=3.1048, val=3.1069
  [Saved checkpoint: checkpoints/epoch3_valloss3.1069.npz]
Epoch 4: train=3.0451, val=3.0853
  [Saved checkpoint: checkpoints/epoch4_valloss3.0853.npz]
Epoch 5: train=3.0120, val=3.0597
  [Saved checkpoint: checkpoints/epoch5_valloss3.0597.npz]
Epoch 6: train=2.9899, val=3.0395
  [Saved checkpoint: checkpoints/epoch6_valloss3.0395.npz]
Epoch 7: train=2.9756, val=3.0288
  [Saved checkpoint: checkpoints/epoch7_valloss3.0288.npz]
Epoch 8: train=2.9634, val=3.0254
  [Saved checkpoint: checkpoints/epoch8_valloss3.0254.npz]
Epoch 9: train=2.9541, val=3.0224
  [Saved checkpoint: checkpoints/epoch9_valloss3.0224.npz]
Epoch 10: train=2.9471, val=3.0114
  [Saved checkpoint: checkpoints/epoch10_valloss3.0114.npz]
Epoch 11: train=2.9404, val=3.0205
Epoch 12: train=2.9364, val=3.004

In [8]:
def generate_text(model, tokenizer, start_words: list, n=5, max_tokens=100, top_k=None, temperature=1.0):
    """Generate text from a trained n-gram model, conditioned on start words.

    The start words are encoded to tokens with ``tokenizer.encode_words``
    and mapped to ids; tokens missing from ``tokenizer.token_to_id`` are
    silently dropped. The model is then queried repeatedly with the last
    ``n - 1`` ids as context until ``max_tokens`` ids have been produced or
    the ``"<eos>"`` token is generated.

    Args:
        model: Object exposing
            ``generate_next(context, top_k=..., temperature=...)``.
        tokenizer: Object exposing ``encode_words``, ``decode_tokens``,
            ``token_to_id`` and ``id_to_token``.
        start_words: Words that seed the generation.
        n: Order of the n-gram model; the context holds ``n - 1`` ids.
        max_tokens: Maximum number of ids to generate.
        top_k: Forwarded to ``model.generate_next``.
        temperature: Forwarded to ``model.generate_next``.

    Returns:
        The decoded words (seed plus generated ids, including a final
        ``"<eos>"`` if one was produced) joined by single spaces.
    """
    token_to_id = tokenizer.token_to_id
    id_to_token = tokenizer.id_to_token

    # Encode start words to BPE tokens, then to ids (unknown tokens skipped).
    start_tokens = tokenizer.encode_words(start_words)
    generated_ids = [token_to_id[t] for t in start_tokens if t in token_to_id]

    # Clamp at zero: with n == 1 a plain ``[-(n-1):]`` slice is ``[-0:]`` and
    # would hand the model the entire history instead of an empty context.
    context_len = max(n - 1, 0)

    for _ in range(max_tokens):
        history_len = len(generated_ids)
        context = generated_ids[history_len - min(context_len, history_len):]
        if len(context) < context_len:
            # Left-pad short histories with id 0.
            # NOTE(review): assumes id 0 is a padding/neutral token -- confirm.
            context = [0] * (context_len - len(context)) + context

        next_id = model.generate_next(context, top_k=top_k, temperature=temperature)
        generated_ids.append(next_id)

        # Stop if <eos> token generated
        if id_to_token[next_id] == "<eos>":
            break

    # Decode token IDs to BPE tokens, then to words
    decoded_words = tokenizer.decode_tokens([id_to_token[i] for i in generated_ids])
    return " ".join(decoded_words)


In [9]:
# Seed the generator with a short prompt and sample a continuation.
start_context = ["to", "be"]
text = generate_text(
    model,
    tok,
    start_words=start_context,
    n=5,
    max_tokens=50,
    top_k=10,
    temperature=0.8,
)
print(text)


to be of the charmian the certain of a bassander of his sopt and crownd plead to the crow and the chaste
