## Task 3:

Implement neural embeddings – either the hard-coded version (NumPy only) or the softer version (using PyTorch)

Watch RAM during training, especially for higher batch sizes (>=32)

This is a neural n-gram

### Hardcode
Hardcode version:
* No PyTorch and no ML libraries, only NumPy (this applies at least to the neural embeddings;
PyTorch etc. may be used for the GPT implementation)
    * Can use Counter and defaultdict from Collections
    * We can, but do not have to, hard-code the optimiser (Adam may be used;
at least SGD must be used)
* Measure perplexity
* Implement early stopping (when validation error/loss diverges from training
error to avoid overfitting to training set) with patience
    * Do not need to optimise for patience, but can
    * Save the top k model checkpoints (as many as fit reasonably on your disk;
each file can be named after its validation score and iteration)

We want a neural embedding with conditional generation.


### Top-k sampling

In [None]:
import os, numpy as np, time
from collections import deque
from src.bpe import BPETokenizer
from src.neural_ngram import NeuralNGramHard

# ---------------------- Dataset utilities ----------------------
def tokenize_with_bpe(path, tokenizer: BPETokenizer):
    """Read a UTF-8 text file and encode its whole contents with `tokenizer`.

    Args:
        path: Location of the text file to read.
        tokenizer: Object whose `encode_text` method is applied to the file text.

    Returns:
        Whatever `tokenizer.encode_text` returns for the file contents.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    encoded = tokenizer.encode_text(contents)
    return encoded


def make_ngram_batches(ids, n, batch_size):
    """Yield shuffled mini-batches of n-gram contexts and targets.

    Every position i with a full window contributes one example: the context
    ids[i : i+n-1] and the target ids[i+n-1]. The example order is shuffled
    with the global NumPy RNG each time the generator is consumed.

    Args:
        ids: Sequence (list or 1-D array) of token ids.
        n: n-gram order; each context holds n-1 tokens.
        batch_size: Maximum number of examples per batch; must be positive.

    Yields:
        (X, Y) where X has shape (B, n-1) and Y has shape (B,), with
        B <= batch_size (only the last batch may be smaller).

    Raises:
        ValueError: If batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    # Convert once so windows can be gathered with array indexing instead of
    # slicing the sequence once per example.
    ids_arr = np.asarray(ids)
    num_examples = len(ids_arr) - (n - 1)
    if num_examples <= 0:
        # Sequence is too short to contain a single full n-gram.
        return

    order = np.arange(num_examples)
    np.random.shuffle(order)
    offsets = np.arange(n - 1)

    for start in range(0, num_examples, batch_size):
        batch = order[start:start + batch_size]
        # Broadcasting (B, 1) + (n-1,) gives the (B, n-1) grid of context positions.
        X = ids_arr[batch[:, None] + offsets]
        Y = ids_arr[batch + (n - 1)]
        yield X, Y

# ---------------------- Training loop ----------------------
def train_ngram_model(train_ids, val_ids, id_to_token,
                      n=5, embd=32, hidden=128, batch_size=32,
                      lr=0.3, epochs=20, patience=3, top_k=3):
    """Train a NeuralNGramHard model with early stopping and top-k checkpoints.

    Each epoch runs `model.train_batch` over shuffled mini-batches, then
    computes the validation loss with `model.eval_loss`. Whenever the
    validation loss improves on the best seen so far, a checkpoint is written
    under "checkpoints/". Only the `top_k` most recent improving checkpoints
    are kept on disk; since a checkpoint is written only on improvement, those
    are also the `top_k` best by validation loss.

    Args:
        train_ids: Token ids of the training split.
        val_ids: Token ids of the validation split.
        id_to_token: Vocabulary table; only its length is used (vocab size).
        n: n-gram order.
        embd: Embedding dimension passed to the model.
        hidden: Hidden-layer width passed to the model.
        batch_size: Mini-batch size.
        lr: Learning rate passed to `model.train_batch`.
        epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs before stopping.
        top_k: Number of checkpoint files to keep on disk.

    Returns:
        The model in its final trained state (the last epoch run), which is
        not necessarily the state stored in the best checkpoint.
    """
    model = NeuralNGramHard(vocab_size=len(id_to_token), n=n,
                            embd=embd, hidden=hidden)

    best_val = float("inf")
    patience_counter = 0
    saved_ckpts = deque(maxlen=top_k)

    for epoch in range(1, epochs+1):
        losses = []
        for X, Y in make_ngram_batches(train_ids, n, batch_size):
            loss = model.train_batch(X, Y, lr=lr)
            losses.append(loss)
        # np.mean([]) warns and returns nan; make that case explicit.
        train_loss = np.mean(losses) if losses else float("nan")
        val_loss = model.eval_loss(np.array(val_ids), n=n)

        print(f"Epoch {epoch}: train={train_loss:.4f}, val={val_loss:.4f}")

        if val_loss < best_val:
            best_val = val_loss
            patience_counter = 0
            ckpt_path = f"checkpoints/epoch{epoch}_valloss{val_loss:.4f}.npz"
            os.makedirs("checkpoints", exist_ok=True)
            model.save(ckpt_path)

            # A bounded deque only forgets the oldest path; the file itself
            # must be deleted explicitly or checkpoints accumulate on disk.
            if saved_ckpts and len(saved_ckpts) == saved_ckpts.maxlen:
                stale_path = saved_ckpts.popleft()
                try:
                    os.remove(stale_path)
                except FileNotFoundError:
                    # Already gone (or saved under a different name); nothing to clean up.
                    pass
            saved_ckpts.append(ckpt_path)
            print(f"  [Saved checkpoint: {ckpt_path}]")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    return model

# ---------------------- Perplexity ----------------------
def perplexity(model, ids, n):
    """Return exp of the loss that `model.eval_loss` reports for `ids`.

    Args:
        model: Object exposing `eval_loss(array, n=...)`.
        ids: Token ids; converted to a NumPy array before evaluation.
        n: n-gram order forwarded to `eval_loss`.
    """
    token_array = np.array(ids)
    mean_loss = model.eval_loss(token_array, n=n)
    return np.exp(mean_loss)

# ---------------------- Run ----------------------
# Script entry point: load the saved BPE tokenizer, encode the three corpus
# splits, train the neural n-gram, and print the test-set perplexity.
if __name__ == "__main__":
    # Load tokenizer
    tok = BPETokenizer.load_json("artifacts/bpe_tokenizer.json")

    # Tokenize datasets
    train_ids = tokenize_with_bpe("Corpus/shakespeare_clean_train.txt", tok)
    val_ids   = tokenize_with_bpe("Corpus/shakespeare_clean_valid.txt", tok)
    test_ids  = tokenize_with_bpe("Corpus/shakespeare_clean_test.txt", tok)

    # Train neural n-gram
    # NOTE(review): train_ngram_model declares `id_to_token` as a required
    # positional parameter (it is used only via len() to size the vocabulary),
    # but this call does not pass it, so the call raises TypeError as written.
    # TODO: pass the tokenizer's id->token table here.
    model = train_ngram_model(train_ids, val_ids, n=5, embd=32, hidden=128)

    # Evaluate
    # Uses the same n=5 as the training call above.
    ppl = perplexity(model, test_ids, n=5)
    print(f"Test Perplexity = {ppl:.2f}")



TypeError: train_ngram_model() missing 3 required positional arguments: 'merges', 'id_to_token', and 'token_to_id'

In [2]:
from src.bpe import BPETokenizer

# Train
# Fits a new BPETokenizer on the training split and writes it to the JSON
# path that the training cell reads back with BPETokenizer.load_json.
tok = BPETokenizer()
tok.fit("Corpus/shakespeare_clean_train.txt", k=200)  # k = number of merges
# NOTE(review): presumably "artifacts/" must already exist — confirm whether
# save_json creates the directory.
tok.save_json("artifacts/bpe_tokenizer.json")


In [None]:
"""
Hardcoded Neural n-gram trainer (NumPy)
- No PyTorch; all model math + gradients in NumPy.
- Saves CSV, PNG, checkpoints (.npz), samples.
- Early stopping with patience; keeps top-k checkpoints by val loss.
Usage (terminal):
    python hardcode_neural_ngram.py --k 1000 --n 4 --embd 64 --hidden 128 --batch_size 512 --max_epochs 30
In Jupyter the CLI is safe.
"""

import os, sys, json, time, random, math, argparse, csv
from dataclasses import dataclass, asdict
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Optional

import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# -------------------------- Tokenizer / BPE helpers -------------------------
# Marker appended to every word's symbol sequence before merges are applied.
WORD_END = "</w>"
# Token appended to the end of each tokenized line.
EOS = "<eos>"
# Beginning-of-sequence token; not referenced by the helpers visible in this section.
BOS = "<bos>"
# Compiled pattern matching runs of whitespace; used to split text into words.
_wsre = __import__("re").compile(r"\s+")

def find_merges_file(k: int, verbose: bool = True):
    """Locate the BPE merges file for `k` merges under "Generated_tokens".

    Several known filename patterns are probed in a fixed priority order and
    the first path that exists is returned.

    Args:
        k: Number of merges encoded in the filename.
        verbose: When true, print the path that was found.

    Returns:
        The first existing candidate path.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    filenames = (
        f"bpe_merges with k = {k}.txt",
        f"standard_bpe_merges_k{k}.txt",
        f"aggressive_clean_bpe_merges_k{k}.txt",
        f"bpe_merges_k{k}.txt",
        f"bpe_merges_k{k}_webtext_clean.txt",
    )
    search_paths = [os.path.join("Generated_tokens", fname) for fname in filenames]

    # First existing path wins; None signals that nothing matched.
    match = next((path for path in search_paths if os.path.exists(path)), None)
    if match is None:
        raise FileNotFoundError(f"No merges file found for k={k}. Tried: {search_paths}")

    if verbose:
        print("[Found merges]", match)
    return match

def load_merges(path: str) -> List[Tuple[str, str]]:
    """Read BPE merge pairs from a whitespace-separated text file.

    Each line with at least two whitespace-separated fields contributes the
    pair (first field, second field); extra fields are ignored and shorter
    lines are skipped.

    Args:
        path: UTF-8 text file with one merge per line.

    Returns:
        The merge pairs in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        rows = (raw.split() for raw in handle)
        return [(fields[0], fields[1]) for fields in rows if len(fields) >= 2]

def words_from_text(text: str, lowercase: bool = True) -> List[str]:
    """Split `text` on whitespace runs into a list of non-empty words.

    Args:
        text: Input string.
        lowercase: When true, lower-case the text before splitting.

    Returns:
        The words in order; empty pieces are dropped.
    """
    normalized = text.lower() if lowercase else text
    pieces = _wsre.split(normalized.strip())
    # Splitting an empty string yields [""]; filter out such empty pieces.
    return list(filter(None, pieces))

def apply_merges_to_word(word: str, merges: List[Tuple[str, str]]) -> List[str]:
    """Segment one word by applying the BPE merges in order.

    The word starts as its characters followed by WORD_END. For each merge
    (a, b), one left-to-right pass fuses every non-overlapping adjacent
    occurrence of a, b into the single symbol a + b.

    Args:
        word: The word to segment.
        merges: Ordered merge pairs.

    Returns:
        The symbols remaining after all merges were applied.
    """
    pieces = [*word, WORD_END]
    for left, right in merges:
        fused = left + right
        merged = []
        skip_next = False
        for pos, symbol in enumerate(pieces):
            if skip_next:
                # This symbol was consumed as the right half of the previous pair.
                skip_next = False
                continue
            pair_here = (
                symbol == left
                and pos + 1 < len(pieces)
                and pieces[pos + 1] == right
            )
            if pair_here:
                merged.append(fused)
                skip_next = True
            else:
                merged.append(symbol)
        pieces = merged
    return pieces

def tokenize_lines_with_merges(text: str, merges: List[Tuple[str, str]]) -> List[List[str]]:
    """Tokenize `text` line by line into BPE symbols, ending each line with EOS.

    Lines that contain no words are skipped entirely (no EOS is emitted for
    them).

    Args:
        text: Raw multi-line text.
        merges: Ordered BPE merge pairs applied to every word.

    Returns:
        One token list per non-empty line, each terminated by EOS.
    """
    # A word's segmentation depends only on the word and the merge list, so
    # each distinct word is segmented once per call instead of re-running the
    # whole merge list for every occurrence.
    segmented: Dict[str, List[str]] = {}
    token_lines = []
    for line in text.strip().splitlines():
        words = words_from_text(line)
        if not words:
            continue
        toks = []
        for w in words:
            pieces = segmented.get(w)
            if pieces is None:
                pieces = apply_merges_to_word(w, merges)
                segmented[w] = pieces
            # extend() copies the elements, so the cached list is never aliased.
            toks.extend(pieces)
        toks.append(EOS)
        token_lines.append(toks)
    return token_lines

def build_vocab_from_texts(texts: Dict[str, str], merges: List[Tuple[str, str]], extras: List[str]=None):
    """Build a sorted vocabulary from the tokens found in several texts.

    Args:
        texts: Mapping whose values are the raw texts to scan (keys are unused).
        merges: Ordered BPE merge pairs used for tokenization.
        extras: Optional additional tokens to force into the vocabulary.

    Returns:
        (id_to_token, token_to_id): the sorted token list and the inverse
        mapping from token to its index in that list.
    """
    seen = set()
    for body in texts.values():
        for line_tokens in tokenize_lines_with_merges(body, merges):
            seen.update(line_tokens)
    if extras:
        seen.update(extras)

    ordered = sorted(seen)
    index_of = dict(zip(ordered, range(len(ordered))))
    return ordered, index_of

def encode_lines_to_ids(text: str, merges: List[Tuple[str, str]], token_to_id: Dict[str,int]) -> List[int]:
    """Tokenize `text` and map every known token to its id, as one flat list.

    Tokens that are not keys of `token_to_id` are skipped silently.

    Args:
        text: Raw multi-line text.
        merges: Ordered BPE merge pairs used for tokenization.
        token_to_id: Vocabulary lookup.

    Returns:
        The ids of all in-vocabulary tokens, in reading order.
    """
    return [
        token_to_id[token]
        for line_tokens in tokenize_lines_with_merges(text, merges)
        for token in line_tokens
        if token in token_to_id
    ]

# --------------------------- NumPy model utilities --------------------------
def one_hot(idx: np.ndarray, dim: int):
    """Return a float32 one-hot matrix with one row per entry of `idx`.

    Args:
        idx: Integer array whose first axis has length B and which flattens
            to B indices, e.g. shape (B,) or (B, 1).
        dim: Width of each one-hot row.

    Returns:
        Array of shape (B, dim) with a single 1.0 per row.
    """
    num_rows = idx.shape[0]
    encoded = np.zeros((num_rows, dim), dtype=np.float32)
    row_positions = np.arange(num_rows)
    column_positions = idx.reshape(-1)
    encoded[row_positions, column_positions] = 1.0
    return encoded

# --------------------------- Neural n-gram model ---------------------------
@dataclass
class HardModelParams:
    """Container for model weight matrices; simple to save/load via np.savez.

    NOTE(review): no fields are declared on this dataclass, so instances
    currently carry no data.
    """
    # Will store as dict of arrays
    # (intended layout only; nothing in this class implements it yet)

class NeuralNGramHard:
    """
    Neural n-gram implemented in numpy.
    Architecture:
      - Embedding matrix: (V, embd)
      - Linear1: ((n-1)*embd, hidden)
      - ReLU
      - Linear2: (hidden, vocab) -> logits
    """
    def __init__(self, vocab_size:int, n:int, embd:int, hidden:int, seed:int=1337):
        assert n >= 2
        self.vocab_size = vocab_size
        self.n = n
        self.embd = embd
        self.hidden = hidden
        rng = np.random.RandomState(seed)

        # params
        self.E = rng.normal(0, 0.02, size=(vocab_size, embd)).astype(np.float32)  # embeddings
        self.W1 = rng.normal(0, 0.02, size=((n-1)*embd, hidden)).astype(np.float32)
        self.b1 = np.zeros((hidden,), dtype=np.float32)
        self.W2 = rng.no


In [None]:
def top_k_sample(probs, k):
    """
    Top-k sampling from a probability distribution.

    The k largest entries of `probs` are kept, renormalized to sum to 1, and
    one index is drawn from them with the global NumPy RNG.

    Args:
        probs: 1D array-like of probabilities for each word in the vocabulary.
        k: number of top words to consider; values larger than the vocabulary
           size consider every word.
    Returns:
        index of the sampled word.
    Raises:
        ValueError: if k is not positive, if probs is not a non-empty 1D
            sequence, or if the top-k probabilities do not have a positive,
            finite sum.
    """
    if k <= 0:
        raise ValueError("k must be positive")

    # Accept lists/tuples as well as arrays; float64 keeps the renormalized
    # probabilities accurate enough for np.random.choice's sum-to-one check.
    probs = np.asarray(probs, dtype=float)
    if probs.ndim != 1 or probs.size == 0:
        raise ValueError("probs must be a non-empty 1D array")

    # Get indices of top k probabilities (highest first)
    top_k_indices = probs.argsort()[-k:][::-1]

    # Select top k probabilities and renormalize
    top_k_probs = probs[top_k_indices]
    total = top_k_probs.sum()
    if not np.isfinite(total) or total <= 0:
        # Dividing by zero would hand NaNs to np.random.choice.
        raise ValueError("top-k probabilities must have a positive, finite sum")
    top_k_probs = top_k_probs / total

    # Sample from the top k
    sampled_idx = np.random.choice(top_k_indices, p=top_k_probs)
    return sampled_idx

### Temperature sampling

### Softer Version

* Using PyTorch
* Implement early stopping (when validation error/loss diverges from training
error to avoid overfitting to training set) with patience
    * Do not have to optimise patience, but can
    * Save the top k model checkpoints (as many as fit reasonably on disk;
each file can be named after its validation score and iteration)
* Tune hyperparameters on the validation set, using a separate grid search for each
one (the order is important: number of merges, learning rate, interpolation
weights) – not all of this is required to pass, but it is required for a 1.0
    * vocabulary size – gridsearch for max. 10 different amounts of merges
    * learning rate of optimiser
    * interpolation
* Try versions with different optimisers