## Task 3:

Implement neural embeddings – either the hardcoded version (NumPy only) or the softer version using PyTorch.

Watch RAM during training, especially for higher batch sizes (>=32)

### Hardcode
Hardcode version:
* No PyTorch and no ML libraries – only NumPy (at least for the neural embeddings;
PyTorch etc. may be used for the GPT implementation)
    * `Counter` and `defaultdict` from `collections` may be used
    * The optimiser may be hardcoded, but does not have to be (Adam is allowed;
at least SGD is required)
* Measure perplexity
* Implement early stopping with patience (stop when the validation error/loss diverges
from the training error, to avoid overfitting to the training set)
    * Patience does not need to be optimised, but it can be
    * Save the top k model checkpoints (as many as reasonably fit on your disk);
each file can be named after its validation score and iteration

In [None]:
import os, numpy as np, time
from collections import deque
from src.bpe import BPETokenizer
from src.neural_ngram import NeuralNGramHard
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
# Use a safe built-in font: select the sans-serif family for all figure text
# and list the candidate fonts registered for that family.
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = ['Arial', 'Liberation Sans', 'Tahoma']

# ---------------------- Dataset utilities ----------------------
def tokenize_with_bpe(path, tokenizer: BPETokenizer):
    """Read a UTF-8 text file and encode its full contents with ``tokenizer``.

    Args:
        path: Location of the text file to read.
        tokenizer: Object whose ``encode_text`` method is applied to the text.

    Returns:
        Whatever ``tokenizer.encode_text`` returns for the file's contents.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    token_ids = tokenizer.encode_text(raw_text)
    return token_ids


def make_ngram_batches(ids, n, batch_size):
    """Yield shuffled mini-batches of n-gram contexts and targets.

    Every window of ``n`` consecutive ids is one example: the first ``n - 1``
    ids are the context row and the last id is the target. Example order is
    reshuffled (via ``np.random.shuffle``) each time the generator is started.

    Args:
        ids: One-dimensional sequence of token ids (list or array).
        n: Window length; each context has ``n - 1`` ids.
        batch_size: Maximum number of examples per batch. The final batch may
            be smaller.

    Yields:
        Tuples ``(X, Y)`` where ``X`` has shape ``(B, n - 1)`` and ``Y`` has
        shape ``(B,)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Convert once so every batch is gathered with array indexing instead of
    # slicing the sequence example by example in Python.
    arr = np.asarray(ids)
    num_examples = len(arr) - (n - 1)
    if num_examples <= 0:
        # Sequence is shorter than one full window: nothing to yield.
        return

    order = np.arange(num_examples)
    np.random.shuffle(order)
    offsets = np.arange(n - 1)
    for start in range(0, num_examples, batch_size):
        batch = order[start:start + batch_size]
        # Broadcasting (B, 1) + (n - 1,) gives the index of every context id.
        X = arr[batch[:, None] + offsets]
        Y = arr[batch + (n - 1)]
        yield X, Y

# ---------------------- Training loop ----------------------


def train_ngram_model(train_ids, val_ids, merges, id_to_token, token_to_id,
                      n=5, embd=32, hidden=128, batch_size=32,
                      lr=0.3, epochs=20, patience=3, top_k=3,
                      history_csv="ngram_history.csv"):
    """Train a ``NeuralNGramHard`` model with early stopping and checkpointing.

    Each epoch runs ``model.train_batch`` over shuffled n-gram mini-batches of
    ``train_ids``, evaluates ``model.eval_loss`` on ``val_ids``, and rewrites
    ``history_csv`` with the losses recorded so far. A checkpoint is written
    to ``checkpoints/`` whenever the validation loss improves; at most
    ``top_k`` checkpoint files are kept on disk. Training stops early after
    ``patience`` consecutive epochs without improvement.

    Args:
        train_ids: Token ids used for training.
        val_ids: Token ids used for the per-epoch validation loss.
        merges: Not used by this function; kept for caller compatibility.
        id_to_token: Vocabulary mapping; only its length is used, as the
            model's ``vocab_size``.
        token_to_id: Not used by this function; kept for caller compatibility.
        n: N-gram order passed to the model and the batch generator.
        embd: Embedding size passed to the model.
        hidden: Hidden size passed to the model.
        batch_size: Number of examples per training batch.
        lr: Learning rate passed to ``model.train_batch``.
        epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping.
        top_k: Maximum number of checkpoint files kept. ``None`` keeps every
            checkpoint; ``0`` disables pruning.
        history_csv: Path of the CSV file the loss history is written to.

    Returns:
        ``(model, history)``. ``model`` is in its state after the last epoch
        that ran, which is not necessarily the best checkpoint. ``history``
        is a list of dicts with keys ``epoch``, ``train_loss``, ``val_loss``.
    """
    model = NeuralNGramHard(vocab_size=len(id_to_token), n=n,
                            embd=embd, hidden=hidden)

    best_val = float("inf")
    patience_counter = 0
    saved_ckpts = deque(maxlen=top_k)

    history = []

    # The validation ids do not change between epochs; convert them once.
    val_array = np.array(val_ids)

    for epoch in range(1, epochs+1):
        losses = []
        for X, Y in make_ngram_batches(train_ids, n, batch_size):
            loss = model.train_batch(X, Y, lr=lr)
            losses.append(loss)
        # np.mean([]) warns and returns nan; report nan directly when the
        # training data produced no batch at all.
        train_loss = float(np.mean(losses)) if losses else float("nan")
        val_loss = model.eval_loss(val_array, n=n)

        print(f"Epoch {epoch}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")

        history.append({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss})

        # Save history to CSV every epoch so an interrupted run keeps its log
        pd.DataFrame(history).to_csv(history_csv, index=False)

        if val_loss < best_val:
            best_val = val_loss
            patience_counter = 0
            ckpt_path = f"checkpoints/epoch{epoch}_valloss{val_loss:.4f}.npz"
            os.makedirs("checkpoints", exist_ok=True)
            model.save(ckpt_path)

            # A bounded deque silently forgets its oldest entry when full,
            # which would leave that file on disk forever. Checkpoints are
            # only written on improvement, so the oldest tracked one is also
            # the worst: delete it before tracking the new one.
            if saved_ckpts.maxlen and len(saved_ckpts) == saved_ckpts.maxlen:
                stale_path = saved_ckpts.popleft()
                try:
                    os.remove(stale_path)
                    print(f"  [Removed old checkpoint: {stale_path}]")
                except FileNotFoundError:
                    # Already gone (e.g. removed manually, or model.save wrote
                    # under a different name); nothing left to clean up.
                    pass

            saved_ckpts.append(ckpt_path)
            print(f"  [Saved checkpoint: {ckpt_path}]")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    return model, history

# ---------------- Plot from CSV ----------------

def plot_losses_from_csv(csv_path, save_path="ngram_losses.png"):
    """Plot the loss curves stored in a history CSV and save the figure.

    Args:
        csv_path: CSV file with an ``epoch`` column and, optionally,
            ``train_loss`` and/or ``val_loss`` columns.
        save_path: File the figure is written to.
    """
    history = pd.read_csv(csv_path)

    plt.figure(figsize=(10, 5))
    # Draw one line per loss column that is actually present in the file.
    for column in ("train_loss", "val_loss"):
        if column not in history.columns:
            continue
        plt.plot(history["epoch"], history[column], label=column)

    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss")
    plt.legend()
    plt.grid(True)

    plt.savefig(save_path)
    plt.close()
    print(f"Plot saved to {save_path}")



# ---------------------- Perplexity ----------------------
def perplexity(model, ids, n):
    """Return ``exp`` of the loss ``model.eval_loss`` reports for ``ids``.

    Args:
        model: Object exposing ``eval_loss(array, n=...)``.
        ids: Token ids to evaluate; converted to a NumPy array first.
        n: N-gram order forwarded to ``eval_loss``.
    """
    token_array = np.array(ids)
    loss_value = model.eval_loss(token_array, n=n)
    return np.exp(loss_value)

# ---------------------- Run ----------------------
if __name__ == "__main__":
    # Restore the BPE tokenizer from its saved JSON artifact
    tok = BPETokenizer.load_json("artifacts/bpe_tokenizer.json")

    # Encode the train / validation / test splits into token ids
    train_ids = tokenize_with_bpe("Corpus/shakespeare_clean_train.txt", tok)
    val_ids = tokenize_with_bpe("Corpus/shakespeare_clean_valid.txt", tok)
    test_ids = tokenize_with_bpe("Corpus/shakespeare_clean_test.txt", tok)

    # Train the model; the per-epoch losses come back in `history`
    model, history = train_ngram_model(
        train_ids,
        val_ids,
        merges=tok.merges,
        id_to_token=tok.id_to_token,
        token_to_id=tok.token_to_id,
        n=5,
        embd=32,
    )

    # Report perplexity on the test split
    ppl = perplexity(model, test_ids, n=5)
    print(f"Test Perplexity = {ppl:.2f}")
    # Loss curves are plotted separately with plot_losses_from_csv

    

    

'# ---------------------- Run ----------------------\nif __name__ == "__main__":\n    # Load tokenizer\n    tok = BPETokenizer.load_json("artifacts/bpe_tokenizer.json")\n\n    # Tokenize datasets\n    train_ids = tokenize_with_bpe("Corpus/shakespeare_clean_train.txt", tok)\n    val_ids   = tokenize_with_bpe("Corpus/shakespeare_clean_valid.txt", tok)\n    test_ids  = tokenize_with_bpe("Corpus/shakespeare_clean_test.txt", tok)\n\n    # Train model and collect losses\n    model, history = train_ngram_model(train_ids, val_ids, tok.merges, tok.id_to_token, tok.token_to_id, n=5, embd=32)\n    # Evaluate\n    ppl = perplexity(model, test_ids, n=5)\n    print(f"Test Perplexity = {ppl:.2f}")\n    # Plot losses'

In [13]:
# Draw the train/validation loss curves from the history CSV written during training
plot_losses_from_csv("ngram_history.csv")


Plot saved to ngram_losses.png


In [8]:
def generate_text(model, tokenizer, start_words: list, n=5, max_tokens=100,
                  top_k=None, temperature=1.0, pad_id=0):
    """Generate text using the trained NeuralNGramHard model and BPE tokenizer.

    The start words are encoded to token ids, then ``model.generate_next`` is
    called repeatedly on the last ``n - 1`` ids until ``max_tokens`` ids have
    been produced or the ``"<eos>"`` token is generated.

    Args:
        model: Object exposing ``generate_next(context, top_k=, temperature=)``.
        tokenizer: Object exposing ``encode_words``, ``decode_tokens``,
            ``token_to_id`` and ``id_to_token``.
        start_words: Words used to seed the context. Tokens missing from
            ``tokenizer.token_to_id`` are skipped.
        n: N-gram order; the model is given ``n - 1`` ids of context.
        max_tokens: Maximum number of ids to generate.
        top_k: Forwarded unchanged to ``model.generate_next``.
        temperature: Forwarded unchanged to ``model.generate_next``.
        pad_id: Id used to left-pad the context while fewer than ``n - 1`` ids
            are available. Defaults to 0, the value previously hard-coded.

    Returns:
        The decoded words joined by single spaces. The start words and a
        generated ``"<eos>"`` token, if any, are included.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    window = n - 1

    # Encode start words to BPE tokens, then map the known ones to ids
    start_tokens = tokenizer.encode_words(start_words)
    vocab = tokenizer.token_to_id
    generated_ids = [vocab[t] for t in start_tokens if t in vocab]

    for _ in range(max_tokens):
        missing = window - len(generated_ids)
        if missing > 0:
            context = [pad_id] * missing + generated_ids
        else:
            # Slice from an explicit start index: generated_ids[-window:]
            # would return the whole history when window == 0 (n == 1).
            context = generated_ids[len(generated_ids) - window:]

        next_id = model.generate_next(context, top_k=top_k, temperature=temperature)
        generated_ids.append(next_id)

        # Stop if <eos> token generated
        if tokenizer.id_to_token[next_id] == "<eos>":
            break

    # Decode token IDs to BPE tokens, then to words
    decoded_words = tokenizer.decode_tokens([tokenizer.id_to_token[i] for i in generated_ids])
    return " ".join(decoded_words)


In [9]:
# Seed generation with the words "to be" and generate up to 50 tokens,
# passing top_k=10 and temperature=0.8 through to model.generate_next
start_context = ["to", "be"]
text = generate_text(model, tok, start_context, n=5, max_tokens=50, top_k=10, temperature=0.8)
print(text)


to be of the charmian the certain of a bassander of his sopt and crownd plead to the crow and the chaste
