# Language models
- The goal is to predict the next token given the preceding context.

## Evaluation Metrics

### Negative Log-Likelihood (NLL)  
$\text{NLL} = -\sum_{i=1}^N \log P(x_i \mid x_{<i})$  
- Directly tied to the probability model.  
- Standard metric in language modeling.  
- Harder to interpret (no direct intuitive scale).  

### Perplexity (PP)  
$\text{PP} = \exp\!\Bigl(-\frac{1}{N} \sum_{i=1}^N \log P(x_i \mid x_{<i})\Bigr)$  
- Easy to interpret as an “effective branching factor.”  
- Most common in language modeling.  
- Exponential in the average NLL, so equal NLL gains show up as very different perplexity changes depending on the baseline.  
- Does not show which errors the model makes.  

### Cross-Entropy (CE)  
$ H(P, Q) = -\sum_{x} P(x)\,\log Q(x) $  
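- Equals the average per-token NLL when $P$ is the empirical data distribution and $Q$ is the model; reported in bits (log base 2) or nats (natural log).  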
- Has a clear information-theoretic interpretation.  
- Lacks a simple “accuracy”-like interpretation.  

### Accuracy  
$ \text{Accuracy} = \frac{\# \text{correct predictions}}{\# \text{total predictions}} $  
- Simple and intuitive.  
- Not very informative in language modeling; ignores probability distributions over all possible tokens.  

## Character-Level Count-Based Models

In [14]:
import nltk
from nltk.corpus import gutenberg
import math
from collections import Counter
from tabulate import tabulate

# Ensure you've downloaded NLTK's Gutenberg corpus once:
# nltk.download('gutenberg')

# 1) Corpus: the opening of Shakespeare's "Macbeth" from NLTK's Gutenberg
#    collection, capped at 15,000 characters so the demonstration stays small.
raw_text = gutenberg.raw('shakespeare-macbeth.txt')[:15000]

# Positional 80/20 split: the leading part is used for training and the
# trailing part is held out for testing.
split_idx = int(len(raw_text) * 0.8)
full_train_text, test_text = raw_text[:split_idx], raw_text[split_idx:]

def build_bigram_model(text):
    """Estimate maximum-likelihood character-bigram probabilities from `text`.

    Args:
        text: Training string. A string shorter than two characters contains
            no bigram and yields an empty probability table.

    Returns:
        A tuple ``(bigram_prob, char_counts)``:
          - ``bigram_prob`` maps every two-character string ``c + nxt`` seen
            in `text` to the estimate of ``P(nxt | c)``.
          - ``char_counts`` is a ``Counter`` over every character of `text`.
    """
    char_counts = Counter(text)
    bigram_counts = Counter(a + b for a, b in zip(text, text[1:]))

    # Normalise by how often each character occurs *as a context*, i.e. with a
    # successor. The last character of `text` starts no bigram, so dividing by
    # its raw count would make that context's probabilities sum to less than 1.
    context_counts = Counter(text[:-1])

    bigram_prob = {
        bg: cnt / context_counts[bg[0]] for bg, cnt in bigram_counts.items()
    }
    return bigram_prob, char_counts

def evaluate(text, bigram_prob, char_counts):
    """Score a character-bigram model on `text`.

    Every adjacent character pair in `text` counts as one prediction.

    Args:
        text: String to score.
        bigram_prob: Mapping from two-character strings ``c + nxt`` to the
            model probability of ``nxt`` following ``c``.
        char_counts: Per-character counts of the training text; only its size
            is used, as the vocabulary size for the unseen-bigram fallback.

    Returns:
        ``(avg_nll_bits, perplexity, accuracy)``: the mean negative
        log2-probability per bigram, 2 raised to that mean, and the fraction
        of bigrams whose second character equals the model's most probable
        successor of the first. Returns ``(inf, inf, 0.0)`` when `text`
        contains no bigram.
    """
    # Greedy prediction per context. The strict comparison keeps the
    # first-encountered bigram when several share the top probability.
    argmax_next = {}
    for pair, pair_prob in bigram_prob.items():
        ctx, successor = pair[0], pair[1]
        incumbent = argmax_next.get(ctx)
        if incumbent is None or pair_prob > incumbent[1]:
            argmax_next[ctx] = (successor, pair_prob)

    # Bigrams absent from the model get a uniform 1/|vocab| probability.
    # NOTE: seen probabilities are not discounted to make room for this mass.
    unseen_prob = 1.0 / max(len(char_counts), 1)

    nll_sum = 0.0
    hits = 0
    n_pairs = 0
    for ctx, actual in zip(text, text[1:]):
        prob = bigram_prob.get(ctx + actual, unseen_prob)
        nll_sum += -math.log2(prob)
        n_pairs += 1

        guess = argmax_next.get(ctx)
        if guess is not None and guess[0] == actual:
            hits += 1

    if n_pairs == 0:
        return float('inf'), float('inf'), 0.0

    mean_bits = nll_sum / n_pairs
    return mean_bits, 2 ** mean_bits, hits / n_pairs

def show_next_char_probs(context_char, bigram_prob, top_n=5):
    """Print the `top_n` most probable successors of `context_char`.

    Args:
        context_char: Single-character context to inspect.
        bigram_prob: Mapping from two-character strings ``c + nxt`` to the
            probability of ``nxt`` following ``c``.
        top_n: Maximum number of successors to list.

    Returns:
        None. The ranking (or a "nothing learned" notice) is written to stdout.
    """
    # Successors of the context, most probable first (ties keep dict order).
    candidates = sorted(
        ((bg[1], p) for bg, p in bigram_prob.items() if bg[0] == context_char),
        key=lambda item: item[1],
        reverse=True,
    )

    # repr() already supplies the quotes and escapes whitespace/control
    # characters; wrapping it in extra quotes printed e.g. ''T''.
    shown_context = repr(context_char)

    if not candidates:
        print(f"No learned next chars for context {shown_context}")
        return

    print(f"Context: {shown_context} => Next Char Probabilities (top {top_n}):")
    for rank, (ch, prob) in enumerate(candidates[:top_n], 1):
        # repr() keeps every invisible character (tab, carriage return, ...)
        # readable, not only newline and space.
        print(f"  {rank:2d}. {ch!r} -> {prob:.4f}")
    print()

# 2) Sweep over training-prefix lengths; the last entry uses the whole training text
train_sizes = [100, 1000, 5000, len(full_train_text)]
rows = []
models_stored = {}  # prefix length -> (bigram_prob, char_counts), kept for later inspection

for N in train_sizes:
    partial_train_text = full_train_text[:N]

    # Fit on the first N training characters and remember the fitted model
    bigram_prob, char_counts = build_bigram_model(partial_train_text)
    models_stored[N] = (bigram_prob, char_counts)

    # Score the same model on the data it was fitted on and on the held-out text
    train_nll, train_pp, train_acc = evaluate(partial_train_text, bigram_prob, char_counts)
    test_nll, test_pp, test_acc = evaluate(test_text, bigram_prob, char_counts)

    rows += [
        [f"N={N} (Train)", *(f"{value:.3f}" for value in (train_nll, train_pp, train_acc))],
        [f"N={N} (Test)", *(f"{value:.3f}" for value in (test_nll, test_pp, test_acc))],
    ]

# 3) Report the split sizes followed by the metrics table
print(f"Full Train Text Length: {len(full_train_text)}")
print(f"Test Text Length:       {len(test_text)}\n")

headers = ["Data", "Avg NLL (bits)", "Perplexity", "Accuracy"]
print(tabulate(rows, headers=headers, tablefmt="github"))

# 4) Inspect the model fitted on the full training text for a few contexts
bigram_prob_final = models_stored[len(full_train_text)][0]
print("\nExample Next-Character Probabilities (using largest N model):")
for ctx_char in ["T", " ", "\n", "."]:
    show_next_char_probs(ctx_char, bigram_prob_final, top_n=5)


Full Train Text Length: 12000
Test Text Length:       3000

| Data            |   Avg NLL (bits) |   Perplexity |   Accuracy |
|-----------------|------------------|--------------|------------|
| N=100 (Train)   |            1.667 |        3.176 |      0.475 |
| N=100 (Test)    |            4.085 |       16.967 |      0.145 |
| N=1000 (Train)  |            3.062 |        8.352 |      0.325 |
| N=1000 (Test)   |            3.648 |       12.533 |      0.256 |
| N=5000 (Train)  |            3.432 |       10.795 |      0.285 |
| N=5000 (Test)   |            3.571 |       11.888 |      0.265 |
| N=12000 (Train) |            3.459 |       10.998 |      0.285 |
| N=12000 (Test)  |            3.503 |       11.337 |      0.268 |

Example Next-Character Probabilities (using largest N model):
Context: ''T'' => Next Char Probabilities (top 5):
   1. 'h' -> 0.6869
   2. 'r' -> 0.1010
   3. 'i' -> 0.0909
   4. 'o' -> 0.0707
   5. 'e' -> 0.0303

Context: '' '' => Next Char Probabilities (top 5):
   1

## Char-Level RNN

In [26]:
import math
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers, models

import nltk
from nltk.corpus import gutenberg

# -----------------------
# 1) Corpus
# -----------------------
nltk.download('gutenberg', quiet=True)

# Keep only the first 50,000 characters of Macbeth so the demo trains quickly.
raw_text = gutenberg.raw('shakespeare-macbeth.txt')[:50000]

# Growing prefix lengths (in characters), one per training stage.
partial_sizes = [10_000, 20_000, 30_000, len(raw_text)]

# -----------------------
# 2) Hyperparameters
# -----------------------
BATCH_SIZE = 64
EPOCHS_PER_STAGE = 10  # epochs run at every stage
SEQ_LEN = 60           # characters of context fed to the model
EMBED_DIM = 64         # width of each character embedding
RNN_UNITS = 128        # dimensionality of the recurrent hidden state

# -----------------------
# 3) Build a RNN Model
# -----------------------
def build_char_model(vocab_size):
    """Build and compile the next-character prediction network.

    Architecture: ``SEQ_LEN`` integer character IDs -> ``Embedding(EMBED_DIM)``
    -> ``LSTM(RNN_UNITS)`` (final state only) -> softmax over the vocabulary.

    NOTE(review): this section is titled "RNN", yet the recurrent layer is an
    LSTM — confirm whether ``layers.SimpleRNN`` was intended here.

    Args:
        vocab_size: Number of distinct characters; sets both the embedding
            table size and the width of the output layer.

    Returns:
        A compiled Keras ``Sequential`` model trained with sparse categorical
        cross-entropy and reporting accuracy.
    """
    model = models.Sequential([
        # Declare the input shape with an explicit Input layer instead of the
        # deprecated `input_length` argument of Embedding.
        layers.Input(shape=(SEQ_LEN,)),
        layers.Embedding(input_dim=vocab_size, output_dim=EMBED_DIM),
        # return_sequences=False: only the state after the last character is
        # passed on, since a single next character is predicted per window.
        layers.LSTM(RNN_UNITS, return_sequences=False),
        layers.Dense(vocab_size, activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        # Targets are integer class IDs, hence the sparse loss variant.
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# -----------------------
# 4) Preprocessing
# -----------------------
def create_vocab(text):
    """Build the character vocabulary of `text`.

    Indices follow the sorted order of the distinct characters.

    Returns:
        ``(char2idx, idx2char)``: a character -> index mapping and its inverse.
    """
    idx2char = dict(enumerate(sorted(set(text))))
    char2idx = {symbol: index for index, symbol in idx2char.items()}
    return char2idx, idx2char

def build_dataset(text, seq_len, char2idx):
    """Turn `text` into sliding-window next-character training pairs.

    Sample ``i`` consists of ``X[i]``, the IDs of ``text[i : i + seq_len]``,
    and ``y[i]``, the ID of the character right after that window.

    Args:
        text: Source string.
        seq_len: Window length in characters (non-negative).
        char2idx: Mapping from character to integer ID.

    Returns:
        ``(X, y)`` integer arrays of shape ``(n, seq_len)`` and ``(n,)`` with
        ``n = max(len(text) - seq_len, 0)``. When `text` is too short for a
        single window the arrays are empty but keep those shapes.

    Raises:
        ValueError: If `seq_len` is negative.
        KeyError: If a character of `text` is missing from `char2idx`.
    """
    if seq_len < 0:
        raise ValueError(f"seq_len must be non-negative, got {seq_len}")

    n_samples = len(text) - seq_len
    if n_samples <= 0:
        # Keep rank and integer dtype so callers can slice, split and take
        # len() of the result without special-casing short inputs.
        return (np.empty((0, seq_len), dtype=np.int64),
                np.empty((0,), dtype=np.int64))

    # Encode every character once instead of once per window it appears in.
    ids = np.array([char2idx[ch] for ch in text], dtype=np.int64)

    # window_index[i, j] == i + j, so ids[window_index][i] is window i.
    window_index = np.arange(n_samples)[:, None] + np.arange(seq_len)[None, :]
    return ids[window_index], ids[seq_len:]

def sample_text(model, start_text, char2idx, idx2char, length=200):
    """
    Extend `start_text` by `length` characters sampled from `model`.

    At every step the last SEQ_LEN characters produced so far (left-padded
    with spaces when fewer are available) are encoded and passed to
    `model.predict`; the next character is drawn at random from the returned
    distribution via NumPy's global random state. Characters missing from
    `char2idx` are encoded as index 0.

    Returns the prompt followed by the generated characters.
    """
    generated = start_text
    for _step in range(length):
        # Fixed-width context: most recent characters, space-padded on the left
        window = generated[-SEQ_LEN:]
        if len(window) < SEQ_LEN:
            window = window.rjust(SEQ_LEN)

        # Batch of one encoded window, shape (1, SEQ_LEN)
        encoded = np.array([[char2idx.get(symbol, 0) for symbol in window]])

        # Distribution over the vocabulary for the next character
        probs = model.predict(encoded, verbose=0)[0]

        # Stochastic decoding: sample an index according to `probs`
        drawn = np.random.choice(np.arange(len(probs)), p=probs)
        generated += idx2char[drawn]
    return generated

# -----------------------
# Main Logic
# -----------------------
# We'll build one model and re-train it at each stage.
# Alternatively, you could build a new model from scratch at each stage, 
# but continuing training might show how performance changes as we add more data.
# One vocabulary (built from the whole text) and one model are shared by all
# stages, so every stage keeps training the weights left by the previous one.
char2idx_full, idx2char_full = create_vocab(raw_text)
vocab_size = len(char2idx_full)
model = build_char_model(vocab_size)

for stage_idx, size in enumerate(partial_sizes, start=1):
    # 1) Extract partial text [0 : size]
    partial_text = raw_text[:size]

    # 2) Build dataset
    #    Encoding uses the full-text vocabulary so character IDs (and the
    #    model's output layer) stay the same from stage to stage.
    X_data, y_data = build_dataset(partial_text, SEQ_LEN, char2idx_full)

    # 3) Train/Val split: first 90% of the windows train, last 10% validate.
    #    NOTE: windows are taken with stride 1, so windows next to the split
    #    point share characters across the two sets.
    split_i = int(0.9 * len(X_data))
    X_train, X_val = X_data[:split_i], X_data[split_i:]
    y_train, y_val = y_data[:split_i], y_data[split_i:]

    if len(X_train) == 0:
        print(f"\n[Stage {stage_idx}] Not enough data to train (size={size}). Skipping.\n")
        continue

    # 4) Train for some epochs
    print(f"\n[Stage {stage_idx}] Training on {len(X_data)} samples (size={size}).")
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=EPOCHS_PER_STAGE,
        batch_size=BATCH_SIZE,
        verbose=1
    )

    # 5) Evaluate perplexity on val set
    #    The loss is in nats; dividing by ln(2) converts it to bits, and
    #    perplexity is 2 raised to the bits-per-character value.
    val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    val_loss_bits = val_loss / math.log(2)
    val_pp = 2 ** val_loss_bits

    print(f"[Stage {stage_idx}] Val Loss (nats): {val_loss:.3f}")
    print(f"[Stage {stage_idx}] Val Loss (bits): {val_loss_bits:.3f}")
    print(f"[Stage {stage_idx}] Val Perplexity:  {val_pp:.3f}")
    print(f"[Stage {stage_idx}] Val Accuracy:    {val_acc:.3f}")

    # 6) Generate text from a prompt
    prompt = "MACBETH"
    gen = sample_text(model, prompt, char2idx_full, idx2char_full, length=200)
    print(f"\n[Stage {stage_idx}] Sample text (prompt: '{prompt}'):\n{gen}")
    print("-" * 80)



[Stage 1] Training on 9940 samples (size=10000).
Epoch 1/10
140/140 ━━━━━━━━━━━━━━━━━━━━ 6s 38ms/step - accuracy: 0.1517 - loss: 3.5174 - val_accuracy: 0.2002 - val_loss: 3.1286
Epoch 2/10
140/140 ━━━━━━━━━━━━━━━━━━━━ 5s 35ms/step - accuracy: 0.2227 - loss: 2.9777 - val_accuracy: 0.2535 - val_loss: 2.7969
Epoch 3/10
140/140 ━━━━━━━━━━━━━━━━━━━━ 5s 34ms/step - accuracy: 0.2660 - loss: 2.6602 - val_accuracy: 0.2636 - val_loss: 2.6455
Epoch 4/10
140/140 ━━━━━━━━━━━━━━━━━━━━ 5s 35ms/step - accuracy: 0.3024 - loss: 2.4930 - val_accuracy: 0.2847 - val_loss: 2.5592
Epoch 5/10
140/140 ━━━━━━━━━━━━━━━━━━━━ 5s 35ms/step - accuracy: 0.3202 - loss: 2.3642 - val_accuracy: 0.2988 - val_loss: 2.4705
Epoch 6/10
140/140 ━━━━━━━━━━━━━━━━━━━━ 5s 35ms/step - accuracy: 0.3378 - loss: 2.3050 - val_accuracy

## Char Level LSTM

In [2]:
import math
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers, models

import nltk
from nltk.corpus import gutenberg

# 1) Corpus: first 50,000 characters of Macbeth from NLTK's Gutenberg collection
nltk.download('gutenberg', quiet=True)
raw_text = gutenberg.raw('shakespeare-macbeth.txt')[:50000]

# 2) Hyperparameters
partial_sizes = [10_000, 20_000, 30_000, len(raw_text)]  # growing training prefixes, one per stage
BATCH_SIZE = 64
EPOCHS_PER_STAGE = 20  # epochs run at every stage
SEQ_LEN = 60           # characters of context fed to the model
EMBED_DIM = 64         # width of each character embedding
RNN_UNITS = 128        # dimensionality of the LSTM hidden state

# 3) Build a char-level LSTM model
def build_char_model(vocab_size):
    """Build and compile the character-level LSTM language model.

    Architecture: ``SEQ_LEN`` integer character IDs -> ``Embedding(EMBED_DIM)``
    -> ``LSTM(RNN_UNITS)`` (final state only) -> softmax over the vocabulary.

    Args:
        vocab_size: Number of distinct characters; sets both the embedding
            table size and the width of the output layer.

    Returns:
        A compiled Keras ``Sequential`` model trained with sparse categorical
        cross-entropy and reporting accuracy.
    """
    model = models.Sequential([
        # Declare the input shape with an explicit Input layer instead of the
        # deprecated `input_length` argument of Embedding.
        layers.Input(shape=(SEQ_LEN,)),
        layers.Embedding(input_dim=vocab_size, output_dim=EMBED_DIM),
        # Only the state after the last character is passed on: one next
        # character is predicted per window.
        layers.LSTM(RNN_UNITS, return_sequences=False),  # LSTM layer
        layers.Dense(vocab_size, activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        # Targets are integer class IDs, hence the sparse loss variant.
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# 4) Utility functions
def create_vocab(text):
    """Return ``(char2idx, idx2char)`` for the distinct characters of `text`.

    Characters are numbered 0..V-1 in sorted order; the two dictionaries are
    inverses of each other.
    """
    alphabet = sorted(set(text))
    positions = range(len(alphabet))
    char2idx = dict(zip(alphabet, positions))
    idx2char = dict(zip(positions, alphabet))
    return char2idx, idx2char

def build_dataset(text, seq_len, char2idx):
    """Turn `text` into sliding-window next-character training pairs.

    Sample ``i`` consists of ``X[i]``, the IDs of ``text[i : i + seq_len]``,
    and ``y[i]``, the ID of the character right after that window.

    Args:
        text: Source string.
        seq_len: Window length in characters (non-negative).
        char2idx: Mapping from character to integer ID.

    Returns:
        ``(X, y)`` integer arrays of shape ``(n, seq_len)`` and ``(n,)`` with
        ``n = max(len(text) - seq_len, 0)``. When `text` is too short for a
        single window the arrays are empty but keep those shapes.

    Raises:
        ValueError: If `seq_len` is negative.
        KeyError: If a character of `text` is missing from `char2idx`.
    """
    if seq_len < 0:
        raise ValueError(f"seq_len must be non-negative, got {seq_len}")

    n_samples = len(text) - seq_len
    if n_samples <= 0:
        # Keep rank and integer dtype so callers can slice, split and take
        # len() of the result without special-casing short inputs.
        return (np.empty((0, seq_len), dtype=np.int64),
                np.empty((0,), dtype=np.int64))

    # Encode every character once instead of once per window it appears in.
    ids = np.array([char2idx[ch] for ch in text], dtype=np.int64)

    # window_index[i, j] == i + j, so ids[window_index][i] is window i.
    window_index = np.arange(n_samples)[:, None] + np.arange(seq_len)[None, :]
    return ids[window_index], ids[seq_len:]

def sample_text(model, start_text, char2idx, idx2char, length=200):
    """Extend `start_text` by `length` characters sampled from `model`.

    Each step encodes the last SEQ_LEN characters produced so far (space-padded
    on the left when fewer exist, unknown characters encoded as index 0), asks
    `model.predict` for the next-character distribution and draws one index
    from it using NumPy's global random state.
    """
    text_out = start_text
    for _ in range(length):
        recent = text_out[-SEQ_LEN:]
        # Number of pad characters needed to reach the fixed input width
        shortfall = SEQ_LEN - len(recent)
        if shortfall > 0:
            recent = ' ' * shortfall + recent

        batch = np.array([[char2idx.get(ch, 0) for ch in recent]])
        distribution = model.predict(batch, verbose=0)[0]

        picked = np.random.choice(len(distribution), p=distribution)
        text_out += idx2char[picked]
    return text_out

# 5) Main logic
# Shared across stages: the vocabulary of the whole text and a single model
# whose weights carry over from one stage to the next.
char2idx_full, idx2char_full = create_vocab(raw_text)
vocab_size = len(char2idx_full)
model = build_char_model(vocab_size)

for stage_idx, size in enumerate(partial_sizes, start=1):
    # Windows over the first `size` characters, encoded with the full vocabulary
    partial_text = raw_text[:size]
    X_data, y_data = build_dataset(partial_text, SEQ_LEN, char2idx_full)

    # First 90% of the windows train, the remaining 10% validate
    split_i = int(0.9 * len(X_data))
    X_train, y_train = X_data[:split_i], y_data[:split_i]
    X_val, y_val = X_data[split_i:], y_data[split_i:]

    if not len(X_train):
        print(f"\n[Stage {stage_idx}] Not enough data for size={size}. Skipping.")
        continue

    print(f"\n[Stage {stage_idx}] Training on {len(X_data)} samples (size={size}).")
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=EPOCHS_PER_STAGE,
        batch_size=BATCH_SIZE,
        verbose=1,
    )

    # Validation loss is in nats; convert to bits, then perplexity = 2 ** bits
    val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    val_loss_bits = val_loss / math.log(2)
    val_pp = 2 ** val_loss_bits

    print(f"[Stage {stage_idx}] Val Loss (nats): {val_loss:.3f}")
    print(f"[Stage {stage_idx}] Val Loss (bits): {val_loss_bits:.3f}")
    print(f"[Stage {stage_idx}] Val Perplexity:  {val_pp:.3f}")
    print(f"[Stage {stage_idx}] Val Accuracy:    {val_acc:.3f}")

    # Qualitative check: let the current model continue a fixed prompt
    prompt = "MACBETH"
    gen = sample_text(model, prompt, char2idx_full, idx2char_full, length=200)
    print(f"\n[Stage {stage_idx}] Sample text (prompt: '{prompt}'):\n{gen}")
    print("-"*80)



[Stage 1] Training on 9940 samples (size=10000).
Epoch 1/20
140/140 ━━━━━━━━━━━━━━━━━━━━ 5s 32ms/step - accuracy: 0.1552 - loss: 3.5300 - val_accuracy: 0.1700 - val_loss: 3.1257
Epoch 2/20
140/140 ━━━━━━━━━━━━━━━━━━━━ 4s 30ms/step - accuracy: 0.2130 - loss: 3.0135 - val_accuracy: 0.2596 - val_loss: 2.7582
Epoch 3/20
140/140 ━━━━━━━━━━━━━━━━━━━━ 4s 31ms/step - accuracy: 0.2861 - loss: 2.6178 - val_accuracy: 0.2757 - val_loss: 2.5914
Epoch 4/20
140/140 ━━━━━━━━━━━━━━━━━━━━ 5s 33ms/step - accuracy: 0.3129 - loss: 2.4314 - val_accuracy: 0.2877 - val_loss: 2.5219
Epoch 5/20
140/140 ━━━━━━━━━━━━━━━━━━━━ 4s 31ms/step - accuracy: 0.3279 - loss: 2.3343 - val_accuracy: 0.2938 - val_loss: 2.4778
Epoch 6/20
140/140 ━━━━━━━━━━━━━━━━━━━━ 4s 32ms/step - accuracy: 0.3448 - loss: 2.2721 - val_accuracy