Shubham Kukadiya - 202411066

Q2) Build an n-gram-based neural language model to predict the next word in a sentence.

1. Tokenize a small text dataset and generate n-gram (n=3 or 4) context-target pairs.
2. Use embedding layer + MLP with one hidden layer (ReLU activation).
3. Train the model with cross-entropy loss and SGD optimizer.
4. Track and print loss after each epoch to verify backpropagation and gradient descent are working.
5. Evaluate the model on a few custom test sentences.

In [7]:
import re
import random
from collections import Counter
from typing import List, Tuple
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

TXT_PATH = "/content/shakespeare.txt"
N = 4
EMB_DIM = 128
HIDDEN = 256
EPOCHS = 8
BATCH_SIZE = 256
LR = 0.5
MIN_FREQ = 1
TOPK = 5
SEED = 1337
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7dcb73fe0c10>

In [8]:

# Tokenization
WORD_RE = re.compile(r"[A-Za-z']+|[.,;:?!\-()\[\]\"“”’]")

def tokenize_line(line: str) -> List[str]:

    toks = WORD_RE.findall(line.lower())

    cleaned = [t.replace("’", "'").replace("“", '"').replace("”", '"') for t in toks]
    return [t for t in cleaned if t.strip()]

# Data prep: vocab + n-grams
SPECIALS = ["<pad>", "<unk>", "<bos>", "<eos>"]
PAD, UNK, BOS, EOS = range(4)

def build_vocab(lines: List[str], min_freq: int = 1):
    tokens = []
    for line in lines:
        tokens.extend(tokenize_line(line))
    freqs = Counter(tokens)
    itos = SPECIALS + [t for t, c in freqs.items() if c >= min_freq and t not in SPECIALS]
    stoi = {t: i for i, t in enumerate(itos)}
    return stoi, itos

def line_to_ids(line: str, stoi: dict) -> List[int]:
    return [stoi.get(tok, UNK) for tok in tokenize_line(line)]

def make_ngrams(lines: List[str], stoi: dict, n: int) -> Tuple[List[List[int]], List[int]]:

    contexts, targets = [], []
    for line in lines:
        ids = line_to_ids(line, stoi)
        ids_ext = ids + [EOS]
        ctx = [BOS] * (n - 1) + ids_ext
        for i in range(n - 1, len(ctx)):
            context = ctx[i-(n-1):i]
            target = ctx[i]
            contexts.append(context)
            targets.append(target)
    return contexts, targets



In [9]:

# Dataset
class NGramDataset(Dataset):
    def __init__(self, contexts: List[List[int]], targets: List[int]):
        self.x = torch.tensor(contexts, dtype=torch.long)
        self.y = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        return self.x.size(0)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


In [10]:
# Model
class NGramMLP(nn.Module):
    def __init__(self, vocab_size: int, emb_dim: int, hidden: int, n: int):
        super().__init__()
        self.n_minus_1 = n - 1
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD)
        self.ff = nn.Sequential(
            nn.Linear(emb_dim * self.n_minus_1, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, vocab_size)
        )

    def forward(self, context_ids: torch.Tensor):
        # context_ids: (B, n-1)
        emb = self.emb(context_ids)             # (B, n-1, emb)
        flat = emb.reshape(emb.size(0), -1)     # (B, (n-1)*emb)
        logits = self.ff(flat)                  # (B, V)
        return logits


In [11]:


# Training / Eval
def load_lines(path: str) -> List[str]:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read().splitlines()

def split_train_valid(ctx, tgt, valid_frac=0.05):
    idx = list(range(len(tgt)))
    random.shuffle(idx)
    cut = int(len(idx) * (1 - valid_frac))
    tr_idx, va_idx = idx[:cut], idx[cut:]
    tr_x = [ctx[i] for i in tr_idx]
    tr_y = [tgt[i] for i in tr_idx]
    va_x = [ctx[i] for i in va_idx]
    va_y = [tgt[i] for i in va_idx]
    return (tr_x, tr_y), (va_x, va_y)

def epoch_run(model, loader, criterion, optimizer=None):
    is_train = optimizer is not None
    total, total_loss = 0, 0.0
    if is_train:
        model.train()
    else:
        model.eval()
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        logits = model(xb)
        loss = criterion(logits, yb)
        if is_train:
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
        total += yb.numel()
        total_loss += loss.item() * yb.size(0)
    return total_loss / max(total, 1)

@torch.no_grad()
def predict_next_words(model, prompt: str, stoi: dict, itos: List[str], n: int, k: int = 5):
    toks = tokenize_line(prompt)
    ids = [stoi.get(t, UNK) for t in toks][- (n - 1):]  #  last n-1 tokens
    ids = [BOS] * max(0, n - 1 - len(ids)) + ids
    ctx = torch.tensor([ids], dtype=torch.long, device=DEVICE)  # (1, n-1)
    logits = model(ctx)          # (1, V)
    probs = torch.softmax(logits[0], dim=-1)
    topk = torch.topk(probs, k=k)
    words = [itos[i] for i in topk.indices.tolist()]
    return list(zip(words, [float(p) for p in topk.values.tolist()]))



In [12]:
def main():
    print(f"The device is: {DEVICE}")
    lines = load_lines(TXT_PATH)
    print(f"Number of total lines={len(lines)} from {TXT_PATH}")

    # Build vocab
    stoi, itos = build_vocab(lines, min_freq=MIN_FREQ)
    vocab_size = len(itos)
    print(f"Vocab size={vocab_size}")

    # Build n-gram pairs
    contexts, targets = make_ngrams(lines, stoi, n=N)
    print(f"Pairs in total={len(targets)} (n={N} → context size={N-1})")

    # Train/valid split
    (tr_x, tr_y), (va_x, va_y) = split_train_valid(contexts, targets, valid_frac=0.05)

    train_ds = NGramDataset(tr_x, tr_y)
    valid_ds = NGramDataset(va_x, va_y)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
    valid_loader = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

    # Model
    model = NGramMLP(vocab_size, EMB_DIM, HIDDEN, N).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)

    # Train
    for epoch in range(1, EPOCHS + 1):
        tr_loss = epoch_run(model, train_loader, criterion, optimizer)
        va_loss = epoch_run(model, valid_loader, criterion, optimizer=None)
        print(f"Epoch {epoch:02d}/{EPOCHS} | train_loss={tr_loss:.4f} | valid_loss={va_loss:.4f}")

    # custom prompts
    prompts = [
        "shall i compare",
        "in fair love",
        "when i do",
        "my love is",
        "time doth"
    ]

    print("\nTop-k next-word predictions")
    for s in prompts:
        preds = predict_next_words(model, s, stoi, itos, n=N, k=TOPK)
        pretty = ", ".join([f"{w} ({p:.3f})" for w, p in preds])
        print(f"  '{s}  →'  {pretty}")

if __name__ == "__main__":
    main()


The device is: cuda
Number of total lines=124185 from /content/shakespeare.txt
Vocab size=26980
Pairs in total=1241049 (n=4 → context size=3)
Epoch 01/8 | train_loss=5.4547 | valid_loss=5.2199
Epoch 02/8 | train_loss=5.0600 | valid_loss=5.1337
Epoch 03/8 | train_loss=4.9366 | valid_loss=5.1156
Epoch 04/8 | train_loss=4.8513 | valid_loss=5.0602
Epoch 05/8 | train_loss=4.7835 | valid_loss=5.0960
Epoch 06/8 | train_loss=4.7246 | valid_loss=5.0710
Epoch 07/8 | train_loss=4.6715 | valid_loss=5.0891
Epoch 08/8 | train_loss=4.6231 | valid_loss=5.0996

Top-k next-word predictions
  'shall i compare  →'  , (0.104), <eos> (0.070), ; (0.035), to (0.028), . (0.022)
  'in fair love  →'  , (0.378), . (0.112), <eos> (0.055), ; (0.028), - (0.020)
  'when i do  →'  not (0.075), see (0.052), love (0.044), believe (0.029), fear (0.027)
  'my love is  →'  dead (0.062), not (0.045), better (0.031), , (0.026), too (0.022)
  'time doth  →'  not (0.130), the (0.064), in (0.018), servant (0.016), , (0.013)
