# RNN Sentiment Analysis (PyTorch)
Rotten Tomatoes • Model selection + Uni vs Bi + Embedding stability (vs GloVe)

This notebook implements a strong RNN-based baseline using **(Bi)GRU/LSTM + Attention pooling**.


In [None]:
# =========================
# Cell 1: Setup + data
# =========================
from datasets import load_dataset
import re, math, random, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Reproducibility: seed Python's `random`, NumPy and PyTorch (CPU + all CUDA devices).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Request deterministic cuDNN kernels and turn off the cuDNN autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the "rotten_tomatoes" dataset; later cells read its "train",
# "validation" and "test" splits with "text" and "label" columns.
rt = load_dataset("rotten_tomatoes")

# TextVectorization-like standardization: lowercase + strip punctuation
# (mirrors the idea of "lower_and_strip_punctuation" followed by a whitespace split).
_punct_re = re.compile(r"[^\w\s']")  # keep the apostrophe in words like "don't"
_space_re = re.compile(r"\s+")


def standardize_tf_like(s: str) -> str:
    """Lowercase *s*, replace punctuation with spaces and collapse whitespace.

    Every character that is not a word character, whitespace or an apostrophe
    becomes a single space; runs of whitespace are then squeezed to one space
    and the result is stripped at both ends.
    """
    lowered = s.lower()
    without_punct = _punct_re.sub(" ", lowered)
    return _space_re.sub(" ", without_punct).strip()

def tokenize_tf_like(s: str):
    """Standardize *s* and return its whitespace-separated tokens as a list."""
    normalized = standardize_tf_like(s)
    return normalized.split()

from collections import Counter

def build_vocab_tf_like(texts, max_tokens=20000, min_freq=1):
    """Build a TextVectorization-like vocabulary from *texts*.

    - Special tokens come first: ``<pad>`` = 0, ``<unk>`` = 1.
    - Remaining tokens are ordered by descending frequency, with ties broken
      alphabetically so the ordering is stable.
    - Tokens seen fewer than *min_freq* times are dropped, and the vocabulary
      (specials included) stops growing once it holds *max_tokens* entries.

    Returns:
        ``(stoi, itos)``: token -> index dict and index -> token list.
    """
    counts = Counter()
    for text in texts:
        counts.update(tokenize_tf_like(text))

    ranked = sorted(
        ((tok, n) for tok, n in counts.items() if n >= min_freq),
        key=lambda pair: (-pair[1], pair[0]),  # frequency desc, then alphabetical
    )

    itos = ["<pad>", "<unk>"]
    stoi = {tok: i for i, tok in enumerate(itos)}
    for tok, _ in ranked:
        if tok in stoi:
            # Never re-register a token that collides with a special.
            continue
        if len(itos) >= max_tokens:
            break
        stoi[tok] = len(itos)
        itos.append(tok)
    return stoi, itos

def encode(text, stoi, seq_len=60):
    """Turn *text* into a fixed-length ``int64`` array of token ids.

    Unknown tokens map to id 1 (``<unk>``). The sequence is truncated to
    *seq_len* ids and right-padded with id 0 (``<pad>``) when shorter.
    """
    ids = [stoi.get(tok, 1) for tok in tokenize_tf_like(text)][:seq_len]
    # A non-positive repeat count yields an empty list, so this is a no-op
    # once the sequence already has seq_len ids.
    ids.extend([0] * (seq_len - len(ids)))
    return np.asarray(ids, dtype=np.int64)

class RTDataset(Dataset):
    """Map-style dataset that encodes one review per item.

    *split* must support ``split["text"]`` and ``split["label"]``; each item is
    ``(token_ids, label)`` where ``token_ids`` is the ``encode`` output as a
    tensor and ``label`` is a float32 scalar tensor.
    """

    def __init__(self, split, stoi, seq_len=60):
        self.texts, self.labels = split["text"], split["label"]
        self.stoi, self.seq_len = stoi, seq_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        token_ids = torch.from_numpy(encode(self.texts[idx], self.stoi, self.seq_len))
        target = torch.tensor(float(self.labels[idx]), dtype=torch.float32)
        return token_ids, target

def make_loaders(vocab_size=20000, seq_len=60, batch_size=64):
    """Build the vocabulary from the train split and return all three loaders.

    Returns:
        ``(train_loader, val_loader, test_loader, stoi, itos)``. Only the
        training loader shuffles, using a generator seeded with ``SEED``.
    """
    stoi, itos = build_vocab_tf_like(rt["train"]["text"], max_tokens=vocab_size)
    by_split = {
        name: RTDataset(rt[name], stoi, seq_len)
        for name in ("train", "validation", "test")
    }

    # Dedicated generator so the shuffle order does not depend on global RNG state.
    shuffle_rng = torch.Generator()
    shuffle_rng.manual_seed(SEED)

    train_loader = DataLoader(
        by_split["train"],
        batch_size=batch_size,
        shuffle=True,
        generator=shuffle_rng,
        num_workers=0,
    )
    val_loader, test_loader = (
        DataLoader(by_split[name], batch_size=batch_size, shuffle=False, num_workers=0)
        for name in ("validation", "test")
    )
    return train_loader, val_loader, test_loader, stoi, itos

# Build loaders and vocabulary once with the default settings
# (vocab_size=20000, seq_len=60, batch_size=64) and report the vocabulary size.
train_loader, val_loader, test_loader, stoi, itos = make_loaders()
print("Vocab:", len(itos))


In [None]:
# =========================
# Cell 2: Model (GRU/LSTM + Attention Pooling)
# =========================
class AttentionPool(nn.Module):
    """
    Additive attention over time steps.

    Input:  H (B, T, D), optional boolean mask (B, T) that is True at
            positions that may receive attention.
    Output: pooled (B, D) and the attention weights w (B, T).
    """
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, 1, bias=False)

    def forward(self, H, mask=None):
        scores = self.v(torch.tanh(self.proj(H))).squeeze(-1)  # (B, T)
        if mask is not None:
            # Fill masked positions with the most negative finite value of the
            # scores' own dtype. A hard-coded -1e9 is not representable in
            # float16 (the dtype of the scores under CUDA autocast), while -inf
            # would turn a fully masked row into NaN after the softmax.
            scores = scores.masked_fill(~mask, torch.finfo(scores.dtype).min)
        w = torch.softmax(scores, dim=1)  # (B, T)
        pooled = (H * w.unsqueeze(-1)).sum(dim=1)  # (B, D)
        return pooled, w

class RNNClassifier(nn.Module):
    """Embedding -> (Bi)GRU/LSTM -> attention pooling -> MLP binary classifier.

    Args:
        rnn_type: "gru" or "lstm" (case-insensitive).
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding dimension.
        hidden_size: RNN hidden size per direction.
        num_layers: number of stacked RNN layers.
        dropout: dropout used in the MLP head and, when num_layers > 1,
            between RNN layers.
        bidirectional: whether the RNN runs in both directions.
        pad_idx: token id of the padding token; it gets a fixed embedding row
            and is excluded from attention.

    Raises:
        ValueError: if rnn_type is neither "gru" nor "lstm".
    """
    def __init__(
        self,
        rnn_type="gru",
        vocab_size=20000,
        embed_dim=200,
        hidden_size=256,
        num_layers=1,
        dropout=0.35,
        bidirectional=True,
        pad_idx=0
    ):
        super().__init__()
        self.rnn_type = rnn_type.lower()
        if self.rnn_type not in ("gru", "lstm"):
            # Fail loudly instead of silently building an LSTM for a typo.
            raise ValueError(f"rnn_type must be 'gru' or 'lstm', got {rnn_type!r}")
        self.bidirectional = bidirectional
        # Kept so forward() masks the same id the embedding treats as padding.
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

        rnn_cls = nn.GRU if self.rnn_type == "gru" else nn.LSTM
        self.rnn = rnn_cls(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            # Inter-layer dropout only applies when there is more than one layer.
            dropout=0.0 if num_layers == 1 else dropout
        )

        out_dim = hidden_size * (2 if bidirectional else 1)
        self.attn = AttentionPool(out_dim)

        self.mlp = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(out_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        """Return ``(logits, attn_w)`` for token ids x of shape (B, T)."""
        emb = self.embedding(x)          # (B, T, E)
        H, _ = self.rnn(emb)             # (B, T, D)
        mask = (x != self.pad_idx)       # True at non-padding positions
        pooled, attn_w = self.attn(H, mask=mask)
        logits = self.mlp(pooled).squeeze(1)  # (B,)
        return logits, attn_w

def count_params(m):
    """Return the total number of elements across the trainable parameters of *m*."""
    total = 0
    for param in m.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


In [None]:
# =========================
# Cell 3: Train/eval + early stopping
# =========================
def acc_from_logits(logits, y):
    """Return the accuracy (Python float) of thresholding sigmoid(logits) at 0.5."""
    predicted = (torch.sigmoid(logits) >= 0.5).float()
    hits = predicted.eq(y).float()
    return hits.mean().item()

@torch.no_grad()
def evaluate(model, loader, criterion):
    """Evaluate *model* on *loader* and return ``(mean_loss, accuracy)``.

    Both metrics are averaged per example: each batch is weighted by its size,
    so a smaller final batch does not skew the result. This assumes
    *criterion* returns the batch mean (the default reduction of
    ``nn.BCEWithLogitsLoss``). An empty loader yields ``(nan, nan)``.
    """
    model.eval()
    loss_sum, acc_sum, n_seen = 0.0, 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        logits, _ = model(x)
        batch_n = y.size(0)
        loss_sum += criterion(logits, y).item() * batch_n
        acc_sum += acc_from_logits(logits, y) * batch_n
        n_seen += batch_n
    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, acc_sum / n_seen

def train_one_epoch(model, loader, optimizer, criterion, grad_clip=1.0, use_amp=True):
    """Train *model* for one pass over *loader*; return ``(mean_loss, accuracy)``.

    Args:
        grad_clip: max gradient norm, or None to disable clipping.
        use_amp: enable mixed precision; only takes effect on a CUDA device.

    The returned metrics are averaged per example (each batch weighted by its
    size), assuming *criterion* returns the batch mean. An empty loader yields
    ``(nan, nan)``.
    """
    model.train()
    amp_enabled = use_amp and device.type == "cuda"
    scaler = torch.cuda.amp.GradScaler(enabled=amp_enabled)
    loss_sum, acc_sum, n_seen = 0.0, 0.0, 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=amp_enabled):
            logits, _ = model(x)
            loss = criterion(logits, y)

        scaler.scale(loss).backward()
        if grad_clip is not None:
            # Gradients must be unscaled before their norm is clipped.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()

        batch_n = y.size(0)
        loss_sum += loss.item() * batch_n
        # The metric needs no autograd graph.
        acc_sum += acc_from_logits(logits.detach(), y) * batch_n
        n_seen += batch_n

    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, acc_sum / n_seen

def fit(config, epochs=12, batch_size=64, patience=2, verbose=False):
    """Train an RNNClassifier described by *config* with early stopping.

    Args:
        config: dict with keys vocab_size, seq_len, rnn_type, embed_dim,
            hidden_size, num_layers, dropout, bidirectional, lr,
            weight_decay and grad_clip.
        epochs: maximum number of epochs.
        batch_size: batch size for all three loaders.
        patience: stop after this many consecutive epochs without a
            validation-loss improvement of more than 1e-5.
        verbose: print one metrics line per epoch.

    Returns:
        ``(model, hist, pack)`` where the model holds the weights of the epoch
        with the best validation loss, ``hist`` maps train_loss / train_acc /
        val_loss / val_acc to per-epoch lists, and ``pack`` is
        ``(train_loader, val_loader, test_loader, stoi, itos)``.
    """
    # Reset the torch RNG so weight init and dropout do not depend on how many
    # models were trained earlier: every config in a sweep starts from the same
    # random state, and re-fitting a config starts from the same state as its
    # sweep run.
    torch.manual_seed(SEED)

    train_loader, val_loader, test_loader, stoi, itos = make_loaders(
        vocab_size=config["vocab_size"],
        seq_len=config["seq_len"],
        batch_size=batch_size
    )

    model = RNNClassifier(
        rnn_type=config["rnn_type"],
        vocab_size=config["vocab_size"],
        embed_dim=config["embed_dim"],
        hidden_size=config["hidden_size"],
        num_layers=config["num_layers"],
        dropout=config["dropout"],
        bidirectional=config["bidirectional"]
    ).to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config["lr"],
        weight_decay=config["weight_decay"]
    )

    hist = {"train_loss":[], "train_acc":[], "val_loss":[], "val_acc":[]}
    best_val_loss = float("inf")
    best_state = None
    bad = 0  # consecutive epochs without a validation-loss improvement

    for ep in range(1, epochs+1):
        tr_l, tr_a = train_one_epoch(model, train_loader, optimizer, criterion,
                                     grad_clip=config["grad_clip"], use_amp=True)
        va_l, va_a = evaluate(model, val_loader, criterion)

        hist["train_loss"].append(tr_l); hist["train_acc"].append(tr_a)
        hist["val_loss"].append(va_l);   hist["val_acc"].append(va_a)

        # Log before the early-stopping check so the final epoch is reported too.
        if verbose:
            print(f"ep {ep:02d} | tr loss {tr_l:.4f} acc {tr_a:.3f} | va loss {va_l:.4f} acc {va_a:.3f}")

        if va_l < best_val_loss - 1e-5:
            best_val_loss = va_l
            # Snapshot on the CPU so the checkpoint does not hold GPU memory.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                break

    # Restore the weights of the best validation epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, hist, (train_loader, val_loader, test_loader, stoi, itos)


## Hyperparameter search + visualization
We run a controlled sweep over hidden size, dropout, and learning rate on a unidirectional (UNI) model.


In [None]:
# =========================
# Cell 4: Hyperparameter search + visualisering
# =========================
# Baseline configuration; the sweep overrides hidden_size, dropout and lr.
base = {
    "rnn_type": "gru",
    "vocab_size": 20000,
    "seq_len": 60,
    "embed_dim": 200,
    "hidden_size": 256,
    "num_layers": 1,
    "dropout": 0.35,
    "bidirectional": False,   # UNI for fair comparison later
    "lr": 2e-3,
    "weight_decay": 1e-2,
    "grad_clip": 1.0,
}

# Full 3 x 3 x 2 grid, enumerated in the order hidden -> dropout -> lr.
grid = [
    dict(base, hidden_size=hidden, dropout=dropout, lr=lr)
    for hidden in [128, 256, 384]
    for dropout in [0.25, 0.35, 0.45]
    for lr in [1e-3, 2e-3]
]

results, histories = [], []

for i, cfg in enumerate(grid):
    model, hist, pack = fit(cfg, epochs=12, batch_size=64, patience=2, verbose=False)
    best_val_acc = max(hist["val_acc"])
    best_val_loss = min(hist["val_loss"])
    results.append(dict(
        run=i,
        hidden=cfg["hidden_size"],
        dropout=cfg["dropout"],
        lr=cfg["lr"],
        best_val_acc=best_val_acc,
        best_val_loss=best_val_loss,
        params=count_params(model),
    ))
    histories.append(hist)

# Rank runs: highest validation accuracy first, lowest validation loss as tie-breaker.
df = (
    pd.DataFrame(results)
    .sort_values(["best_val_acc", "best_val_loss"], ascending=[False, True])
    .reset_index(drop=True)
)
df.head(10)


In [None]:
# Visualize the hyperparameter effect: best validation accuracy per run.
# `df` is already sorted, so the x axis is the rank of the run, not its id.
plt.figure()
plt.plot(df["best_val_acc"].values, marker="o")
plt.xlabel("Run (sorted by best val acc)")
plt.ylabel("Best val accuracy")
plt.grid(True)
plt.show()

# Heatmap-style view: group by hidden size / dropout and average the best
# validation accuracy (the mean is taken over the learning rates).
pivot = df.pivot_table(index="hidden", columns="dropout", values="best_val_acc", aggfunc="mean")
print(pivot)

plt.figure()
plt.imshow(pivot.values, aspect="auto")
# Label the ticks with the actual dropout / hidden-size values.
plt.xticks(range(pivot.shape[1]), pivot.columns)
plt.yticks(range(pivot.shape[0]), pivot.index)
plt.xlabel("Dropout")
plt.ylabel("Hidden size")
plt.title("Mean best val acc (UNI-GRU)")
plt.colorbar()
plt.show()


## Train best UNI + test + learning curves

In [None]:
# =========================
# Cell 5: Træn endelig "bedste UNI" + test
# =========================
# Top-ranked row of the sweep table (best validation accuracy).
best_uni = df.iloc[0].to_dict()

# The "run" column stores the position of each config in `grid`, so the winning
# config is looked up directly. This avoids comparing floats that went through
# a pandas Series and cannot silently end up as None.
best_cfg_uni = grid[int(best_uni["run"])]

# Retrain the selected unidirectional config and keep its loaders/vocabulary.
uni_model, uni_hist, uni_pack = fit(best_cfg_uni, epochs=12, batch_size=64, patience=2, verbose=True)
train_loader, val_loader, test_loader, uni_stoi, uni_itos = uni_pack

# Test-set evaluation of the restored best-validation-loss weights.
criterion = nn.BCEWithLogitsLoss()
test_loss, test_acc = evaluate(uni_model, test_loader, criterion)
print("UNI Test acc:", test_acc)
print("UNI Params:", count_params(uni_model))

# Learning curves: loss.
plt.figure()
plt.plot(uni_hist["train_loss"], label="train_loss")
plt.plot(uni_hist["val_loss"], label="val_loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(); plt.grid(True); plt.show()

# Learning curves: accuracy.
plt.figure()
plt.plot(uni_hist["train_acc"], label="train_acc")
plt.plot(uni_hist["val_acc"], label="val_acc")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(); plt.grid(True); plt.show()


## Uni vs Bi comparison (value of bidirectionality)

In [None]:
# =========================
# Cell 6: Bi-directional sammenligning (fair sammenligning)
# =========================
# Same hyperparameters as the best UNI model; only the direction flag changes.
best_cfg_bi = {**best_cfg_uni, "bidirectional": True}

bi_model, bi_hist, bi_pack = fit(best_cfg_bi, epochs=12, batch_size=64, patience=2, verbose=True)
# bi_pack is (train_loader, val_loader, test_loader, stoi, itos); keep the last three.
bi_test_loader, bi_stoi, bi_itos = bi_pack[2:]

bi_test_loss, bi_test_acc = evaluate(bi_model, bi_test_loader, criterion)
print("BI Test acc:", bi_test_acc)
print("BI Params:", count_params(bi_model))

# Validation accuracy per epoch, UNI vs BI.
plt.figure()
for curve, curve_label in (
    (uni_hist["val_acc"], "UNI val_acc"),
    (bi_hist["val_acc"], "BI val_acc"),
):
    plt.plot(curve, label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Val accuracy")
plt.legend()
plt.grid(True)
plt.show()

# What bidirectionality buys (test accuracy) versus what it costs (parameters).
delta_acc = bi_test_acc - test_acc
delta_params = count_params(bi_model) - count_params(uni_model)
print("Δ test acc:", delta_acc)
print("Δ params:", delta_params)


## Constructed examples where 'future context' matters

In [None]:
# =========================
# Cell 7: Eksempler hvor "end-of-sentence" betyder meget
# =========================
@torch.no_grad()
def predict_probs(model, texts, stoi, seq_len):
    """Return sigmoid(logit) of *model* for each text as a 1-D NumPy array.

    Args:
        model: classifier whose forward pass returns ``(logits, attn_weights)``.
        texts: iterable of raw strings; it may be empty.
        stoi: token -> id mapping used to encode the texts.
        seq_len: fixed sequence length passed to ``encode``.

    Returns:
        Array of shape ``(len(texts),)``; an empty float32 array for no texts.
    """
    texts = list(texts)
    if not texts:
        # np.stack raises on an empty list, so answer the empty case directly.
        return np.empty((0,), dtype=np.float32)
    model.eval()
    X = np.stack([encode(t, stoi, seq_len) for t in texts])
    x = torch.from_numpy(X).to(device)
    logits, _ = model(x)
    return torch.sigmoid(logits).detach().cpu().numpy()

# Hand-written sentences whose sentiment is decided (or flipped) near the end.
examples = [
    "I thought the movie was great at first, but it isn't.",
    "This is not a good film despite the talented cast.",
    "The acting seems brilliant, until the ending ruins it.",
    "What a wonderful idea, executed so poorly.",
    "I laughed a lot, mostly at how bad it was.",
    "It looks promising although it never delivers.",
    "The plot is clever, not.",
    "This is the kind of film you recommend to your enemies.",
]

# Both models were fit with the same seq_len, taken from the UNI config.
seq_len = best_cfg_uni["seq_len"]

uni_p = predict_probs(uni_model, examples, uni_stoi, seq_len)
bi_p = predict_probs(bi_model, examples, bi_stoi, seq_len)

# Print the two models' positive-class probabilities for each sentence.
for sentence, p_uni, p_bi in zip(examples, uni_p, bi_p):
    print(f"Text: {sentence}")
    print(f"  UNI prob(pos): {float(p_uni):.3f}")
    print(f"  BI  prob(pos): {float(p_bi):.3f}\n")


## Embedding stability vs GloVe (dimension matched via PCA)

In [None]:
# =========================
# Cell 8: Embedding stability (vs GloVe) + dimension match + cosine + nearest neighbors
# =========================
# Notebook shell escape: quietly install gensim (needed for the GloVe download).
!pip -q install gensim

import gensim.downloader as api
from numpy.linalg import norm
from sklearn.decomposition import PCA

# Pretrained GloVe vectors fetched through gensim's downloader.
glove = api.load("glove-twitter-25")  # 25-dim

# The analysis below inspects the bidirectional model and its vocabulary.
model_for_emb = bi_model
stoi_for_emb  = bi_stoi
itos_for_emb  = bi_itos

# Learned embedding table as a NumPy array, detached from autograd and moved to the CPU.
E = model_for_emb.embedding.weight.detach().cpu().numpy()  # (V, embed_dim)

def cosine(a, b):
    """Return the cosine similarity of *a* and *b* as a Python float.

    A small epsilon in the denominator keeps zero vectors from dividing by zero.
    """
    denom = norm(a) * norm(b) + 1e-12
    return float(np.dot(a, b) / denom)

def get_vec(word):
    """Return the learned embedding row for *word*, or None if it is not in the vocabulary."""
    row = stoi_for_emb.get(word)
    return None if row is None else E[row]

# Probe words; only those present in both the model vocabulary and GloVe are used.
target_words = ["plot", "acting", "cheap", "affordable", "excellent", "waste"]
words = [w for w in target_words if w in stoi_for_emb and w in glove]
print("Using words:", words)

# First 3000 vocabulary entries (vocabulary order) that GloVe also contains.
shared = [w for w in itos_for_emb if (w in glove) and (w in stoi_for_emb)][:3000]

# Fit PCA on the shared words only, then project the whole embedding table
# down to 25 dimensions, the dimensionality of the GloVe vectors.
X_model = np.stack([E[stoi_for_emb[w]] for w in shared])
pca = PCA(n_components=25, random_state=SEED)
pca.fit(X_model)
E25 = pca.transform(E)

def vec25(word):
    """Return the 25-dim PCA-projected embedding of *word* (KeyError if unknown)."""
    row = stoi_for_emb[word]
    return E25[row]

# Word pairs whose similarity is compared between the learned space and GloVe.
pairs = [
    ("cheap", "affordable"),
    ("plot", "acting"),
    ("plot", "cheap"),
    ("acting", "waste"),
    ("excellent", "waste"),
    ("plot", "excellent"),
]

for a, b in pairs:
    # Skip pairs where either word is missing from the model vocab or GloVe.
    if a not in words or b not in words:
        continue
    sim_model = cosine(vec25(a), vec25(b))
    sim_glove = cosine(glove[a], glove[b])
    print(f"\n{a} vs {b}")
    print("  Model cosine:", round(sim_model, 3))
    print("  GloVe cosine:", round(sim_glove, 3))

def top_neighbors(word, topk=10):
    """Return the *topk* ``(token, cosine)`` pairs closest to *word* in the PCA space.

    The query word itself and the ``<pad>`` / ``<unk>`` specials are excluded;
    results are sorted by descending cosine similarity.
    """
    query_idx = stoi_for_emb[word]
    query_vec = E25[query_idx]
    specials = ("<pad>", "<unk>")
    scored = [
        (cand, cosine(query_vec, E25[j]))
        for j, cand in enumerate(itos_for_emb)
        if j != query_idx and cand not in specials
    ]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:topk]

# List the ten nearest learned-embedding neighbours of every probe word.
for query in words:
    print(f"\nNearest neighbors in learned embedding for '{query}':")
    for neighbor, score in top_neighbors(query, topk=10):
        print(" ", neighbor, round(score, 3))
