Note: Assistant reviewing the LSTM class implementation below. This cell is just a marker for the review session; it can be removed later.

In [1]:
# Core libraries and dependencies for text classification models
import csv
import json
import math
import os
import random
import re
import time
from collections import Counter, defaultdict

import numpy as np

In [2]:
# Download training and test datasets from repository
!wget https://raw.githubusercontent.com/pranavagrawaI/anlp-assignment-1/main/Corona_NLP_train.csv
!wget https://raw.githubusercontent.com/pranavagrawaI/anlp-assignment-1/main/Corona_NLP_test.csv

'wget' is not recognized as an internal or external command,
operable program or batch file.
'wget' is not recognized as an internal or external command,
operable program or batch file.
'wget' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
# Configuration and hyperparameters for all models
TRAIN_CSV = "Corona_NLP_train.csv"
TEST_CSV = "Corona_NLP_test.csv"

TRAIN_MODE = True
TEXT_COL = "OriginalTweet"
LABEL_COL = "Sentiment"

RNN_OUT_DIR = "rnn_artifacts"
LSTM_OUT_DIR = "lstm_artifacts"
TRANSFORMERS_OUT_DIR = "transformers_artifacts"
MAX_LEN = 40
EMB_DIM = 100
HIDDEN_DIM = 128
EPOCHS = 5
BATCH_SIZE = 64
LR = 1e-3
WEIGHT_DECAY = 1e-4
DROPOUT_P = 0.1
EARLY_STOP = 3
GRAD_CLIP = 1.0
RNG_SEED = 42

NUM_LAYERS = 1
NUM_HEADS = 2
DIM_FEEDFORWARD = 256

In [4]:
# Text preprocessing, tokenization, and special token handling
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

SPECIAL_TOKENS = [
    "<PAD>",
    "<UNK>",
    "<BOS>",
    "<EOS>",
    "<URL>",
    "<USER>",
    "<NUM>",
    "<EMO_POS>",
    "<EMO_NEG>",
]
PAD, UNK, BOS, EOS, URL_T, USER_T, NUM_T, EMO_POS, EMO_NEG = range(len(SPECIAL_TOKENS))

URL_RE = re.compile(r"https?://\S+|www\.[^\s]+", re.I)
USER_RE = re.compile(r"@[A-Za-z0-9_]+")
NUM_RE = re.compile(r"(?<![A-Za-z])[-+]?\d+[\d,\.]*")
EMO_POS_RE = re.compile(r"[😀😃😄😁😆😊🙂😍😘😺👍❤️♥️💖✨]")
EMO_NEG_RE = re.compile(r"[😞😟😠😡😢😭😔🙁😕👎]")
WS_RE = re.compile(r"\s+")
TOKEN_RE = re.compile(r"[\w\-']+|[^\w\s]")


def normalize_text(s: str) -> str:
    s = s.strip().lower()
    s = URL_RE.sub(" <URL> ", s)
    s = USER_RE.sub(" <USER> ", s)
    s = NUM_RE.sub(" <NUM> ", s)
    s = EMO_POS_RE.sub(" <EMO_POS> ", s)
    s = EMO_NEG_RE.sub(" <EMO_NEG> ", s)
    s = WS_RE.sub(" ", s)
    return s.strip()


def simple_tokenize(s: str):
    return TOKEN_RE.findall(s)


class Tokenizer:
    def __init__(self, max_vocab=30000, min_freq=1):
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.token2id, self.id2token = {}, []

    def fit(self, texts):
        counter = Counter()
        for t in texts:
            counter.update(simple_tokenize(normalize_text(t)))
        for sp in SPECIAL_TOKENS:
            counter[sp] += 10**9
        items = [w for w, c in counter.items() if c >= self.min_freq]
        items.sort(key=lambda w: counter[w], reverse=True)
        items = items[: self.max_vocab]
        self.id2token = items
        self.token2id = {w: i for i, w in enumerate(items)}
        for i, sp in enumerate(SPECIAL_TOKENS):
            self.token2id[sp] = i
            self.id2token[i] = sp

    def encode(self, s, add_bos_eos=True):
        toks = simple_tokenize(normalize_text(s))
        ids = [BOS] if add_bos_eos else []
        for tok in toks:
            if tok in ("<URL>", "<USER>", "<NUM>", "<EMO_POS>", "<EMO_NEG>"):
                ids.append(self.token2id[tok])
            else:
                ids.append(self.token2id.get(tok, UNK))
        if add_bos_eos:
            ids.append(EOS)
        return ids

In [5]:
# Data loading and preprocessing utility functions
def read_csv_text_label(path, text_col, label_col):
    texts, labels = [], []
    with open(path, newline="", encoding="ISO-8859-1") as f:
        reader = csv.DictReader(f)
        for row in reader:
            t, y = row[text_col], row[label_col]
            if t and y:
                texts.append(str(t))
                labels.append(str(y))
    return texts, labels


def pad_sequences(seqs, max_len):
    N = len(seqs)
    arr = np.full((N, max_len), PAD, dtype=np.int32)
    mask = np.zeros((N, max_len), dtype=np.float32)
    for i, s in enumerate(seqs):
        L = min(len(s), max_len)
        arr[i, :L] = s[:L]
        mask[i, :L] = 1.0
    return arr, mask


def stratified_split(labels, val_ratio=0.15, seed=None):
    rng = random.Random(seed) if seed is not None else random
    label2idx = defaultdict(list)
    for i, y in enumerate(labels):
        label2idx[y].append(i)
    train_idx, val_idx = [], []
    for label in sorted(label2idx.keys()):
        idxs = label2idx[label]
        rng.shuffle(idxs)
        k = max(1, int(round(len(idxs) * val_ratio)))
        val_idx += idxs[:k]
        train_idx += idxs[k:]
    rng.shuffle(train_idx)
    rng.shuffle(val_idx)
    return train_idx, val_idx


def build_label_map(labels):
    uniq = sorted(set(labels))
    label2id = {y: i for i, y in enumerate(uniq)}
    id2label = {i: y for i, y in enumerate(uniq)}
    return label2id, id2label

In [6]:
# Embedding layer and vanilla RNN implementation with forward/backward passes
def orthogonal_(shape, gain=1.0):
    a = np.random.randn(*shape).astype(np.float32)
    u, _, v = np.linalg.svd(a, full_matrices=False)
    q = u if u.shape == shape else v
    return (gain * q).astype(np.float32)


class Embedding:
    def __init__(self, vocab_size, dim):
        lim = 1.0 / math.sqrt(dim)
        self.W = np.random.uniform(-lim, lim, (vocab_size, dim)).astype(np.float32)
        self.W[PAD] = 0.0
        self.grad = np.zeros_like(self.W)
        self.last_idx = None

    def forward(self, x_ids):
        self.last_idx = x_ids
        return self.W[x_ids]

    def backward(self, dE):
        self.grad.fill(0.0)
        idx = self.last_idx
        mask_bt1 = (idx != PAD)[..., None].astype(np.float32)
        dE_masked = dE * mask_bt1
        np.add.at(self.grad, idx, dE_masked)
        self.grad[PAD] = 0.0


class RNN:
    def __init__(self, input_dim, hidden_dim):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.Wxh = (
            np.random.randn(input_dim, hidden_dim).astype(np.float32)
            / np.sqrt(input_dim)
        ) * 0.5
        self.Whh = orthogonal_((hidden_dim, hidden_dim), gain=1.0)
        self.bh = np.zeros((hidden_dim,), dtype=np.float32)
        self.dWxh = np.zeros_like(self.Wxh)
        self.dWhh = np.zeros_like(self.Whh)
        self.dbh = np.zeros_like(self.bh)
        self.last_x = None
        self.last_h = None
        self.last_mask = None

    def forward(self, x, mask):
        B, T, D = x.shape
        H = self.hidden_dim
        h = np.zeros((B, T, H), dtype=np.float32)
        h_prev = np.zeros((B, H), dtype=np.float32)
        for t in range(T):
            xt = x[:, t, :]
            pre = xt @ self.Wxh + h_prev @ self.Whh + self.bh
            ht = np.tanh(pre)
            mt = mask[:, t : t + 1]
            ht = mt * ht + (1.0 - mt) * h_prev
            h[:, t, :] = ht
            h_prev = ht
        self.last_x, self.last_h, self.last_mask = x, h, mask
        return h

    def backward(self, dh):
        x, h, mask = self.last_x, self.last_h, self.last_mask
        B, T, D = x.shape
        H = self.hidden_dim
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dbh = np.zeros_like(self.bh)
        dx = np.zeros_like(x)
        dh_next = np.zeros((B, H), dtype=np.float32)
        for t in reversed(range(T)):
            mt = mask[:, t : t + 1]
            ht = h[:, t, :]
            hprev = h[:, t - 1, :] if t > 0 else np.zeros_like(ht)
            xt = x[:, t, :]
            dht = dh[:, t, :] + dh_next
            dht = dht * mt
            pre_grad = (1.0 - ht * ht) * dht
            dWxh += xt.T @ pre_grad
            dWhh += hprev.T @ pre_grad
            dbh += np.sum(pre_grad, axis=0)
            dx[:, t, :] = pre_grad @ self.Wxh.T
            dh_next = pre_grad @ self.Whh.T
        self.dWxh[...] = dWxh
        self.dWhh[...] = dWhh
        self.dbh[...] = dbh
        return dx

In [7]:
# LSTM implementation with forward and backward passes
class LSTM:
    def __init__(self, input_dim, hidden_dim, seed=0):
        rng = np.random.default_rng(seed)
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.W_ih = rng.normal(
            0, 1 / np.sqrt(input_dim), (input_dim, 4 * hidden_dim)
        ).astype(np.float32)
        self.W_hh = rng.normal(
            0, 1 / np.sqrt(hidden_dim), (hidden_dim, 4 * hidden_dim)
        ).astype(np.float32)
        self.b = np.zeros(4 * hidden_dim, dtype=np.float32)
        self.b[hidden_dim : 2 * hidden_dim] = 1.0

        self.dW_ih = np.zeros_like(self.W_ih)
        self.dW_hh = np.zeros_like(self.W_hh)
        self.db = np.zeros_like(self.b)

        self.cache = None

    @staticmethod
    def _sigmoid(x):
        pos = x >= 0
        z = np.empty_like(x, dtype=np.float32)
        z[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
        ex = np.exp(x[~pos])
        z[~pos] = ex / (1.0 + ex)
        return z.astype(np.float32)

    def zero_grads(self):
        self.dW_ih.fill(0.0)
        self.dW_hh.fill(0.0)
        self.db.fill(0.0)

    def forward(self, x, mask):
        B, T, D = x.shape
        H = self.hidden_dim
        h_prev = np.zeros((B, H), dtype=np.float32)
        c_prev = np.zeros((B, H), dtype=np.float32)

        h_all = np.zeros((B, T, H), dtype=np.float32)
        self.cache = []

        for t in range(T):
            xt = x[:, t, :]
            mt = mask[:, t : t + 1].astype(np.float32)

            gates = xt @ self.W_ih + h_prev @ self.W_hh + self.b
            i_pre, f_pre, g_pre, o_pre = np.split(gates, 4, axis=1)

            i = self._sigmoid(i_pre)
            f = self._sigmoid(f_pre)
            g = np.tanh(g_pre).astype(np.float32)
            o = self._sigmoid(o_pre)

            c_bar = f * c_prev + i * g
            h_bar = o * np.tanh(c_bar).astype(np.float32)

            c_next = mt * c_bar + (1.0 - mt) * c_prev
            h_next = mt * h_bar + (1.0 - mt) * h_prev

            h_all[:, t, :] = h_next

            self.cache.append((xt, h_prev, c_prev, i, f, g, o, c_bar, h_bar, mt))

            h_prev, c_prev = h_next, c_next

        return h_all

    def backward(self, dh_all):
        assert self.cache is not None, "call forward() first"
        B, T, H = dh_all.shape
        D = self.input_dim

        dx = np.zeros((B, T, D), dtype=np.float32)
        self.zero_grads()

        dh_next = np.zeros((B, H), dtype=np.float32)
        dc_next = np.zeros((B, H), dtype=np.float32)

        for t in reversed(range(T)):
            xt, h_prev, c_prev, i, f, g, o, c_bar, h_bar, mt = self.cache[t]

            dh = dh_all[:, t, :] + dh_next

            dh_bar = dh * mt
            dh_prev_mask = dh * (1.0 - mt)

            tanh_c = np.tanh(c_bar).astype(np.float32)
            do = dh_bar * tanh_c * o * (1.0 - o)

            dc_bar = dh_bar * o * (1.0 - tanh_c**2) + dc_next * mt

            di = dc_bar * g * i * (1.0 - i)
            df = dc_bar * c_prev * f * (1.0 - f)
            dg = dc_bar * i * (1.0 - g**2)

            dc_prev_lstm = dc_bar * f

            dgates = np.concatenate([di, df, dg, do], axis=1).astype(np.float32)

            self.dW_ih += xt.T @ dgates
            self.dW_hh += h_prev.T @ dgates
            self.db += dgates.sum(axis=0)

            dx_t = dgates @ self.W_ih.T
            dh_prev_lstm = dgates @ self.W_hh.T

            dc_prev = dc_prev_lstm + dc_next * (1.0 - mt)

            dh_prev = dh_prev_lstm * mt + dh_prev_mask

            dh_next = dh_prev
            dc_next = dc_prev
            dx[:, t, :] = dx_t

        return dx

    def step_sgd(self, lr=1e-2, weight_decay=0.0):
        if weight_decay != 0.0:
            self.dW_ih += weight_decay * self.W_ih
            self.dW_hh += weight_decay * self.W_hh
        self.W_ih -= lr * self.dW_ih
        self.W_hh -= lr * self.dW_hh
        self.b -= lr * self.db
        self.zero_grads()

In [8]:
# Linear layer, loss functions, optimizer, and training utilities
class Linear:
    def __init__(self, in_dim, out_dim):
        lim = 1.0 / math.sqrt(in_dim)
        self.W = np.random.uniform(-lim, +lim, (in_dim, out_dim)).astype(np.float32)
        self.b = np.zeros((out_dim,), dtype=np.float32)
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        self.last_x = None

    def forward(self, x):
        self.last_x = x.copy()
        return x @ self.W + self.b

    def backward(self, dY):
        if self.last_x is None:
            raise RuntimeError("backward() called before forward()")

        x = self.last_x
        if x.shape[:-1] != dY.shape[:-1]:
            raise ValueError(
                f"Shape mismatch: forward input was {x.shape} but backward gradient is {dY.shape}"
            )

        x_flat = x.reshape(-1, x.shape[-1])
        dY_flat = dY.reshape(-1, dY.shape[-1])

        dW = x_flat.T @ dY_flat
        db = np.sum(dY_flat, axis=0)

        self.dW += dW
        self.db += db

        dx_flat = dY_flat @ self.W.T
        dx = dx_flat.reshape(x.shape)
        return dx

    def zero_grad(self):
        self.dW.fill(0.0)
        self.db.fill(0.0)
        self.last_x = None


def weighted_cross_entropy(logits, y, w):
    B, C = logits.shape
    exp_shift = np.max(logits, axis=1, keepdims=True)
    exp_logits = np.exp(logits - exp_shift)
    Z = np.sum(exp_logits, axis=1, keepdims=True)
    log_probs = (logits - exp_shift) - np.log(Z)

    true_log_probs = log_probs[np.arange(B), y]
    weighted_log_probs = true_log_probs * w[y]
    loss = -np.mean(weighted_log_probs)

    probs = exp_logits / Z
    dlog = probs.copy()
    dlog[np.arange(B), y] -= 1.0
    dlog *= w[y, None]
    dlog /= B
    return float(loss), dlog


def compute_class_weights(y, num_classes):
    counts = np.bincount(y, minlength=num_classes).astype(np.float32)
    counts[counts == 0] = 1.0
    inv = 1.0 / counts
    w = inv * (num_classes / np.sum(inv))
    return w.astype(np.float32)


def classification_report(y_true, y_pred, id2label):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    label_set = sorted(set(y_true.tolist()) | set(y_pred.tolist()))
    lines, macro = [], 0.0
    for c in label_set:
        tp = np.sum((y_true == c) & (y_pred == c))
        fp = np.sum((y_true != c) & (y_pred == c))
        fn = np.sum((y_true == c) & (y_pred != c))
        p = tp / (tp + fp + 1e-12)
        r = tp / (tp + fn + 1e-12)
        f1 = 2 * p * r / (p + r + 1e-12)
        macro += f1
        lines.append(f"{id2label[c]}: P={p:.3f} R={r:.3f} F1={f1:.3f}")
    lines.insert(0, f"Macro-F1={macro / max(1, len(label_set)):.4f}")
    return "\n".join(lines)


class Adam:
    def __init__(
        self, params, grads, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0
    ):
        self.params, self.grads = params, grads
        self.lr, (self.b1, self.b2), self.eps = lr, betas, eps
        self.weight_decay = weight_decay
        self.m = [np.zeros_like(p) for p in params]
        self.v = [np.zeros_like(p) for p in params]
        self.t = 0

    def step(self):
        self.t += 1
        b1t = 1 - self.b1**self.t
        b2t = 1 - self.b2**self.t
        for i, (p, g) in enumerate(zip(self.params, self.grads)):
            g = g + self.weight_decay * p if self.weight_decay != 0.0 else g
            self.m[i] = self.b1 * self.m[i] + (1 - self.b1) * g
            self.v[i] = self.b2 * self.v[i] + (1 - self.b2) * (g * g)
            mhat = self.m[i] / (b1t + 1e-12)
            vhat = self.v[i] / (b2t + 1e-12)
            p -= self.lr * mhat / (np.sqrt(vhat) + self.eps)

    def zero_grad(self):
        for g in self.grads:
            g.fill(0.0)


def batch_iter(X, M, y, batch_size, shuffle=True):
    N = X.shape[0]
    idx = np.arange(N)
    if shuffle:
        np.random.shuffle(idx)
    for i in range(0, N, batch_size):
        j = idx[i : i + batch_size]
        yield X[j], M[j], y[j]


def global_grad_clip(grads, max_norm=1.0):
    total = 0.0
    for g in grads:
        total += float(np.sum(g * g))
    global_norm = math.sqrt(total)
    if global_norm > max_norm:
        factor = max_norm / (global_norm + 1e-12)
        for g in grads:
            g *= factor


class Dropout:
    def __init__(self, p=0.0):
        self.p = p
        self.training = True
        self.last_mask = None

    def forward(self, x):
        if not self.training or self.p == 0.0:
            return x
        keep_prob = 1.0 - self.p
        mask = np.random.binomial(1, keep_prob, x.shape).astype(np.float32)
        self.last_mask = mask
        return x * mask / keep_prob

    def backward(self, dy):
        if not self.training or self.p == 0.0:
            return dy
        keep_prob = 1.0 - self.p
        return dy * self.last_mask / keep_prob

In [9]:
# Transformer components: positional encoding and layer normalization
def create_positional_encoding(max_len, emb_dim):
    pe = np.zeros((max_len, emb_dim), dtype=np.float32)
    position = np.arange(max_len, dtype=np.float32).reshape(-1, 1)

    div_term = np.exp(
        np.arange(0, emb_dim, 2, dtype=np.float32) * -(math.log(10000.0) / emb_dim)
    )

    pe[:, 0::2] = np.sin(position * div_term)
    pe[:, 1::2] = np.cos(position * div_term)

    return pe


class LayerNorm:
    def __init__(self, dim, eps=1e-5):
        self.dim = dim
        self.eps = eps

        self.gamma = np.ones(dim, dtype=np.float32)
        self.beta = np.zeros(dim, dtype=np.float32)

        self.dgamma = np.zeros_like(self.gamma)
        self.dbeta = np.zeros_like(self.beta)

        self.cache = None

    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        x_norm = (x - mean) / np.sqrt(var + self.eps)

        out = self.gamma * x_norm + self.beta

        self.cache = (x, mean, var, x_norm)
        return out

    def backward(self, dy):
        x, mean, var, x_norm = self.cache

        axis = tuple(range(dy.ndim - 1))
        self.dbeta[...] = np.sum(dy, axis=axis)
        self.dgamma[...] = np.sum(dy * x_norm, axis=axis)

        dx_norm = dy * self.gamma
        std_inv = 1.0 / np.sqrt(var + self.eps)

        dvar = np.sum(
            dx_norm * (x - mean) * -0.5 * (std_inv**3), axis=-1, keepdims=True
        )
        dmean = np.sum(-dx_norm * std_inv, axis=-1, keepdims=True) + dvar * np.mean(
            -2.0 * (x - mean), axis=-1, keepdims=True
        )

        dx = (
            (dx_norm * std_inv)
            + (dvar * 2.0 * (x - mean) / x.shape[-1])
            + (dmean / x.shape[-1])
        )

        return dx

In [10]:
# Multi-head attention mechanism for Transformer
def _softmax(x, axis=-1):
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)


def _dsoftmax(dy, y):
    y_reshaped = y[..., np.newaxis]
    dy_reshaped = dy[..., np.newaxis]

    diag_J = np.einsum("...ij,...ik->...ijk", y, np.eye(y.shape[-1]))
    outer_prod = np.einsum("...i,...j->...ij", y, y)
    J = diag_J - outer_prod

    d_out = y * dy - y * np.sum(y * dy, axis=-1, keepdims=True)
    return d_out


class MultiheadAttention:
    def __init__(self, emb_dim, num_heads):
        assert emb_dim % num_heads == 0, (
            "Embedding dimension must be divisible by num_heads"
        )
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.head_dim = emb_dim // num_heads

        self.in_proj = Linear(emb_dim, 3 * emb_dim)
        self.out_proj = Linear(emb_dim, emb_dim)

        self.cache = None

    def forward(self, x, key_padding_mask):
        B, T, D = x.shape

        qkv = self.in_proj.forward(x)
        q, k, v = np.split(qkv, 3, axis=-1)

        q = q.reshape(B, T, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
        k = k.reshape(B, T, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
        v = v.reshape(B, T, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)

        scores = (q @ k.transpose(0, 1, 3, 2)) / np.sqrt(self.head_dim)

        mask_reshaped = key_padding_mask[:, np.newaxis, np.newaxis, :]
        scores = np.where(mask_reshaped == 0, -1e9, scores)

        attn_weights = _softmax(scores, axis=-1)

        context = attn_weights @ v

        context_concatenated = context.transpose(0, 2, 1, 3).reshape(B, T, self.emb_dim)
        output = self.out_proj.forward(context_concatenated)

        self.cache = (x, q, k, v, attn_weights, context_concatenated)

        return output

    def backward(self, dy):
        x, q, k, v, attn_weights, context_concatenated = self.cache
        B, T, D = dy.shape

        d_context_concatenated = self.out_proj.backward(dy)

        d_context = d_context_concatenated.reshape(
            B, T, self.num_heads, self.head_dim
        ).transpose(0, 2, 1, 3)

        d_attn_weights = d_context @ v.transpose(0, 1, 3, 2)
        dv = attn_weights.transpose(0, 1, 3, 2) @ d_context

        d_scores = _dsoftmax(d_attn_weights, attn_weights)

        d_scores /= np.sqrt(self.head_dim)

        dq = d_scores @ k
        dk = d_scores.transpose(0, 1, 3, 2) @ q

        dq = dq.transpose(0, 2, 1, 3).reshape(B, T, self.emb_dim)
        dk = dk.transpose(0, 2, 1, 3).reshape(B, T, self.emb_dim)
        dv = dv.transpose(0, 2, 1, 3).reshape(B, T, self.emb_dim)

        d_qkv = np.concatenate([dq, dk, dv], axis=-1)
        dx = self.in_proj.backward(d_qkv)

        return dx

In [11]:
# Transformer encoder layer with self-attention and feed-forward network
class TransformerEncoderLayer:
    def __init__(self, emb_dim, num_heads, dim_feedforward, dropout_p=0.1):
        self.self_attn = MultiheadAttention(emb_dim, num_heads)
        self.norm1 = LayerNorm(emb_dim)
        self.dropout1 = Dropout(dropout_p)

        self.linear1 = Linear(emb_dim, dim_feedforward)
        self.linear2 = Linear(dim_feedforward, emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.dropout2 = Dropout(dropout_p)

        self.cache = {}

    def train(self):
        self.dropout1.training = True
        self.dropout2.training = True

    def eval(self):
        self.dropout1.training = False
        self.dropout2.training = False

    def forward(self, x, mask):
        attn_output = self.self_attn.forward(x, mask)
        attn_output_dropped = self.dropout1.forward(attn_output)
        sublayer1_out = x + attn_output_dropped
        norm1_out = self.norm1.forward(sublayer1_out)

        linear1_out = self.linear1.forward(norm1_out)
        relu_out = np.maximum(0, linear1_out)
        linear2_out = self.linear2.forward(relu_out)
        ffn_output_dropped = self.dropout2.forward(linear2_out)
        sublayer2_out = norm1_out + ffn_output_dropped
        norm2_out = self.norm2.forward(sublayer2_out)

        self.cache["x"] = x
        self.cache["norm1_out"] = norm1_out
        self.cache["relu_out"] = relu_out

        return norm2_out

    def backward(self, dy):
        x = self.cache["x"]
        norm1_out = self.cache["norm1_out"]
        relu_out = self.cache["relu_out"]

        d_sublayer2_out = self.norm2.backward(dy)
        d_norm1_out_from_res = d_sublayer2_out
        d_ffn_output_dropped = d_sublayer2_out

        d_linear2_out = self.dropout2.backward(d_ffn_output_dropped)
        d_relu_out = self.linear2.backward(d_linear2_out)
        d_linear1_out = d_relu_out * (relu_out > 0)
        d_norm1_out_from_ffn = self.linear1.backward(d_linear1_out)

        d_norm1_out = d_norm1_out_from_res + d_norm1_out_from_ffn

        d_sublayer1_out = self.norm1.backward(d_norm1_out)
        dx_from_res = d_sublayer1_out
        d_attn_output_dropped = d_sublayer1_out

        d_attn_output = self.dropout1.backward(d_attn_output_dropped)
        dx_from_attn = self.self_attn.backward(d_attn_output)

        dx = dx_from_res + dx_from_attn

        return dx

In [12]:
# Training loop with early stopping and validation monitoring
def train_model(
    model,
    train_data,
    val_data,
    num_classes,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    lr=1e-3,
    weight_decay=WEIGHT_DECAY,
    early_patience=EARLY_STOP,
    max_grad_norm=1.0,
    id2label=None,
):
    Xtr, Mtr, ytr = train_data
    Xva, Mva, yva = val_data

    class_w = compute_class_weights(ytr, num_classes)
    print("Class counts:", np.bincount(ytr, minlength=num_classes))
    print("Class weights:", class_w)

    params, grads = model.parameters_and_grads()
    opt = Adam(params, grads, lr=lr, weight_decay=weight_decay)

    best_f1, best_state, patience = -1.0, None, early_patience

    for epoch in range(1, epochs + 1):
        model.train()
        opt.zero_grad()
        losses = []
        t0 = time.time()
        num_batches = int(np.ceil(len(Xtr) / batch_size))
        last_gnorm = None

        for b, (xb, mb, yb) in enumerate(
            batch_iter(Xtr, Mtr, ytr, batch_size, shuffle=True), 1
        ):
            logits = model.forward(xb, mb)
            loss, dlog = weighted_cross_entropy(logits, yb, class_w)
            model.backward(dlog)

            gsum = 0.0
            for g in grads:
                gsum += float(np.sum(g * g))
            last_gnorm = math.sqrt(gsum)

            global_grad_clip(grads, max_grad_norm)
            opt.step()
            opt.zero_grad()
            losses.append(loss)

            elapsed = time.time() - t0
            print(
                f"\r  batch {b:>4}/{num_batches} | avg_loss={np.mean(losses):.4f} | grad_norm={last_gnorm:.3e} | {elapsed:.1f}s",
                end="",
                flush=True,
            )

        dur = time.time() - t0
        print(f"\nend-epoch grad_norm={last_gnorm:.3e}")

        model.eval()
        logits = []
        for xb, mb, yb in batch_iter(Xva, Mva, yva, batch_size=256, shuffle=False):
            logits.append(model.forward(xb, mb))
        logits = np.concatenate(logits, axis=0)
        y_pred = np.argmax(logits, axis=1)

        labels = sorted(set(yva.tolist()) | set(y_pred.tolist()))
        f1_sum = 0.0
        for c in labels:
            tp = np.sum((yva == c) & (y_pred == c))
            fp = np.sum((yva != c) & (y_pred == c))
            fn = np.sum((yva == c) & (y_pred != c))
            p = tp / (tp + fp + 1e-12)
            r = tp / (tp + fn + 1e-12)
            f1 = 2 * p * r / (p + r + 1e-12)
            f1_sum += f1
        macro_f1 = f1_sum / max(1, len(labels))

        print(
            f"Epoch {epoch:02d} | loss={np.mean(losses):.4f} | val_macro_f1={macro_f1:.4f} | {dur:.1f}s"
        )

        if macro_f1 > best_f1 + 1e-6:
            best_f1 = macro_f1
            best_state = {
                "emb.W": model.emb.W.copy(),
                "rnn.Wxh": model.rnn.Wxh.copy(),
                "rnn.Whh": model.rnn.Whh.copy(),
                "rnn.bh": model.rnn.bh.copy(),
                "fc.W": model.fc.W.copy(),
                "fc.b": model.fc.b.copy(),
            }
            patience = early_patience
        else:
            patience -= 1
            if patience <= 0:
                print("Early stopping.")
                break

    if best_state is not None:
        model.emb.W[:] = best_state["emb.W"]
        model.rnn.Wxh[:] = best_state["rnn.Wxh"]
        model.rnn.Whh[:] = best_state["rnn.Whh"]
        model.rnn.bh[:] = best_state["rnn.bh"]
        model.fc.W[:] = best_state["fc.W"]
        model.fc.b[:] = best_state["fc.b"]

    model.eval()
    logits = []
    for xb, mb, yb in batch_iter(Xva, Mva, yva, batch_size=256, shuffle=False):
        logits.append(model.forward(xb, mb))
    logits = np.concatenate(logits, axis=0)
    y_pred = np.argmax(logits, axis=1)
    print("\nValidation report:\n" + classification_report(yva, y_pred, id2label))
    return model

In [13]:
# RNN model save and load functions for artifacts and weights
def save_rnn_artifacts(out_dir, tokenizer, label2id, config, model):
    os.makedirs(out_dir, exist_ok=True)

    tok_path = os.path.join(out_dir, "tokenizer.json")
    with open(tok_path, "w") as f:
        json.dump(
            {
                "token2id": tokenizer.token2id,
                "id2token": tokenizer.id2token,
                "max_vocab": tokenizer.max_vocab,
                "min_freq": tokenizer.min_freq,
            },
            f,
        )

    label_path = os.path.join(out_dir, "labels.json")
    with open(label_path, "w") as f:
        json.dump(label2id, f)

    config_path = os.path.join(out_dir, "config.json")
    with open(config_path, "w") as f:
        json.dump(config, f)

    weights_path = os.path.join(out_dir, "weights_rnn_best.npz")
    np.savez(
        weights_path,
        emb_W=model.emb.W,
        rnn_Wxh=model.rnn.Wxh,
        rnn_Whh=model.rnn.Whh,
        rnn_bh=model.rnn.bh,
        fc_W=model.fc.W,
        fc_b=model.fc.b,
    )
    print(f"✅ Saved artifacts to {out_dir}/")


def load_rnn_weights(weights_path, model):
    data = np.load(weights_path)
    model.emb.W[:] = data["emb_W"]
    model.rnn.Wxh[:] = data["rnn_Wxh"]
    model.rnn.Whh[:] = data["rnn_Whh"]
    model.rnn.bh[:] = data["rnn_bh"]
    model.fc.W[:] = data["fc_W"]
    model.fc.b[:] = data["fc_b"]
    print(f"✅ Loaded weights from {weights_path}")

In [14]:
# LSTM model save and load functions for artifacts and weights
def save_lstm_artifacts(out_dir, tokenizer, label2id, config, model):
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "vocab.json"), "w", encoding="utf-8") as f:
        json.dump({"token2id": tokenizer.token2id, "id2token": tokenizer.id2token}, f)
    with open(os.path.join(out_dir, "label2id.json"), "w", encoding="utf-8") as f:
        json.dump(label2id, f)
    with open(os.path.join(out_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f)
    np.savez(
        os.path.join(out_dir, "weights_lstm_best.npz"),
        emb_W=model.emb.W,
        lstm_W_ih=model.lstm.W_ih,
        lstm_W_hh=model.lstm.W_hh,
        lstm_b=model.lstm.b,
        fc_W=model.fc.W,
        fc_b=model.fc.b,
    )


def load_lstm_weights(npz_path, model):
    z = np.load(npz_path)
    model.emb.W[:] = z["emb_W"]
    model.lstm.W_ih[:] = z["lstm_W_ih"]
    model.lstm.W_hh[:] = z["lstm_W_hh"]
    model.lstm.b[:] = z["lstm_b"]
    model.fc.W[:] = z["fc_W"]
    model.fc.b[:] = z["fc_b"]

In [15]:
# Transformer model save and load functions for artifacts and weights
def save_transformer_artifacts(out_dir, tokenizer, label2id, config, model):
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, "vocab.json"), "w", encoding="utf-8") as f:
        json.dump({"token2id": tokenizer.token2id, "id2token": tokenizer.id2token}, f)
    with open(os.path.join(out_dir, "label2id.json"), "w", encoding="utf-8") as f:
        json.dump(label2id, f)
    with open(os.path.join(out_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f)

    weights_to_save = {
        "emb_W": model.emb.W,
        "fc_W": model.fc.W,
        "fc_b": model.fc.b,
    }

    for i, layer in enumerate(model.encoder_layers):
        weights_to_save[f"layer_{i}_attn_in_proj_W"] = layer.self_attn.in_proj.W
        weights_to_save[f"layer_{i}_attn_in_proj_b"] = layer.self_attn.in_proj.b
        weights_to_save[f"layer_{i}_attn_out_proj_W"] = layer.self_attn.out_proj.W
        weights_to_save[f"layer_{i}_attn_out_proj_b"] = layer.self_attn.out_proj.b
        weights_to_save[f"layer_{i}_linear1_W"] = layer.linear1.W
        weights_to_save[f"layer_{i}_linear1_b"] = layer.linear1.b
        weights_to_save[f"layer_{i}_linear2_W"] = layer.linear2.W
        weights_to_save[f"layer_{i}_linear2_b"] = layer.linear2.b
        weights_to_save[f"layer_{i}_norm1_gamma"] = layer.norm1.gamma
        weights_to_save[f"layer_{i}_norm1_beta"] = layer.norm1.beta
        weights_to_save[f"layer_{i}_norm2_gamma"] = layer.norm2.gamma
        weights_to_save[f"layer_{i}_norm2_beta"] = layer.norm2.beta

    np.savez(os.path.join(out_dir, "weights_transformer_best.npz"), **weights_to_save)
    print(f"✅ Saved Transformer artifacts to {out_dir}/")


def load_transformer_weights(npz_path, model):
    z = np.load(npz_path)
    model.emb.W[:] = z["emb_W"]
    model.fc.W[:] = z["fc_W"]
    model.fc.b[:] = z["fc_b"]

    for i, layer in enumerate(model.encoder_layers):
        layer.self_attn.in_proj.W[:] = z[f"layer_{i}_attn_in_proj_W"]
        layer.self_attn.in_proj.b[:] = z[f"layer_{i}_attn_in_proj_b"]
        layer.self_attn.out_proj.W[:] = z[f"layer_{i}_attn_out_proj_W"]
        layer.self_attn.out_proj.b[:] = z[f"layer_{i}_attn_out_proj_b"]
        layer.linear1.W[:] = z[f"layer_{i}_linear1_W"]
        layer.linear1.b[:] = z[f"layer_{i}_linear1_b"]
        layer.linear2.W[:] = z[f"layer_{i}_linear2_W"]
        layer.linear2.b[:] = z[f"layer_{i}_linear2_b"]
        layer.norm1.gamma[:] = z[f"layer_{i}_norm1_gamma"]
        layer.norm1.beta[:] = z[f"layer_{i}_norm1_beta"]
        layer.norm2.gamma[:] = z[f"layer_{i}_norm2_gamma"]
        layer.norm2.beta[:] = z[f"layer_{i}_norm2_beta"]
    print(f"✅ Loaded Transformer weights from {npz_path}")

In [16]:
# RNN-based text classifier with embedding, RNN, and fully connected layers
class RNNClassifier:
    def __init__(
        self, vocab_size, num_classes, emb_dim=100, hidden_dim=256, dropout=0.0
    ):
        self.emb = Embedding(vocab_size, emb_dim)
        self.rnn = RNN(emb_dim, hidden_dim)
        self.drop_emb = Dropout(dropout)
        self.drop_pool = Dropout(dropout)
        self.fc = Linear(hidden_dim, num_classes)
        self.training = True
        self._cache = None

    def parameters_and_grads(self):
        params = [
            self.emb.W,
            self.rnn.Wxh,
            self.rnn.Whh,
            self.rnn.bh,
            self.fc.W,
            self.fc.b,
        ]
        grads = [
            self.emb.grad,
            self.rnn.dWxh,
            self.rnn.dWhh,
            self.rnn.dbh,
            self.fc.dW,
            self.fc.db,
        ]
        return params, grads

    def train(self):
        self.training = True
        self.drop_emb.training = True
        self.drop_pool.training = True

    def eval(self):
        self.training = False
        self.drop_emb.training = False
        self.drop_pool.training = False

    def forward(self, x_ids, mask):
        E = self.emb.forward(x_ids)
        E = self.drop_emb.forward(E)
        H = self.rnn.forward(E, mask)
        lengths = np.sum(mask, axis=1).astype(np.int32)
        last_idx = np.clip(lengths - 1, 0, mask.shape[1] - 1)
        pooled = H[np.arange(H.shape[0]), last_idx]
        pooled = self.drop_pool.forward(pooled)
        logits = self.fc.forward(pooled)
        self._cache = (lengths,)
        return logits

    def backward(self, dlogits):
        dpooled = self.fc.backward(dlogits)
        dpooled = self.drop_pool.backward(dpooled)

        (lengths,) = self._cache
        B, T, H = self.rnn.last_h.shape
        dH = np.zeros_like(self.rnn.last_h)
        last_idx = np.clip(lengths - 1, 0, T - 1)
        dH[np.arange(B), last_idx] = dpooled

        dE = self.rnn.backward(dH)
        dE = self.drop_emb.backward(dE)
        self.emb.backward(dE)

In [17]:
# LSTM-based text classifier with embedding, LSTM, and fully connected layers
class LSTMClassifier:
    def __init__(
        self, vocab_size, num_classes, emb_dim=100, hidden_dim=256, dropout=0.0
    ):
        self.emb = Embedding(vocab_size, emb_dim)
        self.lstm = LSTM(emb_dim, hidden_dim)
        self.drop_emb = Dropout(dropout)
        self.drop_pool = Dropout(dropout)
        self.fc = Linear(hidden_dim, num_classes)
        self.training = True
        self._cache = None

    def parameters_and_grads(self):
        params = [
            self.emb.W,
            self.lstm.W_ih,
            self.lstm.W_hh,
            self.lstm.b,
            self.fc.W,
            self.fc.b,
        ]
        grads = [
            self.emb.grad,
            self.lstm.dW_ih,
            self.lstm.dW_hh,
            self.lstm.db,
            self.fc.dW,
            self.fc.db,
        ]
        return params, grads

    def train(self):
        self.training = True
        self.drop_emb.training = True
        self.drop_pool.training = True

    def eval(self):
        self.training = False
        self.drop_emb.training = False
        self.drop_pool.training = False

    def forward(self, x_ids, mask):
        E = self.emb.forward(x_ids)
        E = self.drop_emb.forward(E)
        H = self.lstm.forward(E, mask)
        lengths = np.sum(mask, axis=1).astype(np.int32)
        last_idx = np.clip(lengths - 1, 0, mask.shape[1] - 1)
        pooled = H[np.arange(H.shape[0]), last_idx]
        pooled = self.drop_pool.forward(pooled)
        logits = self.fc.forward(pooled)
        self._cache = (lengths, H)
        return logits

    def backward(self, dlogits):
        dpooled = self.fc.backward(dlogits)
        dpooled = self.drop_pool.backward(dpooled)

        lengths, H = self._cache
        B, T, hidden_dim = H.shape
        dH = np.zeros_like(H)
        last_idx = np.clip(lengths - 1, 0, T - 1)
        dH[np.arange(B), last_idx] = dpooled

        dE = self.lstm.backward(dH)
        dE = self.drop_emb.backward(dE)
        self.emb.backward(dE)

In [22]:
# LSTM-specific model training function adapted for LSTM attributes
def train_lstm_model(
    model,
    train_data,
    val_data,
    num_classes,
    epochs=10,
    batch_size=32,
    lr=1e-3,
    weight_decay=1e-4,
    early_patience=5,
    max_grad_norm=1.0,
    id2label=None,
):
    Xtr, Mtr, ytr = train_data
    Xva, Mva, yva = val_data

    class_w = compute_class_weights(ytr, num_classes)
    print("Class counts:", np.bincount(ytr, minlength=num_classes))
    print("Class weights:", class_w)

    params, grads = model.parameters_and_grads()
    opt = Adam(params, grads, lr=lr, weight_decay=weight_decay)

    best_f1, best_state, patience = -1.0, None, early_patience

    for epoch in range(1, epochs + 1):
        model.train()
        opt.zero_grad()
        losses = []
        t0 = time.time()
        num_batches = int(np.ceil(len(Xtr) / batch_size))
        last_gnorm = None

        for b, (xb, mb, yb) in enumerate(
            batch_iter(Xtr, Mtr, ytr, batch_size, shuffle=True), 1
        ):
            logits = model.forward(xb, mb)
            loss, dlog = weighted_cross_entropy(logits, yb, class_w)
            model.backward(dlog)

            gsum = 0.0
            for g in grads:
                gsum += float(np.sum(g * g))
            last_gnorm = math.sqrt(gsum)

            global_grad_clip(grads, max_grad_norm)
            opt.step()
            opt.zero_grad()
            losses.append(loss)

            elapsed = time.time() - t0
            print(
                f"\r  batch {b:>4}/{num_batches} | avg_loss={np.mean(losses):.4f} | grad_norm={last_gnorm:.3e} | {elapsed:.1f}s",
                end="",
                flush=True,
            )

        dur = time.time() - t0
        print(f"\nend-epoch grad_norm={last_gnorm:.3e}")

        model.eval()
        logits = []
        for xb, mb, yb in batch_iter(Xva, Mva, yva, batch_size=256, shuffle=False):
            logits.append(model.forward(xb, mb))
        logits = np.concatenate(logits, axis=0)
        y_pred = np.argmax(logits, axis=1)

        labels = sorted(set(yva.tolist()) | set(y_pred.tolist()))
        f1_sum = 0.0
        for c in labels:
            tp = np.sum((yva == c) & (y_pred == c))
            fp = np.sum((yva != c) & (y_pred == c))
            fn = np.sum((yva == c) & (y_pred != c))
            p = tp / (tp + fp + 1e-12)
            r = tp / (tp + fn + 1e-12)
            f1 = 2 * p * r / (p + r + 1e-12)
            f1_sum += f1
        macro_f1 = f1_sum / max(1, len(labels))

        print(
            f"Epoch {epoch:02d} | loss={np.mean(losses):.4f} | val_macro_f1={macro_f1:.4f} | {dur:.1f}s"
        )

        if macro_f1 > best_f1 + 1e-6:
            best_f1 = macro_f1
            best_state = {
                "emb.W": model.emb.W.copy(),
                "lstm.W_ih": model.lstm.W_ih.copy(),
                "lstm.W_hh": model.lstm.W_hh.copy(),
                "lstm.b": model.lstm.b.copy(),
                "fc.W": model.fc.W.copy(),
                "fc.b": model.fc.b.copy(),
            }
            patience = early_patience
        else:
            patience -= 1
            if patience <= 0:
                print("Early stopping.")
                break

    if best_state is not None:
        model.emb.W[:] = best_state["emb.W"]
        model.lstm.W_ih[:] = best_state["lstm.W_ih"]
        model.lstm.W_hh[:] = best_state["lstm.W_hh"]
        model.lstm.b[:] = best_state["lstm.b"]
        model.fc.W[:] = best_state["fc.W"]
        model.fc.b[:] = best_state["fc.b"]

    model.eval()
    logits = []
    for xb, mb, yb in batch_iter(Xva, Mva, yva, batch_size=256, shuffle=False):
        logits.append(model.forward(xb, mb))
    logits = np.concatenate(logits, axis=0)
    y_pred = np.argmax(logits, axis=1)
    print("\nValidation report:\n" + classification_report(yva, y_pred, id2label))
    return model

In [23]:
# LSTM model save and load functions for artifacts and weights
def save_lstm_artifacts(out_dir, tokenizer, label2id, config, model):
    os.makedirs(out_dir, exist_ok=True)

    tok_path = os.path.join(out_dir, "tokenizer.json")
    with open(tok_path, "w") as f:
        json.dump(
            {
                "token2id": tokenizer.token2id,
                "id2token": tokenizer.id2token,
                "max_vocab": tokenizer.max_vocab,
                "min_freq": tokenizer.min_freq,
            },
            f,
        )

    label_path = os.path.join(out_dir, "labels.json")
    with open(label_path, "w") as f:
        json.dump(label2id, f)

    config_path = os.path.join(out_dir, "config.json")
    with open(config_path, "w") as f:
        json.dump(config, f)

    weights_path = os.path.join(out_dir, "weights_lstm_best.npz")
    np.savez(
        weights_path,
        emb_W=model.emb.W,
        lstm_W_ih=model.lstm.W_ih,
        lstm_W_hh=model.lstm.W_hh,
        lstm_b=model.lstm.b,
        fc_W=model.fc.W,
        fc_b=model.fc.b,
    )
    print(f"✅ Saved LSTM artifacts to {out_dir}/")


def load_lstm_weights(weights_path, model):
    data = np.load(weights_path)
    model.emb.W[:] = data["emb_W"]
    model.lstm.W_ih[:] = data["lstm_W_ih"]
    model.lstm.W_hh[:] = data["lstm_W_hh"]
    model.lstm.b[:] = data["lstm_b"]
    model.fc.W[:] = data["fc_W"]
    model.fc.b[:] = data["fc_b"]
    print("✅ Loaded LSTM weights")

In [25]:
# Transformer-specific model training function adapted for Transformer attributes
def train_transformer_model(
    model,
    train_data,
    val_data,
    num_classes,
    epochs=10,
    batch_size=32,
    lr=1e-3,
    weight_decay=1e-4,
    early_patience=5,
    max_grad_norm=1.0,
    id2label=None,
):
    Xtr, Mtr, ytr = train_data
    Xva, Mva, yva = val_data

    class_w = compute_class_weights(ytr, num_classes)
    print("Class counts:", np.bincount(ytr, minlength=num_classes))
    print("Class weights:", class_w)

    params, grads = model.parameters_and_grads()
    opt = Adam(params, grads, lr=lr, weight_decay=weight_decay)

    best_f1, best_state, patience = -1.0, None, early_patience

    for epoch in range(1, epochs + 1):
        model.train()
        opt.zero_grad()
        losses = []
        t0 = time.time()
        num_batches = int(np.ceil(len(Xtr) / batch_size))
        last_gnorm = None

        for b, (xb, mb, yb) in enumerate(
            batch_iter(Xtr, Mtr, ytr, batch_size, shuffle=True), 1
        ):
            logits = model.forward(xb, mb)
            loss, dlog = weighted_cross_entropy(logits, yb, class_w)
            model.backward(dlog)

            gsum = 0.0
            for g in grads:
                gsum += float(np.sum(g * g))
            last_gnorm = math.sqrt(gsum)

            global_grad_clip(grads, max_grad_norm)
            opt.step()
            opt.zero_grad()
            losses.append(loss)

            elapsed = time.time() - t0
            print(
                f"\r  batch {b:>4}/{num_batches} | avg_loss={np.mean(losses):.4f} | grad_norm={last_gnorm:.3e} | {elapsed:.1f}s",
                end="",
                flush=True,
            )

        dur = time.time() - t0
        print(f"\nend-epoch grad_norm={last_gnorm:.3e}")

        model.eval()
        logits = []
        for xb, mb, yb in batch_iter(Xva, Mva, yva, batch_size=256, shuffle=False):
            logits.append(model.forward(xb, mb))
        logits = np.concatenate(logits, axis=0)
        y_pred = np.argmax(logits, axis=1)

        labels = sorted(set(yva.tolist()) | set(y_pred.tolist()))
        f1_sum = 0.0
        for c in labels:
            tp = np.sum((yva == c) & (y_pred == c))
            fp = np.sum((yva != c) & (y_pred == c))
            fn = np.sum((yva == c) & (y_pred != c))
            p = tp / (tp + fp + 1e-12)
            r = tp / (tp + fn + 1e-12)
            f1 = 2 * p * r / (p + r + 1e-12)
            f1_sum += f1
        macro_f1 = f1_sum / max(1, len(labels))

        print(
            f"Epoch {epoch:02d} | loss={np.mean(losses):.4f} | val_macro_f1={macro_f1:.4f} | {dur:.1f}s"
        )

        if macro_f1 > best_f1 + 1e-6:
            best_f1 = macro_f1
            # Save transformer state with all encoder layers
            best_state = {
                "emb.W": model.emb.W.copy(),
                "fc.W": model.fc.W.copy(),
                "fc.b": model.fc.b.copy(),
            }
            
            # Save all encoder layer parameters
            for i, layer in enumerate(model.encoder_layers):
                best_state[f"encoder_{i}.self_attn.in_proj.W"] = layer.self_attn.in_proj.W.copy()
                best_state[f"encoder_{i}.self_attn.in_proj.b"] = layer.self_attn.in_proj.b.copy()
                best_state[f"encoder_{i}.self_attn.out_proj.W"] = layer.self_attn.out_proj.W.copy()
                best_state[f"encoder_{i}.self_attn.out_proj.b"] = layer.self_attn.out_proj.b.copy()
                best_state[f"encoder_{i}.linear1.W"] = layer.linear1.W.copy()
                best_state[f"encoder_{i}.linear1.b"] = layer.linear1.b.copy()
                best_state[f"encoder_{i}.linear2.W"] = layer.linear2.W.copy()
                best_state[f"encoder_{i}.linear2.b"] = layer.linear2.b.copy()
                best_state[f"encoder_{i}.norm1.gamma"] = layer.norm1.gamma.copy()
                best_state[f"encoder_{i}.norm1.beta"] = layer.norm1.beta.copy()
                best_state[f"encoder_{i}.norm2.gamma"] = layer.norm2.gamma.copy()
                best_state[f"encoder_{i}.norm2.beta"] = layer.norm2.beta.copy()
            
            patience = early_patience
        else:
            patience -= 1
            if patience <= 0:
                print("Early stopping.")
                break

    if best_state is not None:
        # Restore best transformer state
        model.emb.W[:] = best_state["emb.W"]
        model.fc.W[:] = best_state["fc.W"]
        model.fc.b[:] = best_state["fc.b"]
        
        # Restore all encoder layer parameters
        for i, layer in enumerate(model.encoder_layers):
            layer.self_attn.in_proj.W[:] = best_state[f"encoder_{i}.self_attn.in_proj.W"]
            layer.self_attn.in_proj.b[:] = best_state[f"encoder_{i}.self_attn.in_proj.b"]
            layer.self_attn.out_proj.W[:] = best_state[f"encoder_{i}.self_attn.out_proj.W"]
            layer.self_attn.out_proj.b[:] = best_state[f"encoder_{i}.self_attn.out_proj.b"]
            layer.linear1.W[:] = best_state[f"encoder_{i}.linear1.W"]
            layer.linear1.b[:] = best_state[f"encoder_{i}.linear1.b"]
            layer.linear2.W[:] = best_state[f"encoder_{i}.linear2.W"]
            layer.linear2.b[:] = best_state[f"encoder_{i}.linear2.b"]
            layer.norm1.gamma[:] = best_state[f"encoder_{i}.norm1.gamma"]
            layer.norm1.beta[:] = best_state[f"encoder_{i}.norm1.beta"]
            layer.norm2.gamma[:] = best_state[f"encoder_{i}.norm2.gamma"]
            layer.norm2.beta[:] = best_state[f"encoder_{i}.norm2.beta"]

    model.eval()
    logits = []
    for xb, mb, yb in batch_iter(Xva, Mva, yva, batch_size=256, shuffle=False):
        logits.append(model.forward(xb, mb))
    logits = np.concatenate(logits, axis=0)
    y_pred = np.argmax(logits, axis=1)
    print("\nValidation report:\n" + classification_report(yva, y_pred, id2label))
    return model

In [26]:
# Transformer model save and load functions for artifacts and weights
def save_transformer_artifacts(out_dir, tokenizer, label2id, config, model):
    os.makedirs(out_dir, exist_ok=True)

    tok_path = os.path.join(out_dir, "tokenizer.json")
    with open(tok_path, "w") as f:
        json.dump(
            {
                "token2id": tokenizer.token2id,
                "id2token": tokenizer.id2token,
                "max_vocab": tokenizer.max_vocab,
                "min_freq": tokenizer.min_freq,
            },
            f,
        )

    label_path = os.path.join(out_dir, "labels.json")
    with open(label_path, "w") as f:
        json.dump(label2id, f)

    config_path = os.path.join(out_dir, "config.json")
    with open(config_path, "w") as f:
        json.dump(config, f)

    weights_path = os.path.join(out_dir, "weights_transformer_best.npz")
    
    # Prepare weights dictionary for all Transformer parameters
    weights_dict = {
        "emb_W": model.emb.W,
        "fc_W": model.fc.W,
        "fc_b": model.fc.b,
        "num_layers": len(model.encoder_layers),
    }
    
    # Save all encoder layer parameters
    for i, layer in enumerate(model.encoder_layers):
        weights_dict[f"encoder_{i}_self_attn_in_proj_W"] = layer.self_attn.in_proj.W
        weights_dict[f"encoder_{i}_self_attn_in_proj_b"] = layer.self_attn.in_proj.b
        weights_dict[f"encoder_{i}_self_attn_out_proj_W"] = layer.self_attn.out_proj.W
        weights_dict[f"encoder_{i}_self_attn_out_proj_b"] = layer.self_attn.out_proj.b
        weights_dict[f"encoder_{i}_linear1_W"] = layer.linear1.W
        weights_dict[f"encoder_{i}_linear1_b"] = layer.linear1.b
        weights_dict[f"encoder_{i}_linear2_W"] = layer.linear2.W
        weights_dict[f"encoder_{i}_linear2_b"] = layer.linear2.b
        weights_dict[f"encoder_{i}_norm1_gamma"] = layer.norm1.gamma
        weights_dict[f"encoder_{i}_norm1_beta"] = layer.norm1.beta
        weights_dict[f"encoder_{i}_norm2_gamma"] = layer.norm2.gamma
        weights_dict[f"encoder_{i}_norm2_beta"] = layer.norm2.beta
    
    np.savez(weights_path, **weights_dict)
    print(f"✅ Saved Transformer artifacts to {out_dir}/")


def load_transformer_weights(weights_path, model):
    data = np.load(weights_path)
    
    # Load basic parameters
    model.emb.W[:] = data["emb_W"]
    model.fc.W[:] = data["fc_W"]
    model.fc.b[:] = data["fc_b"]
    
    # Load encoder layer parameters
    num_layers = int(data["num_layers"])
    for i in range(num_layers):
        if i < len(model.encoder_layers):
            layer = model.encoder_layers[i]
            layer.self_attn.in_proj.W[:] = data[f"encoder_{i}_self_attn_in_proj_W"]
            layer.self_attn.in_proj.b[:] = data[f"encoder_{i}_self_attn_in_proj_b"]
            layer.self_attn.out_proj.W[:] = data[f"encoder_{i}_self_attn_out_proj_W"]
            layer.self_attn.out_proj.b[:] = data[f"encoder_{i}_self_attn_out_proj_b"]
            layer.linear1.W[:] = data[f"encoder_{i}_linear1_W"]
            layer.linear1.b[:] = data[f"encoder_{i}_linear1_b"]
            layer.linear2.W[:] = data[f"encoder_{i}_linear2_W"]
            layer.linear2.b[:] = data[f"encoder_{i}_linear2_b"]
            layer.norm1.gamma[:] = data[f"encoder_{i}_norm1_gamma"]
            layer.norm1.beta[:] = data[f"encoder_{i}_norm1_beta"]
            layer.norm2.gamma[:] = data[f"encoder_{i}_norm2_gamma"]
            layer.norm2.beta[:] = data[f"encoder_{i}_norm2_beta"]
    
    print("✅ Loaded Transformer weights")

In [18]:
# Transformer encoder-based text classifier with positional encoding and multi-layer attention
class TransformerClassifier:
    def __init__(
        self,
        vocab_size,
        num_classes,
        max_len,
        emb_dim,
        num_heads,
        num_layers,
        dim_feedforward,
        dropout_p=0.1,
    ):
        self.emb = Embedding(vocab_size, emb_dim)
        self.pos_encoding = create_positional_encoding(max_len, emb_dim)
        self.emb_dropout = Dropout(dropout_p)

        self.encoder_layers = [
            TransformerEncoderLayer(emb_dim, num_heads, dim_feedforward, dropout_p)
            for _ in range(num_layers)
        ]

        self.fc = Linear(emb_dim, num_classes)

        self.cache = {}

    def train(self):
        self.emb_dropout.training = True
        for layer in self.encoder_layers:
            layer.train()

    def eval(self):
        self.emb_dropout.training = False
        for layer in self.encoder_layers:
            layer.eval()

    def forward(self, x_ids, mask):
        B, T = x_ids.shape

        E = self.emb.forward(x_ids)

        E_pos = E + self.pos_encoding[:T, :]
        E_dropped = self.emb_dropout.forward(E_pos)

        x = E_dropped
        for layer in self.encoder_layers:
            x = layer.forward(x, mask)

        pooled = x[:, 0, :]

        logits = self.fc.forward(pooled)

        self.cache["encoder_output"] = x
        return logits

    def backward(self, dlogits):
        d_pooled = self.fc.backward(dlogits)

        encoder_output = self.cache["encoder_output"]
        dx = np.zeros_like(encoder_output)
        dx[:, 0, :] = d_pooled

        dy = dx
        for layer in reversed(self.encoder_layers):
            dy = layer.backward(dy)

        d_E_pos = self.emb_dropout.backward(dy)

        dE = d_E_pos

        self.emb.backward(dE)

    def parameters_and_grads(self):
        params = [self.emb.W, self.fc.W, self.fc.b]
        grads = [self.emb.grad, self.fc.dW, self.fc.db]

        for layer in self.encoder_layers:
            params.extend([layer.self_attn.in_proj.W, layer.self_attn.in_proj.b])
            params.extend([layer.self_attn.out_proj.W, layer.self_attn.out_proj.b])
            grads.extend([layer.self_attn.in_proj.dW, layer.self_attn.in_proj.db])
            grads.extend([layer.self_attn.out_proj.dW, layer.self_attn.out_proj.db])

            params.extend(
                [layer.linear1.W, layer.linear1.b, layer.linear2.W, layer.linear2.b]
            )
            grads.extend(
                [layer.linear1.dW, layer.linear1.db, layer.linear2.dW, layer.linear2.db]
            )

            params.extend([layer.norm1.gamma, layer.norm1.beta])
            params.extend([layer.norm2.gamma, layer.norm2.beta])
            grads.extend([layer.norm1.dgamma, layer.norm1.dbeta])
            grads.extend([layer.norm2.dgamma, layer.norm2.dbeta])

        return params, grads

In [19]:
# Data preparation and preprocessing setup for all models
tr_texts, tr_labels = read_csv_text_label(TRAIN_CSV, TEXT_COL, LABEL_COL)
te_texts, te_labels = read_csv_text_label(TEST_CSV, TEXT_COL, LABEL_COL)

label2id, id2label = build_label_map(tr_labels)
y_tr_all = np.array([label2id[y] for y in tr_labels], dtype=np.int64)
y_te = np.array([label2id[y] for y in te_labels], dtype=np.int64)

tr_idx, va_idx = stratified_split(tr_labels, val_ratio=0.15, seed=RNG_SEED)
tok = Tokenizer(max_vocab=30000, min_freq=1)
tok.fit([tr_texts[i] for i in tr_idx])

Xtr, Mtr = pad_sequences([tok.encode(tr_texts[i]) for i in tr_idx], MAX_LEN)
Xva, Mva = pad_sequences([tok.encode(tr_texts[i]) for i in va_idx], MAX_LEN)
Xte, Mte = pad_sequences([tok.encode(t) for t in te_texts], MAX_LEN)
ytr, yva = y_tr_all[tr_idx], y_tr_all[va_idx]

num_classes = len(label2id)
vocab_size = len(tok.id2token)
config = dict(
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    max_len=MAX_LEN,
    vocab_size=vocab_size,
    num_classes=num_classes,
)


def eval_split(model, X, M, y):
    logits = []
    for xb, mb, yb in batch_iter(X, M, y, batch_size=256, shuffle=False):
        logits.append(model.forward(xb, mb))
    logits = np.concatenate(logits, axis=0)
    y_pred = np.argmax(logits, axis=1)
    return y_pred

In [20]:
# RNN model training and evaluation pipeline
rnn = RNNClassifier(
    vocab_size=vocab_size,
    num_classes=num_classes,
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT_P,
)

if TRAIN_MODE:
    rnn = train_model(
        rnn,
        (Xtr, Mtr, ytr),
        (Xva, Mva, yva),
        num_classes=num_classes,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        lr=LR,
        weight_decay=WEIGHT_DECAY,
        early_patience=EARLY_STOP,
        max_grad_norm=GRAD_CLIP,
        id2label=id2label,
    )
    save_rnn_artifacts(RNN_OUT_DIR, tok, label2id, config, rnn)
else:
    weights_path = os.path.join(RNN_OUT_DIR, "weights_rnn_best.npz")
    if os.path.exists(weights_path):
        print("Loading RNN weights from", weights_path)
        load_rnn_weights(weights_path, rnn)
    else:
        print("⚠️ No local weights. You can `!wget` them into", weights_path)

rnn.eval()

val_pred = eval_split(rnn, Xva, Mva, yva)
test_pred = eval_split(rnn, Xte, Mte, y_te)

print("VAL pred counts:", np.bincount(val_pred, minlength=num_classes))
print("VAL true counts:", np.bincount(yva, minlength=num_classes))
print("TEST pred counts:", np.bincount(test_pred, minlength=num_classes))
print("TEST true counts:", np.bincount(y_te, minlength=num_classes))

print("\nValidation report:\n" + classification_report(yva, val_pred, id2label))
print("\nTest Report:\n" + classification_report(y_te, test_pred, id2label))

Class counts: [4659 5630 8429 6556 9709]
Class weights: [1.4002557  1.1587551  0.7739697  0.9950871  0.67193234]
  batch  547/547 | avg_loss=1.3948 | grad_norm=1.281e+00 | 82.3s
end-epoch grad_norm=1.281e+00

end-epoch grad_norm=1.281e+00
Epoch 01 | loss=1.3948 | val_macro_f1=0.2744 | 82.3s
  batch    1/547 | avg_loss=1.3462 | grad_norm=6.203e+00 | 0.1sEpoch 01 | loss=1.3948 | val_macro_f1=0.2744 | 82.3s
  batch  547/547 | avg_loss=1.2630 | grad_norm=1.707e+00 | 101.3s
end-epoch grad_norm=1.707e+00

end-epoch grad_norm=1.707e+00
Epoch 02 | loss=1.2630 | val_macro_f1=0.3219 | 101.3s
Epoch 02 | loss=1.2630 | val_macro_f1=0.3219 | 101.3s
  batch  547/547 | avg_loss=1.1549 | grad_norm=7.296e+00 | 79.4s
end-epoch grad_norm=7.296e+00

end-epoch grad_norm=7.296e+00
Epoch 03 | loss=1.1549 | val_macro_f1=0.4322 | 79.4s
  batch    1/547 | avg_loss=0.7798 | grad_norm=4.222e+00 | 0.2sEpoch 03 | loss=1.1549 | val_macro_f1=0.4322 | 79.4s
  batch  547/547 | avg_loss=0.9263 | grad_norm=6.848e+00 | 133

In [29]:
EPOCHS = 3

# LSTM model training and evaluation pipeline
lstm = LSTMClassifier(
    vocab_size=vocab_size,
    num_classes=num_classes,
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT_P,
)

if TRAIN_MODE:
    lstm = train_lstm_model(
        lstm,
        (Xtr, Mtr, ytr),
        (Xva, Mva, yva),
        num_classes=num_classes,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        lr=LR,
        weight_decay=WEIGHT_DECAY,
        early_patience=EARLY_STOP,
        max_grad_norm=GRAD_CLIP,
        id2label=id2label,
    )
    save_lstm_artifacts(LSTM_OUT_DIR, tok, label2id, config, lstm)
else:
    weights_path = os.path.join(LSTM_OUT_DIR, "weights_lstm_best.npz")
    if os.path.exists(weights_path):
        load_lstm_weights(weights_path, lstm)
        print(f"Loaded LSTM weights from {weights_path}")
    else:
        print("⚠️ No local LSTM weights found. Training required or download weights.")

lstm.eval()

print("Evaluating LSTM classifier...")
val_pred_lstm = eval_split(lstm, Xva, Mva, yva)
test_pred_lstm = eval_split(lstm, Xte, Mte, y_te)

print("\n=== LSTM Results ===")
print("VAL pred counts:", np.bincount(val_pred_lstm, minlength=num_classes))
print("VAL true counts:", np.bincount(yva, minlength=num_classes))
print("TEST pred counts:", np.bincount(test_pred_lstm, minlength=num_classes))
print("TEST true counts:", np.bincount(y_te, minlength=num_classes))

print(
    "\nLSTM Validation Report:\n" + classification_report(yva, val_pred_lstm, id2label)
)
print("\nLSTM Test Report:\n" + classification_report(y_te, test_pred_lstm, id2label))

Class counts: [4659 5630 8429 6556 9709]
Class weights: [1.4002557  1.1587551  0.7739697  0.9950871  0.67193234]
  batch  547/547 | avg_loss=1.1504 | grad_norm=1.639e+00 | 165.2s
end-epoch grad_norm=1.639e+00

end-epoch grad_norm=1.639e+00
Epoch 01 | loss=1.1504 | val_macro_f1=0.6022 | 165.2s
  batch    1/547 | avg_loss=0.8132 | grad_norm=1.317e+00 | 0.2sEpoch 01 | loss=1.1504 | val_macro_f1=0.6022 | 165.2s
  batch  547/547 | avg_loss=0.7671 | grad_norm=2.132e+00 | 204.4s
end-epoch grad_norm=2.132e+00

end-epoch grad_norm=2.132e+00
Epoch 02 | loss=0.7671 | val_macro_f1=0.6409 | 204.4s
Epoch 02 | loss=0.7671 | val_macro_f1=0.6409 | 204.4s
  batch  547/547 | avg_loss=0.6533 | grad_norm=1.978e+00 | 326.8s
end-epoch grad_norm=1.978e+00

end-epoch grad_norm=1.978e+00
Epoch 03 | loss=0.6533 | val_macro_f1=0.6906 | 326.9s
Epoch 03 | loss=0.6533 | val_macro_f1=0.6906 | 326.9s

Validation report:
Macro-F1=0.6906
Extremely Negative: P=0.627 R=0.775 F1=0.693
Extremely Positive: P=0.755 R=0.648 F1

In [27]:
# Transformer model training and evaluation pipeline
config["num_layers"] = NUM_LAYERS
config["num_heads"] = NUM_HEADS
config["dim_feedforward"] = DIM_FEEDFORWARD
EPOCHS = 2

transformer = TransformerClassifier(
    vocab_size=vocab_size,
    num_classes=num_classes,
    max_len=MAX_LEN,
    emb_dim=EMB_DIM,
    num_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    dim_feedforward=DIM_FEEDFORWARD,
    dropout_p=DROPOUT_P,
)

if TRAIN_MODE:
    print("🚀 Starting Transformer model training...")
    transformer = train_transformer_model(
        transformer,
        (Xtr, Mtr, ytr),
        (Xva, Mva, yva),
        num_classes=num_classes,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        lr=LR,
        weight_decay=WEIGHT_DECAY,
        early_patience=EARLY_STOP,
        max_grad_norm=GRAD_CLIP,
        id2label=id2label,
    )
    save_transformer_artifacts(TRANSFORMERS_OUT_DIR, tok, label2id, config, transformer)
else:
    weights_path = os.path.join(TRANSFORMERS_OUT_DIR, "weights_transformer_best.npz")
    if os.path.exists(weights_path):
        load_transformer_weights(weights_path, transformer)
    else:
        print(f"⚠️ No local Transformer weights found at {weights_path}.")

transformer.eval()

print("\nEvaluating Transformer classifier...")
val_pred_transformer = eval_split(transformer, Xva, Mva, yva)
test_pred_transformer = eval_split(transformer, Xte, Mte, y_te)

print("\n" + "=" * 20 + " TRANSFORMER RESULTS " + "=" * 20)
print(
    "\nValidation Report (Transformer):\n"
    + classification_report(yva, val_pred_transformer, id2label)
)
print(
    "\nTest Report (Transformer):\n"
    + classification_report(y_te, test_pred_transformer, id2label)
)

🚀 Starting Transformer model training...
Class counts: [4659 5630 8429 6556 9709]
Class weights: [1.4002557  1.1587551  0.7739697  0.9950871  0.67193234]
  batch  547/547 | avg_loss=1.3452 | grad_norm=1.475e+00 | 326.4s
end-epoch grad_norm=1.475e+00

end-epoch grad_norm=1.475e+00
Epoch 01 | loss=1.3452 | val_macro_f1=0.5283 | 326.4s
Epoch 01 | loss=1.3452 | val_macro_f1=0.5283 | 326.4s
  batch  547/547 | avg_loss=0.9588 | grad_norm=1.257e+00 | 319.3s
end-epoch grad_norm=1.257e+00

end-epoch grad_norm=1.257e+00
Epoch 02 | loss=0.9588 | val_macro_f1=0.6365 | 319.3s
Epoch 02 | loss=0.9588 | val_macro_f1=0.6365 | 319.3s

Validation report:
Macro-F1=0.6365
Extremely Negative: P=0.488 R=0.751 F1=0.592
Extremely Positive: P=0.739 R=0.553 F1=0.633
Negative: P=0.684 R=0.421 F1=0.521
Neutral: P=0.726 R=0.872 F1=0.793
Positive: P=0.619 R=0.673 F1=0.644
✅ Saved Transformer artifacts to transformers_artifacts/

Evaluating Transformer classifier...

Validation report:
Macro-F1=0.6365
Extremely Negat

In [30]:
# Model comparison and sample inference demonstration
def extract_macro(s):
    try:
        line = s.splitlines()[0]
        return float(line.split("=")[1])
    except:
        return 0.0


def softmax(logits):
    if logits.ndim == 1:
        exp_logits = np.exp(logits - np.max(logits))
        return exp_logits / np.sum(exp_logits)
    else:
        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)


def run_inference_showcase():
    print("=" * 60)
    print("Model Inference Showcase")
    print("=" * 60)

    available_models = {}
    model_candidates = {"RNN": "rnn", "LSTM": "lstm", "Transformer": "transformer"}

    for name, var_name in model_candidates.items():
        if var_name in globals():
            model_instance = globals()[var_name]
            model_instance.eval()
            available_models[name] = model_instance

    if not available_models:
        print("\nERROR: No trained models (rnn, lstm, transformer) found.")
        print("Please ensure the models are trained and available in the global scope.")
        return

    print(
        f"\nFound {len(available_models)} models for evaluation: {', '.join(available_models.keys())}"
    )

    print("\n" + "-" * 60)
    print("Performance Comparison (Test Set Macro F1-Score)")
    print("-" * 60)

    scores = {}
    for name, model in available_models.items():
        test_pred = eval_split(model, Xte, Mte, y_te)
        test_report = classification_report(y_te, test_pred, id2label)
        test_f1 = extract_macro(test_report)
        scores[name] = test_f1
        print(f"{name:<12} | Macro F1: {test_f1:.4f}")

    print("\n" + "-" * 60)
    print("Sample Inference on Two Random Test Examples")
    print("-" * 60)

    if len(te_texts) < 2:
        print("\nERROR: Not enough text samples in the test set to compare.")
        return

    sample_indices = random.sample(range(len(te_texts)), 2)

    for i, idx in enumerate(sample_indices):
        text = te_texts[idx]
        true_label = id2label[y_te[idx]]

        print(f"\n--- Sample {i + 1} (Index: {idx}) ---")
        print(f"Text:       '{text[:100]}{'...' if len(text) > 100 else ''}'")
        print(f"True Label: {true_label}")
        print("Predictions:")

        tokens = tok.encode(text)
        x_single = np.array([tokens[:MAX_LEN]], dtype=np.int32)
        m_single = np.ones_like(x_single, dtype=np.float32)
        m_single[x_single == PAD] = 0.0

        for name, model in available_models.items():
            logits = model.forward(x_single, m_single)[0]
            probabilities = softmax(logits)
            predicted_class_idx = np.argmax(probabilities)
            predicted_label = id2label[predicted_class_idx]
            confidence = probabilities[predicted_class_idx]

            status = "Correct" if predicted_label == true_label else "Incorrect"
            print(
                f"  - {name:<12} -> {predicted_label:<15} (Conf: {confidence:.2f}) [{status}]"
            )

    print("\n" + "=" * 60)
    print("Showcase complete.")
    print("=" * 60)


run_inference_showcase()

Model Inference Showcase

Found 3 models for evaluation: RNN, LSTM, Transformer

------------------------------------------------------------
Performance Comparison (Test Set Macro F1-Score)
------------------------------------------------------------
RNN          | Macro F1: 0.5289
RNN          | Macro F1: 0.5289
LSTM         | Macro F1: 0.6562
LSTM         | Macro F1: 0.6562
Transformer  | Macro F1: 0.5892

------------------------------------------------------------
Sample Inference on Two Random Test Examples
------------------------------------------------------------

--- Sample 1 (Index: 2619) ---
Text:       'COVID-19.....NO ANXIETY AND NO PANIC DO NOT FEAR..... FOLLOW PHYSICAL HYGIENE.....EAT HOT  COOKED FO...'
True Label: Extremely Negative
Predictions:
  - RNN          -> Extremely Negative (Conf: 0.90) [Correct]
  - LSTM         -> Extremely Negative (Conf: 0.98) [Correct]
  - Transformer  -> Extremely Negative (Conf: 0.80) [Correct]

--- Sample 2 (Index: 456) ---
Text:    