# Table Structure Evaluation – PyTorch MLP

This notebook trains a regression model that predicts the similarity score (0–1) for a generated table JSON vs ground truth.

- Dataset: `rayhu/table-extraction-evaluation` ([Hugging Face dataset](https://huggingface.co/datasets/rayhu/table-extraction-evaluation))
- Model: PyTorch MLP
- Representation modes:
  - structured: numeric features from JSON structure
  - embed: optional lightweight text features built by hashing whitespace-separated tokens of the rendered table strings into a fixed 512-dimensional count vector (no HF backbone required for the baseline)
  - hybrid: concatenate structured + embed



In [14]:
# Print the path of the Python interpreter running this kernel, so the reader
# can confirm which environment the notebook is executing in.
import sys
print(sys.executable)

/Users/rayhu/play/ai/cs230-evaluation-model/.venv/bin/python3.13


In [15]:
# Config
# SEED seeds Python/NumPy/PyTorch RNGs and the val/test split.
SEED = 42
VAL_FRAC = 0.5  # fraction of the original test split to use as validation (rest is test)
# When LIMIT is set, train is capped at LIMIT rows and val/test at max(1, LIMIT // 5) rows each.
LIMIT = None     # e.g., 2000 for quick smoke tests, or None for full
# Selects which feature representation build_features produces.
REPRESENTATION_MODE = "structured"  # "structured" | "embed" | "hybrid"

# MLP defaults
# One hidden Linear layer is created per entry; each entry is that layer's width.
HIDDEN_SIZES = [256, 256]
# Dropout probability applied after every hidden activation.
DROPOUT = 0.1
# LEARNING_RATE and WEIGHT_DECAY are passed to the AdamW optimizer.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-3
# Mini-batch size used by the train/val/test DataLoaders.
BATCH_SIZE = 128
# Maximum number of training epochs.
EPOCHS = 10
# Training stops after this many consecutive epochs without a validation RMSE
# improvement of more than 1e-4.
EARLY_STOP_PATIENCE = 3


In [16]:
# Config
# NOTE(review): this cell is an exact duplicate of the preceding Config cell.
# Keep a single config cell so that an edit to one copy cannot be silently
# overridden by the other; this copy should be deleted.
SEED = 42
VAL_FRAC = 0.5  # fraction of the original test split to use as validation (rest is test)
LIMIT = None     # e.g., 2000 for quick smoke tests, or None for full
REPRESENTATION_MODE = "structured"  # "structured" | "embed" | "hybrid"

# MLP defaults
HIDDEN_SIZES = [256, 256]
DROPOUT = 0.1
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-3
BATCH_SIZE = 128
EPOCHS = 10
EARLY_STOP_PATIENCE = 3


In [17]:
# Imports
import json
import math
import random
import zlib
from dataclasses import dataclass
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datasets import load_dataset

# Seed every RNG the notebook relies on (Python, NumPy, PyTorch) with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Prefer CUDA, then Apple MPS, and fall back to CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)
print("Device:", device)


Device: mps


In [18]:
# Data loading and split from Hugging Face

# VAL_FRAC must leave rows on both sides of the val/test split.
if not 0.0 < VAL_FRAC < 1.0:
    raise ValueError(f"VAL_FRAC must be strictly between 0 and 1, got {VAL_FRAC!r}")

ds = load_dataset("rayhu/table-extraction-evaluation")
train_ds = ds["train"]
# Split original test into val/test: the "train" side of this split becomes the
# validation set (VAL_FRAC of the rows) and the "test" side stays the held-out test set.
split = ds["test"].train_test_split(test_size=1 - VAL_FRAC, seed=SEED)
val_ds = split["train"]
test_ds = split["test"]

# Optional LIMIT for quick runs: cap train at LIMIT rows and val/test at
# LIMIT // 5 rows (at least 1), never exceeding the rows actually available.
if LIMIT is not None:
    train_ds = train_ds.select(range(min(LIMIT, len(train_ds))))
    val_ds = val_ds.select(range(min(max(1, LIMIT//5), len(val_ds))))
    test_ds = test_ds.select(range(min(max(1, LIMIT//5), len(test_ds))))

print("train:", len(train_ds), "val:", len(val_ds), "test:", len(test_ds))


train: 11971 val: 1500 test: 1500


In [19]:
# Feature engineering

def table_dims(cells: List[Dict[str, Any]]) -> Tuple[int, int]:
    """Return the table size as ``(n_rows, n_cols)``.

    The size is derived from the largest ``end_row`` / ``end_col`` index found
    among ``cells`` (missing keys count as 0, negative indices are floored at 0),
    plus one because the indices are treated as zero-based. An empty cell list
    yields ``(0, 0)``.
    """
    if not cells:
        return 0, 0
    # The leading 0 reproduces the lower bound of the running maximum.
    last_row = max(0, *(int(cell.get("end_row", 0)) for cell in cells))
    last_col = max(0, *(int(cell.get("end_col", 0)) for cell in cells))
    return last_row + 1, last_col + 1


def table_stats(cells: List[Dict[str, Any]]) -> Dict[str, float]:
    """Summarise a table's structure as a small dictionary of numbers.

    Returns the keys ``rows`` and ``cols`` (from ``table_dims``), ``num_cells``
    (number of cell records), and ``avg_span`` / ``max_span``: the mean and the
    maximum cell area, where a cell's area is its inclusive row extent times its
    inclusive column extent. Both span statistics are 0.0 for an empty table.
    """
    n_rows, n_cols = table_dims(cells)
    # Area covered by each cell: (rows spanned) * (columns spanned), indices inclusive.
    areas = [
        (int(cell.get("end_row", 0)) - int(cell.get("start_row", 0)) + 1)
        * (int(cell.get("end_col", 0)) - int(cell.get("start_col", 0)) + 1)
        for cell in cells
    ]
    if areas:
        mean_area = float(np.mean(areas))
        largest_area = float(np.max(areas))
    else:
        mean_area = largest_area = 0.0
    return {
        "rows": n_rows,
        "cols": n_cols,
        "num_cells": float(len(cells)),
        "avg_span": mean_area,
        "max_span": largest_area,
    }


def build_structured_features(example: Dict[str, Any]) -> Tuple[np.ndarray, List[str]]:
    """Turn one example into a flat vector of structural comparison features.

    For each statistic returned by ``table_stats`` four values are emitted, in
    this order: the ground-truth value (``gt_*``), the generated value
    (``pred_*``), their difference generated minus ground truth (``delta_*``),
    and their ratio generated / ground truth (``ratio_*``, 0.0 when the
    ground-truth value is zero).

    Returns:
        A ``float32`` array of the feature values and the matching list of
        feature names (same order).
    """
    truth_cells = example["ground_truth"]["cells"]
    generated_cells = example["generated"]["cells"]
    truth = table_stats(truth_cells)
    generated = table_stats(generated_cells)

    names: List[str] = []
    values = []
    for stat in ("rows", "cols", "num_cells", "avg_span", "max_span"):
        t_val, g_val = truth[stat], generated[stat]
        # Guard the division: an empty ground-truth table has all-zero stats.
        ratio = 0.0 if t_val == 0 else g_val / t_val
        names += [f"gt_{stat}", f"pred_{stat}", f"delta_{stat}", f"ratio_{stat}"]
        values += [t_val, g_val, g_val - t_val, ratio]
    return np.array(values, dtype=np.float32), names


def render_structure_string(
    cells: List[Dict[str, Any]],
    include_text: bool = True,
    max_cells: int = 512,
    max_tokens: int = 5,
) -> str:
    """Render a table's cells as a single ``" | "``-separated string.

    Each cell becomes ``r{start_row}c{start_col}-r{end_row}c{end_col}`` (missing
    indices render as 0). With ``include_text`` the position is followed by
    ``":"`` and the first ``max_tokens`` entries of the cell's ``content`` list
    joined by spaces.

    Args:
        cells: Cell records; only the first ``max_cells`` are rendered.
        include_text: Whether to append the (truncated) cell content.
        max_cells: Maximum number of cells in the output (default 512).
        max_tokens: Maximum number of content entries kept per cell (default 5).

    Returns:
        The rendered string; ``""`` for an empty cell list.
    """
    parts = []
    # Truncate up front so cells that would be discarded are never rendered.
    for c in cells[:max_cells]:
        pos = f"r{c.get('start_row',0)}c{c.get('start_col',0)}-r{c.get('end_row',0)}c{c.get('end_col',0)}"
        if include_text:
            # `or []` also covers an explicit None content, not just a missing key.
            tokens = c.get("content") or []
            content = " ".join(tokens[:max_tokens])  # cap for length
            parts.append(f"{pos}:{content}")
        else:
            parts.append(pos)
    return " | ".join(parts)


In [20]:
# Build features matrices

def build_features(split_ds, mode=None, hash_dims=512):
    """Build the feature matrix and target vector for one dataset split.

    Args:
        split_ds: Iterable of examples exposing ``ground_truth``/``generated``
            cell lists and a ``similarity_score``.
        mode: ``"structured"``, ``"embed"`` or ``"hybrid"``. Defaults to the
            notebook-level ``REPRESENTATION_MODE``.
        hash_dims: Width of the hashed token-count vector used by the
            ``"embed"`` and ``"hybrid"`` modes.

    Returns:
        ``(X, y)`` where ``X`` stacks one feature row per example and ``y`` is a
        ``float32`` array of similarity scores.

    Raises:
        ValueError: If ``mode`` is not one of the supported representations
            (``np.stack`` also raises ``ValueError`` for an empty split).
    """
    mode = REPRESENTATION_MODE if mode is None else mode
    if mode not in ("structured", "embed", "hybrid"):
        raise ValueError(f"Unknown representation mode: {mode!r}")

    Xs = []
    y = []
    for ex in split_ds:
        parts = []
        if mode != "embed":
            x_struct, _ = build_structured_features(ex)
            parts.append(x_struct)
        if mode != "structured":
            # Very light bag-of-tokens: hash every whitespace token of both
            # rendered tables into one fixed-size count vector.
            vec = np.zeros(hash_dims, dtype=np.float32)
            for cells in (ex["ground_truth"]["cells"], ex["generated"]["cells"]):
                for tok in render_structure_string(cells, include_text=True).split():
                    # crc32 is stable across processes; the built-in hash() of a
                    # str is salted per interpreter run, which would make the
                    # features (and any saved scaler/model) non-reproducible.
                    vec[zlib.crc32(tok.encode("utf-8")) % hash_dims] += 1.0
            parts.append(vec)
        Xs.append(parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0))
        y.append(float(ex["similarity_score"]))
    X = np.stack(Xs)
    y = np.array(y, dtype=np.float32)
    return X, y

# Featurize the three splits in train / val / test order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    build_features(part) for part in (train_ds, val_ds, test_ds)
)

# Standardize using statistics estimated on the training split only, then
# apply the same transform to every split.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(mat) for mat in (X_train, X_val, X_test))

X_train.shape, X_val.shape, X_test.shape


((11971, 20), (1500, 20), (1500, 20))

In [21]:
# Dataset and DataLoader

class NpDataset(Dataset):
    """Map-style dataset over a NumPy feature matrix and target vector.

    Both arrays are converted to ``float32`` tensors once at construction;
    indexing returns the ``(features, target)`` pair at that position.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = torch.from_numpy(X).to(torch.float32)
        self.y = torch.from_numpy(y).to(torch.float32)

    def __len__(self):
        # Number of samples is the size of the leading feature dimension.
        return self.X.size(0)

    def __getitem__(self, idx):
        features, target = self.X[idx], self.y[idx]
        return features, target

# Wrap each (features, targets) pair in a tensor-backed dataset.
train_ds_np, val_ds_np, test_ds_np = (
    NpDataset(features, targets)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_ds_np, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(part, shuffle=False, batch_size=BATCH_SIZE)
    for part in (val_ds_np, test_ds_np)
)


In [22]:
# Model

class MLP(nn.Module):
    """Feed-forward regressor producing one scalar per input row.

    Every hidden stage is ``Linear -> ReLU -> Dropout -> LayerNorm``; a final
    ``Linear`` maps the last hidden width (or ``in_dim`` when ``hidden`` is
    empty) to a single output.

    Args:
        in_dim: Number of input features.
        hidden: Width of each hidden layer, in order.
        dropout: Dropout probability used in every hidden stage.
    """

    def __init__(self, in_dim: int, hidden: List[int], dropout: float = 0.1):
        super().__init__()
        widths = [in_dim, *hidden]
        stages = []
        # Consecutive width pairs give the (input, output) size of each hidden stage.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend(
                [
                    nn.Linear(fan_in, fan_out),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.LayerNorm(fan_out),
                ]
            )
        stages.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        out = self.net(x)
        # Drop the trailing singleton dimension so the output is one value per row.
        return out.squeeze(-1)

# Input width comes from the feature matrix; the network is moved to the selected device.
model = MLP(in_dim=X_train.shape[1], hidden=HIDDEN_SIZES, dropout=DROPOUT).to(device)
# AdamW optimizer over all model parameters, trained against mean squared error.
opt = torch.optim.AdamW(model.parameters(), weight_decay=WEIGHT_DECAY, lr=LEARNING_RATE)
crit = nn.MSELoss()


In [23]:
# Training loop with early stopping

best_val = float("inf")  # lowest validation RMSE seen so far
best_state = None        # independent snapshot of the weights at that epoch
pat = 0                  # consecutive epochs without a meaningful improvement
for epoch in range(EPOCHS):
    model.train()
    tr_loss = 0.0
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        opt.zero_grad()
        pred = model(xb)
        loss = crit(pred, yb)
        loss.backward()
        opt.step()
        # Weight the batch mean by batch size so the epoch value is a per-sample mean.
        tr_loss += loss.item() * xb.size(0)
    tr_loss /= len(train_loader.dataset)

    model.eval()
    with torch.no_grad():
        val_preds = []
        val_targets = []
        for xb, yb in val_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            pred = model(xb)
            val_preds.append(pred.cpu().numpy())
            val_targets.append(yb.cpu().numpy())
    val_preds = np.concatenate(val_preds)
    val_targets = np.concatenate(val_targets)
    val_rmse = math.sqrt(mean_squared_error(val_targets, val_preds))
    print(f"Epoch {epoch+1}/{EPOCHS} - train_loss={tr_loss:.4f} val_RMSE={val_rmse:.4f}")

    # Improvements of 1e-4 or less do not reset the patience counter.
    if val_rmse < best_val - 1e-4:
        best_val = val_rmse
        # clone() is required: on a CPU device .cpu() returns the very same
        # tensor, so without a copy the "best" snapshot would keep tracking the
        # live weights and silently end up equal to the last epoch.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        pat = 0
    else:
        pat += 1
        if pat >= EARLY_STOP_PATIENCE:
            print("Early stopping.")
            break

# Load best
if best_state is None:
    # Reached only when no epoch produced a snapshot (e.g. EPOCHS == 0).
    raise RuntimeError("No best checkpoint was recorded; nothing to restore.")
model.load_state_dict(best_state)


Epoch 1/10 - train_loss=0.0571 val_RMSE=0.0606
Epoch 2/10 - train_loss=0.0079 val_RMSE=0.0557
Epoch 3/10 - train_loss=0.0050 val_RMSE=0.0501
Epoch 4/10 - train_loss=0.0038 val_RMSE=0.0520
Epoch 5/10 - train_loss=0.0030 val_RMSE=0.0435
Epoch 6/10 - train_loss=0.0026 val_RMSE=0.0387
Epoch 7/10 - train_loss=0.0022 val_RMSE=0.0397
Epoch 8/10 - train_loss=0.0020 val_RMSE=0.0436
Epoch 9/10 - train_loss=0.0019 val_RMSE=0.0346
Epoch 10/10 - train_loss=0.0017 val_RMSE=0.0387


<All keys matched successfully>

In [24]:
# Evaluation on test
model.eval()
pred_chunks, target_chunks = [], []
with torch.no_grad():
    # Gather per-batch predictions and targets as NumPy arrays on the host.
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        pred_chunks.append(pred.cpu().numpy())
        target_chunks.append(yb.cpu().numpy())

test_preds = np.concatenate(pred_chunks)
test_targets = np.concatenate(target_chunks)

# RMSE is the square root of the MSE; MAE and R2 come straight from scikit-learn.
mse = mean_squared_error(test_targets, test_preds)
rmse = math.sqrt(mse)
mae = mean_absolute_error(test_targets, test_preds)
r2 = r2_score(test_targets, test_preds)
print({"RMSE": rmse, "MAE": mae, "R2": r2})


{'RMSE': 0.03804979535151119, 'MAE': 0.029006972908973694, 'R2': 0.8095035552978516}


In [25]:
# Save artifacts
import os, time, joblib
from pathlib import Path

# One directory per run, named after the current Unix timestamp (seconds).
run_dir = Path("experiments") / f"run_mlp_{int(time.time())}"
run_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(scaler, run_dir / "scaler.joblib")
# Store CPU copies of the weights so the checkpoint does not reference the
# accelerator (CUDA/MPS) the model happened to be trained on.
torch.save({k: v.detach().cpu() for k, v in model.state_dict().items()}, run_dir / "model.pt")
with open(run_dir / "metrics.json", "w") as f:
    json.dump({"RMSE": float(rmse), "MAE": float(mae), "R2": float(r2)}, f)
with open(run_dir / "config.json", "w") as f:
    # Record every config value that influences the run, including the split
    # fraction, the row limit and the early-stopping patience.
    json.dump({
        "REPRESENTATION_MODE": REPRESENTATION_MODE,
        "HIDDEN_SIZES": HIDDEN_SIZES,
        "DROPOUT": DROPOUT,
        "LEARNING_RATE": LEARNING_RATE,
        "WEIGHT_DECAY": WEIGHT_DECAY,
        "BATCH_SIZE": BATCH_SIZE,
        "EPOCHS": EPOCHS,
        "EARLY_STOP_PATIENCE": EARLY_STOP_PATIENCE,
        "VAL_FRAC": VAL_FRAC,
        "LIMIT": LIMIT,
        "SEED": SEED
    }, f, indent=2)

print("Saved to:", run_dir)


Saved to: experiments/run_mlp_1761893738
