# Table Structure Evaluation – Sentence-Transformers

Encode the ground-truth and generated tables as sentence embeddings, then train a regressor (Ridge baseline) on the combined embeddings to predict each pair's similarity score.

- Dataset: `rayhu/table-extraction-evaluation` ([Hugging Face dataset](https://huggingface.co/datasets/rayhu/table-extraction-evaluation))
- Embedding: `sentence-transformers` models (configurable)


In [None]:
import os
import sys

# Show which Python executable is running this kernel.
print(sys.executable)

# Hugging Face settings, exported before the HF libraries are imported in later cells.
os.environ.update({
    "HF_HUB_DISABLE_PROGRESS_BARS": "1",
    "HF_DATASETS_DISABLE_PROGRESS_BARS": "1",
    "HF_HUB_ENABLE_HF_TRANSFER": "1",
})

In [2]:
# Config
from datasets import load_dataset
import random

import numpy as np
import torch

# Seed shared by the Python, NumPy and torch RNGs and by the dataset split.
SEED = 42
# Share of the dataset's "test" split that is kept for validation.
VAL_FRAC = 0.5
# Optional cap on the train split (val/test get LIMIT//5, at least 1); None keeps everything.
LIMIT = None
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# How the two embeddings become features.
COMBINE = "concat-diff-prod"  # "concat" | "concat-diff-prod"

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x120ca4a30>

In [3]:
# Load dataset and split
ds = load_dataset("rayhu/table-extraction-evaluation")
train_ds = ds["train"]

# The dataset's "test" split is divided again: one part for validation,
# the remainder as the held-out test set.
split = ds["test"].train_test_split(test_size=1-VAL_FRAC, seed=SEED)
val_ds, test_ds = split["train"], split["test"]

if LIMIT is not None:
    train_ds = train_ds.select(range(min(LIMIT, len(train_ds))))
    # Evaluation splits get a fifth of the cap, but never fewer than one row.
    val_ds, test_ds = (
        part.select(range(min(max(1, LIMIT//5), len(part))))
        for part in (val_ds, test_ds)
    )

print("train:", len(train_ds), "val:", len(val_ds), "test:", len(test_ds))


train: 11971 val: 1500 test: 1500


In [4]:
# Text rendering

def render_structure_string(cells, include_text=True):
    """Flatten a table's cells into a single ``" | "``-separated string.

    Each cell is rendered as ``r{start_row}c{start_col}-r{end_row}c{end_col}``.
    When ``include_text`` is true, a colon and the first five items of the
    cell's ``content`` (joined by single spaces) are appended.

    Args:
        cells: Iterable of mapping-like cells exposing ``.get``. ``None`` is
            treated as an empty table.
        include_text: Whether to append the cell content after the position.

    Returns:
        The rendered string, covering at most the first 512 cells; ``""``
        when there are no cells.
    """
    max_cells = 512  # cap on the number of cells that end up in the string
    max_items = 5    # cap on the number of content items rendered per cell

    def coord(cell, key):
        # A key that is absent or explicitly None falls back to 0, so the
        # rendered position never contains the literal text "None".
        value = cell.get(key, 0)
        return 0 if value is None else value

    if cells is None:
        return ""

    parts = []
    for index, c in enumerate(cells):
        # Stop early: anything past the cap would be discarded anyway.
        if index >= max_cells:
            break
        pos = f"r{coord(c, 'start_row')}c{coord(c, 'start_col')}-r{coord(c, 'end_row')}c{coord(c, 'end_col')}"
        if include_text:
            items = c.get("content")
            if items is None:
                # Key present but null: render as empty content, do not fail.
                items = []
            # str() keeps the join from raising on non-string items.
            content = " ".join(str(item) for item in items[:max_items])
            parts.append(f"{pos}:{content}")
        else:
            parts.append(pos)
    return " | ".join(parts)


def to_pair_text(example):
    """Turn one dataset row into ``(ground_truth_text, generated_text, score)``.

    Both tables are rendered with cell text included; the score is the row's
    ``similarity_score`` converted to ``float``.
    """
    reference_text, candidate_text = (
        render_structure_string(example[side]["cells"], True)
        for side in ("ground_truth", "generated")
    )
    target = float(example["similarity_score"])
    return reference_text, candidate_text, target


In [None]:
# Encode with Sentence-Transformers
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math

# Instantiate the sentence-embedding model named by MODEL_NAME once;
# build_embeddings reads this module-level `model` when encoding.
model = SentenceTransformer(MODEL_NAME)


def build_embeddings(split_ds):
    """Encode every row of ``split_ds`` and assemble regression inputs.

    Each row is rendered to a (ground-truth text, generated text, score)
    triple, and both texts are embedded with the module-level ``model``.
    With ``COMBINE == "concat"`` the features are the two embeddings side by
    side; any other value also appends their absolute difference and their
    element-wise product.

    Returns:
        ``(X, y)``: the feature matrix and a float32 array of scores.
    """
    triples = [to_pair_text(row) for row in split_ds]
    reference_texts = [triple[0] for triple in triples]
    candidate_texts = [triple[1] for triple in triples]
    targets = [triple[2] for triple in triples]

    # Same encoder settings for both sides of the pair.
    encode_kwargs = dict(batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    ref_vecs = model.encode(reference_texts, **encode_kwargs)
    cand_vecs = model.encode(candidate_texts, **encode_kwargs)

    feature_blocks = [ref_vecs, cand_vecs]
    if COMBINE != "concat":
        feature_blocks.append(np.abs(ref_vecs - cand_vecs))
        feature_blocks.append(ref_vecs * cand_vecs)

    features = np.concatenate(feature_blocks, axis=1)
    return features, np.array(targets, dtype=np.float32)

# Embed the three splits in train, val, test order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    build_embeddings(part) for part in (train_ds, val_ds, test_ds)
)

print(X_train.shape, X_val.shape, X_test.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  [2m2025-10-31T06:59:25.000920Z[0m [31mERROR[0m  [31mPython exception updating progress:, error: PyErr { type: <class 'LookupError'>, value: LookupError(<ContextVar name='shell_parent' at 0x1077004a0>), traceback: Some(<traceback object at 0x157493700>) }, [1;31mcaller[0m[31m: "src/progress_update.rs:313"[0m
    [2;3mat[0m /Users/runner/work/xet-core/xet-core/error_printer/src/lib.rs:28



In [None]:
# Train regressors (Ridge baseline)
reg = Ridge(alpha=1.0).fit(X_train, y_train)

# Validation metrics
val_pred = reg.predict(X_val)
val_mae = mean_absolute_error(y_val, val_pred)
val_rmse = math.sqrt(mean_squared_error(y_val, val_pred))
print({"val_RMSE": val_rmse, "val_MAE": val_mae})

# Held-out test metrics; rmse / mae / r2 are the values written out by the save cell.
test_pred = reg.predict(X_test)
r2 = r2_score(y_test, test_pred)
mae = mean_absolute_error(y_test, test_pred)
rmse = math.sqrt(mean_squared_error(y_test, test_pred))
print({"RMSE": rmse, "MAE": mae, "R2": r2})


In [None]:
# Save artifacts
# `json` must be imported here: no other cell in the notebook imports it, and
# without it the json.dump calls below raise NameError after the regressor
# file has already been written.
import json
import os, time, joblib
from pathlib import Path

# One directory per run, named after the current Unix time in whole seconds.
run_dir = Path("experiments") / f"run_st_{int(time.time())}"
run_dir.mkdir(parents=True, exist_ok=True)

# Fitted regressor.
joblib.dump(reg, run_dir / "regressor.joblib")

# Test-set metrics, cast to plain floats so they are JSON-serialisable.
with open(run_dir / "metrics.json", "w") as f:
    json.dump({"RMSE": float(rmse), "MAE": float(mae), "R2": float(r2)}, f)

# The settings that produced this run.
with open(run_dir / "config.json", "w") as f:
    json.dump({
        "MODEL_NAME": MODEL_NAME,
        "COMBINE": COMBINE,
        "SEED": SEED
    }, f, indent=2)

print("Saved to:", run_dir)
