# Table Structure Evaluation – Hugging Face Transformers

Fine-tune a transformer encoder for regression to predict table structure similarity.

- Dataset: `rayhu/table-extraction-evaluation` ([Hugging Face dataset](https://huggingface.co/datasets/rayhu/table-extraction-evaluation))
- Model: DistilBERT (configurable) + pooling + regression head



In [None]:
# Environment setup. Run this cell first so the variables below are in place
# before `datasets` / `transformers` / `huggingface_hub` are imported.
import importlib.util
import os
import sys

# Show which interpreter backs this kernel (helps spot a wrong virtualenv).
print(sys.executable)

# Keep download / map progress bars out of the saved notebook output.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_DATASETS_DISABLE_PROGRESS_BARS"] = "1"

# `hf_transfer` is an optional accelerator package. Requesting it while it is
# not installed makes hub downloads fail, so only opt in when it is importable
# and make sure the flag is cleared otherwise.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
else:
    os.environ.pop("HF_HUB_ENABLE_HF_TRANSFER", None)

In [2]:
# Config
# -- reproducibility / data --
SEED = 42        # seed shared by the RNGs, the dataset split and the Trainer
LIMIT = None     # optional cap on training rows (val/test get LIMIT // 5); None = no cap
VAL_FRAC = 0.5   # share of the hub "test" split that is used as validation data

# -- model / tokenization --
MODEL_NAME = "distilbert-base-uncased"
MAX_LENGTH = 512  # maximum number of tokens per encoded example

# -- optimization --
EPOCHS = 3
LR = 2e-5
TRAIN_BATCH = 16  # per-device batch size while training
EVAL_BATCH = 32   # per-device batch size while evaluating


In [3]:
# Imports
import inspect
import random

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModel, AutoConfig, Trainer, TrainingArguments,
                          DataCollatorWithPadding, AutoModelForSequenceClassification)

# Seed the Python, NumPy and PyTorch RNGs with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Preference order: CUDA, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"

device = torch.device(_device_name)
print("Device:", device)


Device: mps


In [4]:
# Load dataset and split

ds = load_dataset("rayhu/table-extraction-evaluation")

# The hub "test" split is divided in two: a VAL_FRAC share becomes the
# validation set, the remainder is kept as the final test set.
split = ds["test"].train_test_split(test_size=1 - VAL_FRAC, seed=SEED)
train_ds, val_ds, test_ds = ds["train"], split["train"], split["test"]

if LIMIT is not None:
    # Smoke-test mode: keep the first LIMIT training rows and a fifth of
    # that (at least one row) for each evaluation set.
    eval_cap = max(1, LIMIT // 5)
    train_ds = train_ds.select(range(min(LIMIT, len(train_ds))))
    val_ds = val_ds.select(range(min(eval_cap, len(val_ds))))
    test_ds = test_ds.select(range(min(eval_cap, len(test_ds))))

print("train:", len(train_ds), "val:", len(val_ds), "test:", len(test_ds))


train: 11971 val: 1500 test: 1500


In [5]:
# String rendering for structure

def render_structure_string(cells, include_text=True, max_cells=512, max_tokens=5):
    """Serialize table cells into one compact, pipe-separated string.

    Each cell is rendered as ``r{start_row}c{start_col}-r{end_row}c{end_col}``
    (missing coordinates default to 0). When ``include_text`` is true, a ``:``
    and the first ``max_tokens`` entries of the cell's ``content``, joined by
    spaces, are appended.

    Args:
        cells: Sequence of dict-like cells. ``None`` is treated as no cells.
        include_text: Whether to append the cell content after the position.
        max_cells: Maximum number of cells that are rendered.
        max_tokens: Maximum number of content entries kept per cell.

    Returns:
        The rendered cells joined with ``" | "``; an empty string when there
        are no cells.
    """
    parts = []
    # Slice before rendering: cells beyond `max_cells` never reach the output,
    # so there is no point in formatting them.
    for cell in list(cells or ())[:max_cells]:
        pos = (
            f"r{cell.get('start_row', 0)}c{cell.get('start_col', 0)}"
            f"-r{cell.get('end_row', 0)}c{cell.get('end_col', 0)}"
        )
        if include_text:
            # `content` may be present but None; treat that like "no content"
            # instead of failing on None[:n].
            tokens = cell.get("content") or []
            content = " ".join(tokens[:max_tokens])
            parts.append(f"{pos}:{content}")
        else:
            parts.append(pos)
    return " | ".join(parts)


def to_text(example):
    """Turn one dataset row into the ``text`` / ``label`` pair used for training.

    ``text`` is the rendered ground-truth structure, the literal `` [SEP] ``
    separator, then the rendered generated structure. ``label`` is the row's
    ``similarity_score`` converted to ``float``.
    """
    rendered = [
        render_structure_string(example[side]["cells"], True)
        for side in ("ground_truth", "generated")
    ]
    return {
        "text": " [SEP] ".join(rendered),
        "label": float(example["similarity_score"]),
    }

# Add the `text` / `label` columns to each split, in train / val / test order.
train_txt, val_txt, test_txt = (
    part.map(to_text) for part in (train_ds, val_ds, test_ds)
)


Map:   0%|          | 0/11971 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# Tokenization and model

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tok_fn(batch):
    """Tokenize a batch of ``text`` strings as (ground truth, generated) pairs.

    Each text is split at the first `` [SEP] `` separator written by
    ``to_text`` and the two halves are passed to the tokenizer as a sequence
    pair. With plain single-sequence truncation only the tail is cut, so a
    long ground-truth table could push the generated table out of the input
    entirely; as a pair, truncation to ``MAX_LENGTH`` trims whichever half is
    currently longer, so both tables stay represented. The tokenizer also
    inserts its own separator token, so this does not depend on the literal
    ``[SEP]`` matching the vocabulary of ``MODEL_NAME``.

    Args:
        batch: Mapping with a ``"text"`` list, as passed by a batched ``map``.

    Returns:
        The tokenizer output for the batch (unpadded; padding is left to the
        data collator).
    """
    halves = [text.partition(" [SEP] ") for text in batch["text"]]
    ground_truth = [before for before, _, _ in halves]
    generated = [after for _, _, after in halves]
    return tokenizer(ground_truth, generated, truncation=True, max_length=MAX_LENGTH)

def _tokenize_split(dataset):
    """Tokenize ``dataset`` with ``tok_fn``, keeping only model inputs and ``label``.

    Every original column except ``label`` is dropped. ``label`` has to
    survive: without it the Trainer receives no regression target and can
    neither compute a loss nor evaluation metrics.
    """
    drop = [name for name in dataset.column_names if name != "label"]
    return dataset.map(tok_fn, batched=True, remove_columns=drop)

train_tok = _tokenize_split(train_txt)
val_tok = _tokenize_split(val_txt)
test_tok = _tokenize_split(test_txt)

# A sequence-classification model with a single output acts as the regression
# head; problem_type="regression" is forwarded to the model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="regression",
    num_labels=1,
)

# Batches are padded by the collator, using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Compute MAE and RMSE for the Trainer's ``(predictions, labels)`` pair.

    Both metrics are computed directly with NumPy, so this cell needs no
    metric-script download and does not rely on an ``rmse`` module being
    available from ``evaluate``.

    Args:
        eval_pred: Pair of array-likes ``(predictions, labels)``; both are
            flattened, so ``(n, 1)`` predictions and ``(n,)`` labels line up.

    Returns:
        Dict with plain-float ``"mae"`` and ``"rmse"`` entries.
    """
    preds, labels = eval_pred
    # Work in float64 so fp16/fp32 predictions do not lose precision in the
    # squared-error accumulation.
    preds = np.asarray(preds, dtype=np.float64).reshape(-1)
    labels = np.asarray(labels, dtype=np.float64).reshape(-1)
    errors = preds - labels
    return {
        "mae": float(np.mean(np.abs(errors))),
        "rmse": float(np.sqrt(np.mean(np.square(errors)))),
    }

# Newer transformers releases name the evaluation schedule argument
# `eval_strategy`; the older `evaluation_strategy` spelling is deprecated.
# Use whichever name the installed TrainingArguments accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="experiments/hf_transformers",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    learning_rate=LR,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two schedules must match
    # for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Lower validation RMSE is better.
    metric_for_best_model="rmse",
    greater_is_better=False,
    seed=SEED,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_arg: "epoch"},
)

# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; use whichever this install accepts.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_arg: tokenizer},
)

trainer.train()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/11971 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

  [2m2025-10-31T07:00:39.685624Z[0m [31mERROR[0m  [31mPython exception updating progress:, error: PyErr { type: <class 'LookupError'>, value: LookupError(<ContextVar name='shell_parent' at 0x10b1905e0>), traceback: Some(<traceback object at 0x31aa81380>) }, [1;31mcaller[0m[31m: "src/progress_update.rs:313"[0m
    [2;3mat[0m /Users/runner/work/xet-core/xet-core/error_printer/src/lib.rs:28



In [None]:
# Evaluate and save
# Score the held-out test split with the model currently held by the trainer.
metrics = trainer.evaluate(test_tok)
print(metrics)

# Model and tokenizer files are written to the same directory.
best_dir = "experiments/hf_transformers/best_model"
trainer.save_model(best_dir)
tokenizer.save_pretrained(best_dir)
