In [None]:
from datetime import datetime
from pathlib import Path

# Root folder under which every path used by this notebook lives.
base_dir = Path("data")

# Folders handed to the evaluation run further down in the notebook.
archives_dir = base_dir.joinpath("archives")
ground_truth_dir = base_dir.joinpath("ground_truth")

# Every execution of this cell gets its own output folder, named after the
# local wall-clock time in YYYYmmdd-HHMMSS form.
timestamp = f"{datetime.now():%Y%m%d-%H%M%S}"
working_dir = base_dir.joinpath("eval", timestamp)

# Create the folders (and any missing parents); already-existing ones are fine.
for directory in (archives_dir, ground_truth_dir, working_dir):
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
from src.soda_curation.eval import run_and_score_tasks, Task, Strategy

# Evaluation configuration. All three values are forwarded unchanged to
# run_and_score_tasks below.
# NOTE(review): n_repeats is presumably the number of runs per
# task/strategy combination — confirm against src.soda_curation.eval.
n_repeats = 1
tasks = [Task.FIGURE_LEGEND, Task.FIGURE_CAPTIONS]
strategies = [
    # Strategy.CLAUDE,  # left out of this run; uncomment to include it
    Strategy.OPENAI,
    Strategy.REGEX,
]

# Run the evaluation over the prepared folders and keep the returned results.
results = run_and_score_tasks(
    archives_dir,
    ground_truth_dir,
    working_dir,
    n_repeats=n_repeats,
    tasks=tasks,
    strategies=strategies,
)

# Quick sanity check: how many result entries came back.
len(results)

In [None]:
import json

# Persist the raw results next to the other artefacts of this run.
#
# Serialize to a string *before* opening the file: json.dump() streams chunks
# into a file that has already been created/truncated, so a value in `results`
# that is not JSON-serializable would raise midway and leave a partial,
# invalid results.json behind. With this ordering a serialization error is
# raised before anything is written to disk.
results_json = json.dumps(results, indent=2, sort_keys=True)

# Explicit encoding so the file does not depend on the platform default.
with open(working_dir / "results.json", "w", encoding="utf-8") as f:
    f.write(results_json)

In [None]:
import pandas as pd

# Flatten every result entry into one table row: the identifying fields are
# copied from the top level of the entry, the metrics from its nested
# "score" mapping. Tuple order defines the column order of the frame.
id_fields = ("task", "strategy", "msid", "run_id")
metric_fields = ("accuracy", "precision", "recall", "f1")

rows = []
for result in results:
    row = {field: result[field] for field in id_fields}
    row.update((metric, result["score"][metric]) for metric in metric_fields)
    rows.append(row)

# Build the frame once from the full list of records.
df = pd.DataFrame(rows)
df

In [None]:
# Descriptive statistics per (task, strategy, msid) group, narrowed to the
# precision and recall columns for display.
group_keys = ["task", "strategy", "msid"]
shown_metrics = ["precision", "recall"]

grouped_stats = df.groupby(group_keys).describe()
grouped_stats[shown_metrics]