In [None]:
from datetime import datetime
from pathlib import Path

# Root folder for all evaluation data; the two input folders sit directly
# beneath it.
base_dir = Path("data")
archives_dir = base_dir.joinpath("archives")
ground_truth_dir = base_dir.joinpath("ground_truth")

# Every run gets its own output folder under data/eval/, named after the
# current local time at one-second resolution.
timestamp = f"{datetime.now():%Y%m%d-%H%M%S}"
working_dir = base_dir.joinpath("eval", timestamp)

# Create all three folders (and any missing parents); already-existing
# folders are left untouched.
for d in (archives_dir, ground_truth_dir, working_dir):
    d.mkdir(parents=True, exist_ok=True)

In [None]:
from src.soda_curation.eval import run_and_score_tasks, Task, Strategy

# Evaluation settings; all three are forwarded unchanged to
# run_and_score_tasks below.
n_repeats = 1
tasks = [Task.FIGURE_LEGEND, Task.FIGURE_CAPTIONS]
strategies = [
    # Strategy.CLAUDE,  # left disabled; uncomment to include it in the run
    Strategy.OPENAI,
    Strategy.REGEX,
]

# Run the evaluation against the archive / ground-truth folders, using the
# per-run working directory as the third positional argument.
results = run_and_score_tasks(
    archives_dir,
    ground_truth_dir,
    working_dir,
    n_repeats=n_repeats,
    tasks=tasks,
    strategies=strategies,
)

# Cell output: number of result records returned.
len(results)

In [None]:
import pandas as pd

# Flatten every result record into one table row: the identifying fields are
# copied from the record itself, the four metrics from its nested "score"
# mapping. Column order follows the order of the two tuples below.
df = pd.DataFrame(
    [
        {
            **{field: r[field] for field in ("task", "strategy", "msid", "run_id")},
            **{metric: r["score"][metric] for metric in ("accuracy", "precision", "recall", "f1")},
        }
        for r in results
    ]
)
df

In [None]:
def _diffs_to_md_table(diffs):
    md_table = [
        "| Type | Ground Truth | Extracted |",
        "| --- | --- | --- |",
    ]
    for diff in diffs:
        a = diff["a"].replace("\n", "\\n").replace("|", "\|")
        b = diff["b"].replace("\n", "\\n").replace("|", "\|")
        type = diff["type"]
        md_table.append(f"| {type} | {a} | {b} |")
    return md_table

def to_md(results, summary_df=None):
    """Render evaluation results as a Markdown report.

    The report has a "Summary" section with two descriptive-statistics tables
    (grouped by task/strategy, and by task/strategy/manuscript), followed by
    one section per manuscript listing each result's scores and diff table.

    Args:
        results: Iterable of result records. Each record is read via the keys
            "task", "strategy", "msid" and "score"; "score" in turn provides
            "f1", "precision", "recall", "accuracy", "tp", "fp", "fn" and
            "diff".
        summary_df: Optional DataFrame with "task", "strategy", "msid",
            "precision", "recall", "f1" and "accuracy" columns to summarize.
            When omitted, the table is built from ``results`` so the summary
            always describes the records actually passed in, rather than
            whatever frame happens to live in the notebook namespace.

    Returns:
        The report as a single Markdown string.
    """
    metric_cols = ["precision", "recall", "f1", "accuracy"]
    blocks = ["# Evaluation Results", "## Summary"]

    if summary_df is None:
        # Explicit columns keep the frame well-formed even with no records.
        summary_df = pd.DataFrame(
            [
                {
                    "task": r["task"],
                    "strategy": r["strategy"],
                    "msid": r["msid"],
                    **{metric: r["score"][metric] for metric in metric_cols},
                }
                for r in results
            ],
            columns=["task", "strategy", "msid", *metric_cols],
        )

    if summary_df.empty:
        # Nothing to group; say so instead of failing inside groupby/describe.
        blocks.append("_No results to summarize._")
    else:
        for group_keys in (["task", "strategy"], ["task", "strategy", "msid"]):
            blocks.append(
                summary_df.groupby(group_keys).describe()[metric_cols].T.to_markdown()
            )

    # Group records by manuscript, keeping first-seen manuscript order.
    results_by_msid = {}
    for r in results:
        results_by_msid.setdefault(r["msid"], []).append(r)

    for msid, ms_results in results_by_msid.items():
        blocks.append(f"## Manuscript: {msid}")
        for result in ms_results:
            score = result["score"]
            blocks.append(f"### Task: {result['task']}, Strategy: {result['strategy']}")
            blocks.append(f"**F1 Score**: {score['f1']:.2f}")
            blocks.append(f"**Precision**: {score['precision']:.2f}")
            blocks.append(f"**Recall**: {score['recall']:.2f}")
            blocks.append(f"**Accuracy**: {score['accuracy']:.2f}")
            blocks.append(f"**TP**: {score['tp']}")
            blocks.append(f"**FP**: {score['fp']}")
            blocks.append(f"**FN**: {score['fn']}")
            blocks.append("**Diff**:")
            # Table rows must stay on consecutive lines, so the table is
            # joined into a single block before the blank-line join below.
            blocks.append("\n".join(_diffs_to_md_table(score["diff"])))

    # Blank lines between blocks: without them, back-to-back tables run into
    # each other and the metric lines collapse into one paragraph.
    return "\n\n".join(blocks)

In [None]:
import json

# Persist the raw results and the rendered Markdown report into this run's
# working directory. The encoding is pinned to UTF-8 so that non-ASCII text in
# the report is written the same way on every platform instead of depending on
# the locale's default encoding (which can fail on such characters).
with open(working_dir / "results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, sort_keys=True)

with open(working_dir / "results.md", "w", encoding="utf-8") as f:
    f.write(to_md(results))

In [None]:
# Cell output: descriptive statistics of precision and recall for every
# (task, strategy, manuscript) combination.
(
    df
    .groupby(["task", "strategy", "msid"])
    .describe()
    [["precision", "recall"]]
)