In [1]:
import pandas as pd
from pathlib import Path


In [2]:
def extract_stats(path: Path, cols=None) -> dict:
    """Average the evaluation columns of a results file per experiment.

    Reads a JSON Lines file, groups its rows by the ``exp_name`` column and
    computes the mean of every requested column within each group.

    Args:
        path: Location of the JSON Lines file (one JSON record per line).
        cols: Names of the columns to average. When omitted, the four
            evaluation columns ``eval_en``, ``eval_de``, ``eval_city`` and
            ``eval_disease`` are used.

    Returns:
        A nested dict ``{exp_name: {column: mean}}`` whose values are plain
        Python floats. Experiments appear in the order in which they first
        occur in the file. Rows without an ``exp_name`` are left out.

    Raises:
        KeyError: If ``exp_name`` or a requested column is missing from the
            data.
    """
    if cols is None:
        cols = ["eval_en", "eval_de", "eval_city", "eval_disease"]

    df = pd.read_json(path, lines=True)

    ret = {}
    # One pass over the frame instead of one boolean-mask filter per
    # experiment; sort=False keeps the first-appearance order of exp_name.
    for exp, group in df.groupby("exp_name", sort=False):
        # float() converts NumPy and Python scalars alike, whereas .item()
        # only exists on NumPy scalars.
        ret[exp] = {col: float(group[col].mean()) for col in cols}
    return ret

In [16]:
baseline_path = Path("/workspace/chunky-experiments/baseline-output-qwen2-7b/results_evaluated.jsonl")
finetuned_path = Path("/workspace/chunky-experiments/experiments/2025-06-22_19-25-13_7B_sen_len_4_opts")

baseline_stats = extract_stats(baseline_path)

# Maps a run name to the statistics extract_stats() computed for that run;
# the baseline is stored first under the fixed key "baseline".
results = {}
results["baseline"] = baseline_stats

# Every entry directly inside finetuned_path. glob() yields entries in an
# arbitrary, filesystem-dependent order, so sort them to make the content
# and ordering of `results` reproducible between runs.
finetuned_folders = sorted(finetuned_path.glob("*"))
for run_dir in finetuned_folders:
    results_path = run_dir / "validation_data" / "results_evaluated.jsonl"
    if not results_path.exists():
        # Entry has no evaluated validation results; nothing to collect.
        continue

    finetuned_stats = extract_stats(results_path)

    # Drop everything up to and including the first underscore; a name
    # without an underscore is kept unchanged.
    exp_name = run_dir.name.split("_", 1)[-1]
    if exp_name in results:
        # Two folders that differ only in their prefix (or a folder whose
        # suffix is "baseline") would otherwise overwrite each other silently.
        raise ValueError(
            f"Duplicate result name {exp_name!r} derived from folder {run_dir}"
        )
    results[exp_name] = finetuned_stats
       

In [17]:
# Display the collected statistics: run name -> exp_name -> evaluation column -> mean.
results

{'baseline': {'start_en': {'eval_en': 0.98,
   'eval_de': 0.0,
   'eval_city': 0.26,
   'eval_disease': 0.08},
  'start_de': {'eval_en': 0.12,
   'eval_de': 0.92,
   'eval_city': 0.0,
   'eval_disease': 0.06},
  'disease': {'eval_en': 1.0,
   'eval_de': 0.0,
   'eval_city': 0.04,
   'eval_disease': 0.84},
  'city': {'eval_en': 0.96,
   'eval_de': 0.04,
   'eval_city': 0.38,
   'eval_disease': 0.02}},
 'en_short_disease_de_long_city': {'start_en': {'eval_en': 0.86,
   'eval_de': 0.18,
   'eval_city': 0.18,
   'eval_disease': 0.7},
  'start_de': {'eval_en': 0.24,
   'eval_de': 0.96,
   'eval_city': 0.82,
   'eval_disease': 0.08},
  'disease': {'eval_en': 1.0,
   'eval_de': 0.0,
   'eval_city': 0.02,
   'eval_disease': 0.94},
  'city': {'eval_en': 0.22,
   'eval_de': 0.96,
   'eval_city': 0.94,
   'eval_disease': 0.04}}}