In [32]:
import pandas as pd
from pathlib import Path


In [33]:
def extract_stats(path: Path):
    """Compute per-experiment means of the evaluation columns.

    Reads the JSON-lines file at ``path`` and, for every distinct value in
    its ``exp_name`` column (in order of first appearance), averages the
    ``eval_en``, ``eval_de``, ``eval_city`` and ``eval_disease`` columns over
    the rows carrying that value.

    Returns:
        A dict ``{exp_name: {column: mean}}`` where each mean has been
        converted to a plain Python scalar with ``.item()``.
    """
    records = pd.read_json(path, lines=True)
    metric_columns = ("eval_en", "eval_de", "eval_city", "eval_disease")

    stats = {}
    for name in records["exp_name"].unique():
        # Rows belonging to this experiment only.
        rows = records.loc[records["exp_name"] == name]
        stats[name] = {
            metric: rows[metric].mean().item() for metric in metric_columns
        }
    return stats

In [48]:
# Baseline evaluation results (the path names the qwen2-7b baseline output).
baseline_path = Path("/workspace/chunky-experiments/baseline-output-qwen2-7b/results_evaluated.jsonl")
baseline_stats = extract_stats(baseline_path)

# Experiment roots; each is expected to contain one sub-folder per run.
qwen7b = Path("/workspace/chunky-experiments/experiments/2025-06-22_19-25-13_7B_sen_len_4_opts")
qwen1_7b = Path("/workspace/chunky-experiments/experiments_sentences_scaling/2025-06-22_23-01_qwen1.7b-sentence")
qwen0_6b = Path("/workspace/chunky-experiments/experiments_sentences_scaling/2025-06-22_21-42_sweep")

# Root whose sub-folders are scanned below. This name was previously used
# without ever being assigned in the notebook, so the cell raised NameError on
# a fresh kernel.
# NOTE(review): qwen7b is picked because the baseline above is the 7B run —
# confirm, or point this at qwen1_7b / qwen0_6b to analyse the smaller models.
finetuned_path = qwen7b

# results[run name][exp_name][metric] -> mean value, as returned by extract_stats.
results = {}

results["baseline"] = baseline_stats
# Sorted so the column order of `df` does not depend on directory listing order.
finetuned_folders = sorted(finetuned_path.glob("*"))
for file in finetuned_folders:
    results_path = file / "validation_data" / "results_evaluated.jsonl"
    # Entries without an evaluated results file are skipped silently.
    if results_path.exists():
        finetuned_stats = extract_stats(results_path)
        # Drop everything up to and including the first underscore of the
        # folder name; a name without an underscore is kept whole.
        exp_name = file.name[file.name.find("_") + 1:]
        results[exp_name] = finetuned_stats

# One column per run, one row per exp_name; every cell is a dict of
# per-metric means rather than a number.
df = pd.DataFrame(results)

Unnamed: 0,baseline,en_short_city_de_long_disease,en_long_city_de_short_disease,en_short_disease_de_long_city,en_long_disease_de_short_city
count,4,4,4,4,4
unique,4,4,4,4,4
top,"{'eval_en': 0.98, 'eval_de': 0.0, 'eval_city':...","{'eval_en': 0.98, 'eval_de': 0.04, 'eval_city'...","{'eval_en': 0.98, 'eval_de': 0.08, 'eval_city'...","{'eval_en': 0.86, 'eval_de': 0.18, 'eval_city'...","{'eval_en': 0.9, 'eval_de': 0.18, 'eval_city':..."
freq,1,1,1,1,1


In [37]:
# Show the raw nested mapping: run name -> exp_name -> metric -> mean value.
results

{'baseline': {'start_en': {'eval_en': 0.98,
   'eval_de': 0.0,
   'eval_city': 0.26,
   'eval_disease': 0.08},
  'start_de': {'eval_en': 0.12,
   'eval_de': 0.92,
   'eval_city': 0.0,
   'eval_disease': 0.06},
  'disease': {'eval_en': 1.0,
   'eval_de': 0.0,
   'eval_city': 0.04,
   'eval_disease': 0.84},
  'city': {'eval_en': 0.96,
   'eval_de': 0.04,
   'eval_city': 0.38,
   'eval_disease': 0.02}},
 'en_short_city_de_long_disease': {'start_en': {'eval_en': 0.98,
   'eval_de': 0.04,
   'eval_city': 0.94,
   'eval_disease': 0.04},
  'start_de': {'eval_en': 0.02,
   'eval_de': 1.0,
   'eval_city': 0.0,
   'eval_disease': 1.0},
  'disease': {'eval_en': 0.2,
   'eval_de': 0.84,
   'eval_city': 0.06,
   'eval_disease': 0.9},
  'city': {'eval_en': 1.0,
   'eval_de': 0.0,
   'eval_city': 0.98,
   'eval_disease': 0.0}},
 'en_long_city_de_short_disease': {'start_en': {'eval_en': 0.98,
   'eval_de': 0.08,
   'eval_city': 0.8,
   'eval_disease': 0.1},
  'start_de': {'eval_en': 0.08,
   'eval_de'

In [46]:
def average_stats(first, second):
    """Element-wise mean of two two-level ``{key: {metric: value}}`` mappings.

    Iterates the keys of ``first`` and looks the same keys up in ``second``,
    so a key present in ``first`` but missing from ``second`` raises KeyError.

    Returns:
        A new dict with the same two-level key structure as ``first``.
    """
    return {
        key: {
            metric: (value + second[key][metric]) / 2
            for metric, value in metrics.items()
        }
        for key, metrics in first.items()
    }


# Each grouped entry averages the "long" and "short" variant of a run pair.
# Maps output key -> (run whose keys are iterated, run it is averaged with).
# NOTE(review): the second output key still carries the name of one of its two
# inputs even though it holds the average of both; it is kept unchanged so the
# resulting column names stay the same.
group_pairs = {
    "en_city_de_disease": (
        "en_long_city_de_short_disease",
        "en_short_city_de_long_disease",
    ),
    "en_long_disease_de_short_city": (
        "en_long_disease_de_short_city",
        "en_short_disease_de_long_city",
    ),
}

r_grouped = {
    group: average_stats(results[first_run], results[second_run])
    for group, (first_run, second_run) in group_pairs.items()
}

# One column per group, one row per exp_name; every cell is a dict of metrics.
df = pd.DataFrame(r_grouped)

In [47]:
# Summarise the grouped frame.
# NOTE(review): every cell of `df` is a dict of metrics, so describe() reports
# object-column statistics (count / unique / top / freq) rather than numeric
# summaries.
df.describe()

Unnamed: 0,en_city_de_disease,en_long_disease_de_short_city
count,4,4
unique,4,4
top,"{'eval_en': 0.98, 'eval_de': 0.06, 'eval_city'...","{'eval_en': 0.88, 'eval_de': 0.18, 'eval_city'..."
freq,1,1
