In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from tqdm.auto import tqdm

from juddges.llm_as_judge.data_model import PredictionLoader
from juddges.llm_as_judge.result_loading import (
    llm_as_judge_avg_scores,
    ngram_avg_scores,
)

In [None]:
# Prediction directories to evaluate, one per model. Every active entry is a
# pl_court_personal_rights run with seed 42; the commented-out entries are the
# pl_court_swiss_franc_loans runs for the same three models.
res_dirs = [
    "data/experiments/predict/raw_vllm/pl_court_personal_rights/qwen_3_32b/info_extraction_annotated_json_refined/personal_rights/seed_42/",
    "data/experiments/predict/raw_vllm/pl_court_personal_rights/qwen_3_8b/info_extraction_annotated_json_refined/personal_rights/seed_42/",
    "data/experiments/predict/raw_vllm/pl_court_personal_rights/llama_3.1_8b_instruct/info_extraction_annotated_json_refined/personal_rights/seed_42/",
    # "data/experiments/predict/raw_vllm/pl_court_swiss_franc_loans/llama_3.1_8b_instruct/info_extraction_annotated_json_refined/swiss_franc_loans_refined/seed_42",
    # "data/experiments/predict/raw_vllm/pl_court_swiss_franc_loans/qwen_3_8b/info_extraction_annotated_json_refined/swiss_franc_loans_refined/seed_42",
    # "data/experiments/predict/raw_vllm/pl_court_swiss_franc_loans/qwen_3_32b/info_extraction_annotated_json_refined/swiss_franc_loans_refined/seed_42",
]


# Score tables keyed by `pred_loader.config.llm.name`; a later directory with
# the same key would overwrite the earlier entry.
# NOTE(review): `judge_resutls` is a typo for "results", but the judge-table
# cell below reads this exact name, so rename it everywhere at once or not at all.
judge_resutls = {}
ngram_results = {}
for rdir in tqdm(res_dirs):
    pred_loader = PredictionLoader(root_dir=rdir, judge_name="gpt-4.1-mini")
    # `preds` is not used again in this cell; presumably the call is needed to
    # populate the loader before scoring — TODO confirm.
    preds = pred_loader.load_predictions(verbose=True)
    # Judge scores are optional: a FileNotFoundError is reported and that model
    # is left out of `judge_resutls`, but the loop carries on.
    try:
        res_judge = llm_as_judge_avg_scores(pred_loader)
    except FileNotFoundError:
        print(f"File not found for {rdir}")
    else:
        judge_resutls[pred_loader.config.llm.name] = res_judge

    # N-gram scores are computed for every directory, whether or not judge
    # scores were found above.
    res_ngram = ngram_avg_scores(pred_loader)
    ngram_results[pred_loader.config.llm.name] = res_ngram

In [None]:
# Assemble one wide judge-score table. Each model contributes its score frame
# with the two metric columns relabelled as "<model> (mean)" / "<model> (SE)",
# where <model> is the part of the model name after the last "/".
dfs = []
for full_name, judge_scores in judge_resutls.items():
    short_name = full_name.rsplit("/", 1)[-1]
    relabel = {
        "mean_judge_score": f"{short_name} (mean)",
        "se_judge_score": f"{short_name} (SE)",
    }
    dfs.append(judge_scores.rename(columns=relabel))

# Side-by-side concatenation; pandas aligns the frames on their index.
judge_df = pd.concat(dfs, axis=1)

# Split the wide table into a means-only and an SE-only view by column suffix.
judge_mean_cols = [name for name in judge_df.columns if name.endswith("(mean)")]
judge_se_cols = [name for name in judge_df.columns if name.endswith("(SE)")]
judge_df_mean = judge_df[judge_mean_cols]
judge_df_se = judge_df[judge_se_cols]

# Last expression of the cell: rendered as the cell output.
judge_df_mean.round(3)

In [None]:
# Render the judge scores as a LaTeX table whose cells read "mean (SE)", both
# expressed in percent.
#
# Model names are recovered from the "<model> (mean)" columns in their existing
# left-to-right order. dict.fromkeys de-duplicates while keeping first-seen
# order, so the LaTeX column order is the same on every kernel restart (a set
# of strings iterates in hash order, which Python randomises per process).
# Stripping the exact suffix also keeps model names containing spaces intact
# and ignores any column that is not a "(mean)" column.
mean_suffix = " (mean)"
llms = list(
    dict.fromkeys(
        col[: -len(mean_suffix)]
        for col in judge_df.columns
        if col.endswith(mean_suffix)
    )
)

# formatted[row_label][model] -> "mean (SE)" string for that table cell.
formatted = {}
for index, row in judge_df.iterrows():
    formatted[index] = {}
    for llm_name in llms:
        # Scores are scaled by 100 for display as percentages.
        mean = row[f'{llm_name} (mean)'] * 100
        se = row[f'{llm_name} (SE)'] * 100
        formatted[index][llm_name] = f"{mean:.3f} ({se:.3f})"

print(pd.DataFrame.from_dict(formatted, orient="index").to_latex())

In [None]:
# Aggregate judge scores per model across all rows of the table, in percent.
judge_row_count = len(judge_df_se)

# Plain average of the per-row means.
per_llm_mean = judge_df_mean.mean(axis=0) * 100

# SE of that average: sqrt(sum of squared per-row SEs / n^2).
judge_squared_se_sum = judge_df_se.pow(2).sum(axis=0)
per_llm_se = (judge_squared_se_sum / judge_row_count**2).pow(1/2) * 100

for judge_summary in (per_llm_mean, per_llm_se):
    print(judge_summary.to_latex())

In [None]:
# Assemble one wide n-gram score table: each model's frame gets its two metric
# columns relabelled as "<model> (mean)" / "<model> (SE)", where <model> is the
# part of the model name after the last "/".
dfs = [
    ngram_scores.rename(
        columns={
            "ngram_metric_mean": f"{full_name.split('/')[-1]} (mean)",
            "ngram_metric_se": f"{full_name.split('/')[-1]} (SE)",
        }
    )
    for full_name, ngram_scores in ngram_results.items()
]

# Side-by-side concatenation; pandas aligns the frames on their index.
ngram_df = pd.concat(dfs, axis=1)

# Means-only and SE-only views, selected by column suffix.
ngram_mean_cols = [name for name in ngram_df.columns if name.endswith("(mean)")]
ngram_se_cols = [name for name in ngram_df.columns if name.endswith("(SE)")]
ngram_df_mean = ngram_df[ngram_mean_cols]
ngram_df_se = ngram_df[ngram_se_cols]

# Last expression of the cell: rendered as the cell output.
ngram_df_mean.round(3)


In [None]:
# Aggregate n-gram scores per model across all rows of the table, in percent.
# Note: this rebinds `per_llm_mean` / `per_llm_se`.
ngram_row_count = len(ngram_df_se)

# Plain average of the per-row means.
per_llm_mean = ngram_df_mean.mean(axis=0) * 100

# SE of that average: sqrt(sum of squared per-row SEs / n^2).
ngram_squared_se_sum = ngram_df_se.pow(2).sum(axis=0)
ngram_se_variance = ngram_squared_se_sum / ngram_row_count**2
per_llm_se = ngram_se_variance.pow(1/2) * 100

print(per_llm_mean.to_latex())
print(per_llm_se.to_latex())