In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import pandas as pd
from tqdm.auto import tqdm

from juddges.llm_as_judge.data_model import PredictionLoader
from juddges.llm_as_judge.result_loading import (
    llm_as_judge_avg_scores,
    ngram_avg_scores,
)

In [77]:
res_dirs = [
    "data/experiments/predict/raw_vllm/pl_court_personal_rights/qwen_3_32b/info_extraction_annotated_json_refined/personal_rights/seed_42/",
    "data/experiments/predict/raw_vllm/pl_court_personal_rights/qwen_3_8b/info_extraction_annotated_json_refined/personal_rights/seed_42/",
    "data/experiments/predict/raw_vllm/pl_court_personal_rights/llama_3.1_8b_instruct/info_extraction_annotated_json_refined/personal_rights/seed_42/",
    # "data/experiments/predict/raw_vllm/pl_court_swiss_franc_loans/llama_3.1_8b_instruct/info_extraction_annotated_json_refined/swiss_franc_loans_refined/seed_42",
    # "data/experiments/predict/raw_vllm/pl_court_swiss_franc_loans/qwen_3_8b/info_extraction_annotated_json_refined/swiss_franc_loans_refined/seed_42",
    # "data/experiments/predict/raw_vllm/pl_court_swiss_franc_loans/qwen_3_32b/info_extraction_annotated_json_refined/swiss_franc_loans_refined/seed_42",
]


judge_resutls = {}
ngram_results = {}
for rdir in tqdm(res_dirs):
    pred_loader = PredictionLoader(root_dir=rdir, judge_name="gpt-4.1-mini")
    preds = pred_loader.load_predictions(verbose=True)
    try:
        res_judge = llm_as_judge_avg_scores(pred_loader)
    except FileNotFoundError:
        print(f"File not found for {rdir}")
    else:
        judge_resutls[pred_loader.config.llm.name] = res_judge

    res_ngram = ngram_avg_scores(pred_loader)
    ngram_results[pred_loader.config.llm.name] = res_ngram

  0%|          | 0/3 [00:00<?, ?it/s]

Loading predictions:   0%|          | 0/1811 [00:00<?, ?it/s]

Loading predictions:   0%|          | 0/1811 [00:00<?, ?it/s]

Loading predictions:   0%|          | 0/1811 [00:00<?, ?it/s]

In [78]:
dfs = []
for llm_name, res_judge in judge_resutls.items():
    llm_name = llm_name.split("/")[-1]
    mean_col = f"{llm_name} (mean)"
    se_col = f"{llm_name} (SE)"
    res_judge = res_judge.rename(columns={"mean_judge_score": mean_col, "se_judge_score": se_col})
    dfs.append(res_judge)

judge_df = pd.concat(dfs, axis=1)
judge_df_mean = judge_df[[col for col in judge_df.columns if col.endswith("(mean)")]]
judge_df_se = judge_df[[col for col in judge_df.columns if col.endswith("(SE)")]]
judge_df_mean.round(3)

Unnamed: 0_level_0,Qwen3-32B (mean),Qwen3-8B (mean),Llama-3.1-8B-Instruct (mean)
field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dowody,0.856,0.548,0.426
inne_podstawy_prawne,0.414,0.057,0.337
miejsce_naruszenia,0.535,0.279,0.395
naruszenie_dobr_osobistych,0.768,0.784,0.695
naruszenie_media_spolecznosciowe,0.22,0.075,0.135
opis_naruszenia,0.656,0.426,0.124
podstawa_prawna,0.837,0.776,0.319
portale_spolecznosciowe,0.945,0.15,0.715
rodzaj_dobra_osobistego,0.585,0.282,0.24
rodzaj_naruszajacego,0.846,0.699,0.644


In [79]:
llms = list({col.split(" ")[0] for col in judge_df.columns})

formatted = {}
for index, row in judge_df.iterrows():
    formatted[index] = {}
    for llm_name in llms:
        mean = row[f'{llm_name} (mean)'] * 100
        se = row[f'{llm_name} (SE)'] * 100
        formatted[index][llm_name] = f"{mean:.3f} ({se:.3f})"

print(pd.DataFrame.from_dict(formatted, orient="index").to_latex())

\begin{tabular}{llll}
\toprule
 & Qwen3-8B & Qwen3-32B & Llama-3.1-8B-Instruct \\
\midrule
dowody & 54.776 (0.744) & 85.619 (0.583) & 42.582 (0.929) \\
inne_podstawy_prawne & 5.719 (0.525) & 41.404 (1.046) & 33.703 (1.109) \\
miejsce_naruszenia & 27.940 (1.055) & 53.451 (1.172) & 39.481 (1.148) \\
naruszenie_dobr_osobistych & 78.410 (0.967) & 76.808 (0.992) & 69.464 (1.083) \\
naruszenie_media_spolecznosciowe & 7.454 (0.617) & 22.004 (0.953) & 13.473 (0.801) \\
opis_naruszenia & 42.620 (0.753) & 65.555 (0.659) & 12.385 (0.774) \\
podstawa_prawna & 77.642 (0.618) & 83.733 (0.617) & 31.890 (0.821) \\
portale_spolecznosciowe & 15.021 (0.833) & 94.469 (0.530) & 71.510 (1.054) \\
rodzaj_dobra_osobistego & 28.168 (0.805) & 58.499 (0.948) & 24.032 (0.854) \\
rodzaj_naruszajacego & 69.851 (1.079) & 84.594 (0.849) & 64.384 (1.126) \\
skala_naruszenia & 65.191 (1.052) & 57.924 (1.104) & 31.960 (1.079) \\
zadania & 66.279 (0.919) & 64.141 (0.932) & 10.799 (0.555) \\
\bottomrule
\end{tabular}



In [80]:
per_llm_mean = judge_df_mean.mean(axis=0) * 100
per_llm_se = (judge_df_se.pow(2).sum(axis=0) / len(judge_df_se)**2).pow(1/2) * 100

print(per_llm_mean.to_latex())
print(per_llm_se.to_latex())

\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
Qwen3-32B (mean) & 65.683532 \\
Qwen3-8B (mean) & 44.922624 \\
Llama-3.1-8B-Instruct (mean) & 37.138748 \\
\bottomrule
\end{tabular}

\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
Qwen3-32B (SE) & 0.256900 \\
Qwen3-8B (SE) & 0.245291 \\
Llama-3.1-8B-Instruct (SE) & 0.277336 \\
\bottomrule
\end{tabular}



In [34]:
dfs = []
for llm_name, res_ngram in ngram_results.items():
    llm_name = llm_name.split("/")[-1]
    mean_col = f"{llm_name} (mean)"
    se_col = f"{llm_name} (SE)"
    res_ngram = res_ngram.rename(columns={"ngram_metric_mean": mean_col, "ngram_metric_se": se_col})
    dfs.append(res_ngram)

ngram_df = pd.concat(dfs, axis=1)
ngram_df_mean = ngram_df[[col for col in ngram_df.columns if col.endswith("(mean)")]]
ngram_df_se = ngram_df[[col for col in ngram_df.columns if col.endswith("(SE)")]]
ngram_df_mean.round(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Llama-3.1-8B-Instruct (mean),Qwen3-8B (mean),Qwen3-32B (mean)
field,ngram_metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aneks_do_umowy,exact_match,0.621,0.866,0.904
apelacja,rougeL,0.434,0.209,0.314
beneficjent_kosztow,rougeL,0.381,0.276,0.504
czas_trwania_sprawy,rougeL,0.166,0.026,0.137
data_aneksu,rougeL,0.587,0.727,0.76
data_rozpoczecia_odsetek,exact_match,0.413,0.478,0.613
data_wyroku,rougeL,0.684,0.994,0.989
dowody,f1,0.303,0.722,0.806
instancja_sadu,exact_match,0.307,0.992,0.988
klauzula_niedozwolona,exact_match,0.66,0.894,0.916


In [53]:
per_llm_mean = ngram_df_mean.mean(axis=0) * 100
per_llm_se = (ngram_df_se.pow(2).sum(axis=0) / len(ngram_df_se)**2).pow(1/2) * 100

print(per_llm_mean.to_latex())
print(per_llm_se.to_latex())

\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
Llama-3.1-8B-Instruct (mean) & 40.537122 \\
Qwen3-8B (mean) & 55.188013 \\
Qwen3-32B (mean) & 71.355111 \\
\bottomrule
\end{tabular}

\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
Llama-3.1-8B-Instruct (SE) & 0.278748 \\
Qwen3-8B (SE) & 0.283812 \\
Qwen3-32B (SE) & 0.293519 \\
\bottomrule
\end{tabular}

