In [1]:
import json
from typing import Any

import pandas as pd
from tabulate import tabulate

from juddges.llm_as_judge.data_model import PredictionLoader

In [3]:
# Root directory of the prediction run inspected in this notebook.
pred_dir = "data/experiments/predict/raw_vllm/pl_court_personal_rights/qwen_3_32b/info_extraction_annotated_json_refined/personal_rights/seed_42/"
pred_loader = PredictionLoader(root_dir=pred_dir, judge_name="gpt-4.1-mini")
preds = pred_loader.load_predictions(verbose=True)
# NOTE(review): the return value is discarded and, not being the cell's last
# expression, is not rendered either — presumably get_stats() prints; confirm.
preds.get_stats()

# Decode explicitly as UTF-8 so parsing does not depend on the machine's locale
# default encoding (assumes llm_judge_scores_file is a pathlib.Path-like object).
judge_res = json.loads(pred_loader.llm_judge_scores_file.read_text(encoding="utf-8"))


Loading predictions:   0%|          | 0/1811 [00:00<?, ?it/s]

In [8]:
# Flatten the judge output into one record per judged item: the run metadata plus
# one column per field holding that field's score.
judge_records = []
field_names = set()
for result in judge_res["all_results"]:
    item_res = {
        "status": result["status"],
        "error": result["error"],
        "missing_keys": result["missing_keys"],
        "extra_keys": result["extra_keys"],
    }
    # An item may carry no per-field results (key absent or null); keep its row with
    # the status/error columns instead of crashing on `.items()`.
    for field, field_res in (result.get("result") or {}).items():
        field_names.add(field)
        item_res[field] = field_res["score"]
    judge_records.append(item_res)
# Build the frame once from the list of records; the list keeps its own name so
# `detailed_judge_res` is only ever a DataFrame.
detailed_judge_res = pd.DataFrame(judge_records)
detailed_judge_res.head()

Unnamed: 0,status,error,missing_keys,extra_keys,dowody,inne_podstawy_prawne,miejsce_naruszenia,naruszenie_dobr_osobistych,naruszenie_media_spolecznosciowe,opis_naruszenia,podstawa_prawna,portale_spolecznosciowe,rodzaj_dobra_osobistego,rodzaj_naruszajacego,skala_naruszenia,zadania
0,success,,[],[],0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.5
1,success,,[],[],1.0,0.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0
2,success,,[],[],0.833333,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.25,1.0,1.0,1.0
3,success,,[],[],1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
4,success,,[],[],0.5,1.0,1.0,1.0,0.0,0.0,0.5,1.0,0.5,1.0,1.0,0.0


# Case-by-case study

In [9]:
# Display the set of field names collected while flattening the judge results.
field_names

{'dowody',
 'inne_podstawy_prawne',
 'miejsce_naruszenia',
 'naruszenie_dobr_osobistych',
 'naruszenie_media_spolecznosciowe',
 'opis_naruszenia',
 'podstawa_prawna',
 'portale_spolecznosciowe',
 'rodzaj_dobra_osobistego',
 'rodzaj_naruszajacego',
 'skala_naruszenia',
 'zadania'}

In [None]:
import ipywidgets as widgets
from IPython.display import clear_output, display

# Upper bound on the number of rows taken from the score-sorted frame per field.
NUM_EXAMPLES = 50

# Sort once so the dropdown options and the fallback default come from the same list.
_sorted_fields = sorted(field_names)
_preferred_field = "naruszenie_media_spolecznosciowe"

field_dropdown = widgets.Dropdown(
    options=_sorted_fields,
    # Start on the preferred field when it exists, otherwise on the first option.
    value=_preferred_field if _preferred_field in field_names else _sorted_fields[0],
    description="Field:",
    style={"description_width": "initial"},
    layout=widgets.Layout(width="400px"),
)

# Output area that the example tables are printed into.
output = widgets.Output()


def show_examples(field):
    """Print the lowest-scoring examples for ``field`` into the ``output`` widget.

    Rows of ``detailed_judge_res`` are sorted ascending by the field's score and the
    first ``NUM_EXAMPLES`` of them are rendered as grid tables showing the
    prediction, the gold value, the score, and whether the field was listed among
    the item's missing keys.

    Args:
        field: Name of the score column in ``detailed_judge_res``; also used as the
            key into the per-item prediction and gold records.
    """

    def _shown(value):
        # Only an actual null gets the placeholder. Falsy-but-valid values such as
        # False, 0, "" or [] are shown as-is so they cannot be mistaken for a null
        # when comparing prediction and gold.
        return "<null>" if value is None else value

    with output:
        # Drop whatever the previously selected field rendered.
        clear_output()
        worst_rows = detailed_judge_res.sort_values(field).head(NUM_EXAMPLES)
        for idx, result in worst_rows.iterrows():
            # NOTE(review): assumes the frame's index lines up with the keys of
            # preds.predictions / preds.missing_keys / preds.gold — confirm.
            try:
                pred = preds.predictions[idx][field]
                missing_keys = preds.missing_keys[idx]
            except KeyError:
                # Nothing to show for this item/field. The cut to NUM_EXAMPLES was
                # made before this check, so fewer tables may be printed.
                continue

            table = [
                ["Index", idx],
                ["Prediction", _shown(pred)],
                ["Gold (gpt-4.1)", _shown(preds.gold[idx][field])],
                ["Score", result[field]],
                ["was_missing", int(field in missing_keys)],
            ]
            print(
                tabulate(table, headers=["Type", field], tablefmt="grid", maxcolwidths=[None, 50])
            )
            print()


def on_field_change(change):
    """Re-render the examples when the dropdown's selected value changes.

    Args:
        change: Change notification dict passed by the widget observer. Only
            notifications whose ``"type"`` is ``"change"`` and whose ``"name"`` is
            ``"value"`` trigger a refresh, using ``change["new"]`` as the field.
    """
    # Ignore every notification that is not a change of the `value` trait.
    if change["type"] != "change" or change["name"] != "value":
        return
    show_examples(change["new"])


# Subscribe only to changes of the `value` trait; without `names=` the handler is
# invoked for every trait change on the widget (e.g. `index`) and has to discard them.
field_dropdown.observe(on_field_change, names="value")
display(field_dropdown)
# Render the initially selected field into `output` before the output area is shown.
show_examples(field_dropdown.value)
display(output)

Dropdown(description='Field:', index=4, layout=Layout(width='400px'), options=('dowody', 'inne_podstawy_prawne…

Output()