In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import textwrap

import pandas as pd
from dotenv import load_dotenv
from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
from langchain_openai import ChatOpenAI

from juddges.llm_as_judge.base import SYSTEM_PROMPT, USER_PROMPT
from juddges.llm_as_judge.data_model import PredictionLoader
from juddges.llm_as_judge.judge import StructuredOutputJudge

load_dotenv()

True

In [3]:
pred_dir = "data/experiments/predict/raw_vllm/pl_court_personal_rights/qwen_3_32b/info_extraction_annotated_json_refined/personal_rights/seed_42/"
pred_loader = PredictionLoader(root_dir=pred_dir, judge_name="gpt-4.1-mini")
preds = pred_loader.load_predictions(verbose=True)
preds.get_stats()

judge_res = json.loads(pred_loader.llm_judge_scores_file.read_text())


Loading predictions:   0%|          | 0/1811 [00:00<?, ?it/s]

In [4]:
chat = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
set_llm_cache(SQLiteCache(str(".llm_as_judge_cache.db")))
judge = StructuredOutputJudge(
    client=chat,
    pred_loader=pred_loader,
    max_concurrent_calls=1,
    verbose=True,
)

In [10]:
system_prompt = SYSTEM_PROMPT
user_prompt = USER_PROMPT


system_prompt = """
You are professional judge that evaluates the accuracy of structured outputs.
* Each time you are given <Schema>, <Output>, <Expected Outputs>
* You should evaluate each key separately based on the reference in <Expected Outputs> and the properties in <Schema>.
* When comparing free-form text, asses semantics of the texts, they could differ but the meaning should be the same.
* You should ignore minor typos and formatting differences (e.g different formatting of legal provisions).
* When comparing enum values, you must always check for exact match.
* When comparing lists, 
    * match the most similar items, ignoring their order, 
    * for each pair of items, score them based on the type of the items.
    * return the average score for the list.
* If there is a null value being compared to a non-null value, you should assign a score of 0.
* If a key is in the reference but missing in the output, assign score 0; ignore extra keys in output.
"""
system_prompt = textwrap.dedent(system_prompt)

user_prompt = """
Please evaluate the accuracy of the following output keys according to these schema:
<Schema>
{schema}
</Schema>

<Outputs>
{outputs}
</Outputs>

<Expected Outputs>
{reference_outputs}
</Expected Outputs>
"""
user_prompt = textwrap.dedent(user_prompt)


print(system_prompt)
print("--------------------------------")
print(user_prompt)


You are professional judge that evaluates the accuracy of structured outputs.
* Each time you are given <Schema>, <Output>, <Expected Outputs>
* You should evaluate each key separately based on the reference in <Expected Outputs> and the properties in <Schema>.
* When comparing free-form text, asses semantics of the texts, they could differ but the meaning should be the same.
* You should ignore minor typos and formatting differences (e.g different formatting of legal provisions).
* When comparing enum values, you must always check for exact match.
* When comparing lists, 
    * match the most similar items, ignoring their order, 
    * for each pair of items, score them based on the type of the items.
    * return the average score for the list.
* If there is a null value being compared to a non-null value, you should assign a score of 0.
* If a key is in the reference but missing in the output, assign score 0; ignore extra keys in output.

--------------------------------

Please ev

In [11]:
INDEX = 731
messages = judge.prepare_single_item_messages(
    pred=preds.predictions[INDEX],
    gold=preds.gold[INDEX],
    user_prompt=user_prompt,
    system_prompt=system_prompt,
)
res = await judge.evaluate_single_item(messages)

pd.DataFrame.from_dict(preds.gold[INDEX], orient="index", columns=["Expected Output"]).join(
    pd.DataFrame.from_dict(preds.predictions[INDEX], orient="index", columns=["Predicted Output"])
).join(pd.DataFrame(res.result).T.round(3))

Unnamed: 0,Expected Output,Predicted Output,score
naruszenie_dobr_osobistych,Tak,Tak,1.0
podstawa_prawna,"[24 KC, Inne]","[24 KC, Inne]",1.0
inne_podstawy_prawne,[448 KC],[art. 448 KC],0.5
rodzaj_naruszajacego,Instytucja/firma,Instytucja/firma,1.0
rodzaj_dobra_osobistego,[Inne],[Cześć],0.0
opis_naruszenia,Powodowie doznali naruszenia dobra osobistego ...,Śmierć osoby bliskiej w wyniku wypadku komunik...,0.7
miejsce_naruszenia,Miejsce publiczne,Miejsce publiczne,1.0
naruszenie_media_spolecznosciowe,,Nie,0.0
portale_spolecznosciowe,,,1.0
skala_naruszenia,4,5,0.0
