# LLM-as-a-Judge with Computed Fields

This notebook demonstrates how to use Pydantic's `@computed_field` decorator with LLM-as-a-Judge evaluations.

**Key Benefits:**
- LLM provides raw scores/metrics only
- Pass/fail logic is computed deterministically in Python
- Consistent evaluation logic across all responses
- Business rules are testable and version-controlled

In [1]:
from typing import Any
from pydantic import BaseModel, computed_field
from flex_evals import LLMJudgeCheck, evaluate, TestCase, Output
from sik_llms import create_client, system_message, user_message
import nest_asyncio

from dotenv import load_dotenv
# Load variables from a .env file into the process environment (presumably the LLM
# API credentials used by `create_client` -- confirm).
load_dotenv()

# The Jupyter kernel already runs an asyncio event loop; nest_asyncio patches asyncio
# so that a nested loop can be started from within notebook cells.
nest_asyncio.apply()  # for running async function in a Jupyter notebook

In [2]:
# Define response format with computed 'passed' field

# Minimum score (inclusive) required to pass. Kept as a named constant so the
# business rule lives in one place and can be tested or changed without touching
# the model class.
PASS_SCORE_THRESHOLD = 80


# NOTE: pydantic uses the class docstring as the JSON-schema description, so keep it
# short and accurate -- it should list exactly the fields the LLM must fill in.
class QualityJudgeResponse(BaseModel):
    """LLM provides score and reasoning. 'passed' is computed automatically."""

    score: int  # raw integer score from the LLM; compared against PASS_SCORE_THRESHOLD
    reasoning: str  # the LLM's explanation for the score

    @computed_field
    @property
    def passed(self) -> bool:
        """Automatically determine pass/fail: True when score meets the threshold."""
        return self.score >= PASS_SCORE_THRESHOLD


async def llm_judge(
        prompt: str,
        response_format: type[BaseModel],
    ) -> tuple[BaseModel, dict[str, Any]]:
    """
    Send `prompt` to the judge model and return its structured answer with usage stats.

    Args:
        prompt: Text sent to the model as the user message.
        response_format: Pydantic model class handed to the client as the response format.

    Returns:
        Tuple of (parsed_response, metadata_dict). `parsed_response` is the client's
        `parsed` result; `metadata_dict` holds cost, token counts, latency in
        milliseconds, and the model name.
    """
    judge_model = 'gpt-4o-mini'

    # Client bound to the requested structured response format
    judge_client = create_client(model_name=judge_model, response_format=response_format)

    # Single round trip: fixed system message plus the caller's prompt
    result = await judge_client.run_async(
        messages=[
            system_message("This is a test. Please follow the instructions carefully."),
            user_message(prompt),
        ],
    )

    # Usage figures reported on the response object
    tokens_in = result.input_tokens
    tokens_out = result.output_tokens
    total_cost = result.input_cost + result.output_cost
    elapsed_ms = int(result.duration_seconds * 1000)  # seconds -> whole milliseconds

    usage = {
        "cost_usd": total_cost,
        "tokens_used": tokens_in + tokens_out,
        "input_tokens": tokens_in,
        "output_tokens": tokens_out,
        "response_time_ms": elapsed_ms,
        "model_version": judge_model,
    }
    return result.parsed, usage


In [3]:
# Create test case with LLM judge check
test_cases = [
    TestCase(
        id='quality_check_001',
        input='What is the capital of France?',
        checks=[
            LLMJudgeCheck(
                # `score` is declared as an int and the pass mark is 80, so ask for an
                # integer on that scale; a fractional value such as 0.85 is not a valid
                # int and would sit far below the threshold if taken literally.
                prompt="This is a test. Please return a score of 85 and a concise fictitious reason.",  # noqa: E501
                response_format=QualityJudgeResponse,
                llm_function=llm_judge,
            ),
        ],
    ),
]

# Run evaluation
# One output per test case (presumably matched to test_cases by position -- confirm).
outputs = [Output(value='The capital of France is Paris.')]
results = evaluate(test_cases, outputs)

In [None]:
# Display results
# Only one test case with one check was evaluated, so take the first entry of each.
check_result = results.results[0].check_results[0].results

# Build the report line by line, then emit it in a single print call.
report_lines = [
    'Evaluation Results:',
    f"  Score: {check_result['score']}",
    f"  Passed: {check_result['passed']}",  # computed in Python, not supplied by the LLM
    f"  Reasoning: {check_result['reasoning']}",
    f"  Metadata: {check_result['judge_metadata']}",
]
print('\n'.join(report_lines))

Evaluation Results:
  Score: 85
  Passed: True
  Reasoning: The response accurately addresses the prompt with a clear understanding of the test requirements, demonstrating effective communication.
  Metadata: {'cost_usd': 3.3749999999999994e-05, 'tokens_used': 141, 'input_tokens': 113, 'output_tokens': 28, 'response_time_ms': 2709, 'model_version': 'gpt-4o-mini'}


---