In [1]:
import ipytest
# Configure ipytest once for the whole notebook; these options are handed to
# pytest on every `%%ipytest` cell run.
ipytest.autoconfig(
    addopts=[
        "--assert=plain",
        # Evaluation flags -- presumably they enable the tests marked `eval`
        # and `eval_analysis` further down; confirm against the plugin docs.
        "--run-eval",
        "--run-eval-analysis",
        # Don't capture output, so print() calls inside tests are shown.
        "-s",
        "--log-cli-level=ERROR",
    ],
)

In [2]:
import openai
def classify(text: str) -> bool:
    """Classify ``text`` as computer-related or not using an LLM.

    Sends ``text`` to the ``gpt-4o-mini`` chat model together with a system
    prompt asking for a bare "true"/"false" answer, then parses the reply.

    Args:
        text: The passage to classify.

    Returns:
        True when the model's reply is "true", ignoring case, surrounding
        whitespace, quotes and trailing punctuation. False for any other
        reply, including an empty or missing one.
    """
    resp = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                # Adjacent literals are concatenated verbatim: the trailing
                # space keeps the two sentences from running together.
                "content": "Is this text about a computer-related subject? "
                           "Reply ONLY with either true or false.",
            },
            {"role": "user", "content": text},
        ],
    )
    # The message content may be absent (None); treat that as "not true"
    # instead of crashing on None.lower().
    reply = resp.choices[0].message.content or ""
    # Tolerate harmless decoration such as "True.", " true\n" or '"true"'.
    return reply.strip().strip("\"'`.!").lower() == "true"

In [3]:
# Labelled evaluation cases: each entry pairs an input text with the expected
# classification (True = computer-related, False = not).
TEST_DATA = [
    {"text": sample_text, "label": expected_label}
    for sample_text, expected_label in (
        ("I need to debug this Python code", True),
        ("The cat jumped over the lazy dog", False),
        ("My monitor keeps flickering", True),
    )
]

In [4]:
%%ipytest
# Workaround: force-reload the pytest_harvest plugin so that its state is
# reset, which allows the evaluation tests in this cell to be run multiple
# times within the same notebook session.
ipytest.force_reload("pytest_harvest")

import pytest
@pytest.fixture
def classifier():
    """Provide the ``classify`` function to tests as the classifier under evaluation.

    Returns:
        The ``classify`` callable itself (not its result), so each test
        decides which text to pass to it.
    """
    return classify

@pytest.mark.eval(name="computer_classifier")
@pytest.mark.parametrize("case", TEST_DATA)
def test_classifier(case: dict, eval_bag, classifier):
    """Run the classifier on one labelled case and record the outcome.

    Args:
        case: One ``TEST_DATA`` entry with a ``"text"`` and a ``"label"`` key.
        eval_bag: Per-test container on which the input text, expected label
            and prediction are stored (presumably what the analysis step
            later reads back -- confirm against the eval plugin).
        classifier: Callable mapping a text to a boolean prediction.
    """
    # Record the inputs before calling the classifier so they are stored even
    # if the call itself raises.
    sample_text = case["text"]
    eval_bag.input_text = sample_text
    expected = case["label"]
    eval_bag.label = expected

    predicted = classifier(case["text"])
    eval_bag.prediction = predicted

    print(f"Input: {sample_text}")
    print(f"Prediction: {predicted}")
    assert predicted == expected


@pytest.mark.eval_analysis(name="computer_classifier")
def test_analysis(eval_results):
    """Compute overall accuracy for the ``computer_classifier`` eval and gate on it.

    Args:
        eval_results: Sized iterable of per-case results; each item exposes
            ``result.prediction`` and ``result.label``.

    Raises:
        AssertionError: If no results were collected, or if accuracy is
            below 70%.
    """
    total = len(eval_results)
    # Without this guard an empty result set surfaces as an opaque
    # ZeroDivisionError from the accuracy computation below.
    assert total > 0, "No eval results collected for 'computer_classifier'"

    correct = sum(1 for r in eval_results if r.result.prediction == r.result.label)
    accuracy = correct / total

    print(f"Accuracy: {accuracy:.2%}")
    # Explicit message: the suite runs with --assert=plain, so a bare assert
    # would fail without showing the offending values.
    assert accuracy >= 0.7, (
        f"Accuracy {accuracy:.2%} ({correct}/{total}) is below the 70% threshold"
    )


t_73ebd04313644f43af72ba306c8af586.py::test_classifier[case0] Input: I need to debug this Python code
Prediction: True
PASSED
t_73ebd04313644f43af72ba306c8af586.py::test_classifier[case1] Input: The cat jumped over the lazy dog
Prediction: False
PASSED
t_73ebd04313644f43af72ba306c8af586.py::test_classifier[case2] Input: My monitor keeps flickering
Prediction: True
PASSED
t_73ebd04313644f43af72ba306c8af586.py::test_analysis Accuracy: 100.00%
PASSED

