📓 MLflow Scorers — Fully Self-Contained Jupyter Notebook (Basic Auth)

In [9]:
# ============================================================
# 0. Imports
# ============================================================
import os
from getpass import getpass

import mlflow
import pandas as pd
from mlflow.entities import Feedback
from mlflow.genai.scorers import scorer

🔐 1. Configure MLflow Tracking + Basic Auth (Notebook-only)

In [None]:
# ============================================================
# 1. MLflow tracking server configuration
# ============================================================
# Settings already exported in the environment take precedence, so real
# credentials never have to be written into (or committed with) the notebook.
# The literals below are local-development fallbacks only.

MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:8080")
MLFLOW_USERNAME = os.environ.get("MLFLOW_TRACKING_USERNAME", "frank@example.com")

# Never hardcode the password. Reuse an exported value when there is one;
# otherwise prompt for it without echoing. For non-interactive runs, export
# MLFLOW_TRACKING_PASSWORD before starting the kernel.
MLFLOW_PASSWORD = os.environ.get("MLFLOW_TRACKING_PASSWORD")
if MLFLOW_PASSWORD is None:
    MLFLOW_PASSWORD = getpass(f"MLflow password for {MLFLOW_USERNAME}: ")

# Publish the resolved values so the MLflow client (and any subprocess it
# spawns) sees the same tracking server and basic-auth credentials.
os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_TRACKING_URI
os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_USERNAME
os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_PASSWORD

# Optional but useful: also set the URI explicitly on the client.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("scorers-notebook-test")

<Experiment: artifact_location='mlflow-artifacts:/24', creation_time=1767983770216, experiment_id='24', last_update_time=1767983770216, lifecycle_stage='active', name='scorers-notebook-test', tags={'mlflow.experimentKind': 'genai_development'}>

✅ 2. Sanity Check: Auth Works

In [11]:
# ============================================================
# 2. Auth sanity check
# ============================================================

# Open a throwaway run and log one metric: if the tracking server rejects the
# request, this cell fails before any evaluation work is attempted.
with mlflow.start_run(run_name="auth-check") as run:
    mlflow.log_metric(key="auth_ok", value=1.0)
    print("Auth OK, run ID:", run.info.run_id)

Auth OK, run ID: 97d1b604a43c4d019c6fc717c9ae95e0
🏃 View run auth-check at: http://localhost:8080/#/experiments/24/runs/97d1b604a43c4d019c6fc717c9ae95e0
🧪 View experiment at: http://localhost:8080/#/experiments/24


🧠 3. Define Dummy Custom Scorers

In [12]:
# ============================================================
# 3. Custom scorers
# ============================================================


@scorer
def response_length(outputs: dict) -> Feedback:
    """Score a row by the length of its response text.

    Args:
        outputs: Outputs of one evaluation row; the text is read from its
            ``"response"`` key.

    Returns:
        A ``Feedback`` whose value is the character count of the response.
        A missing, ``None`` or empty response is scored as 0.
    """
    # `or ""` also covers a key that is present but holds None, which
    # `.get("response", "")` would pass straight to len() and crash on.
    text = outputs.get("response") or ""
    length = len(text)
    return Feedback(
        value=length,
        rationale=f"Response length = {length}",
        metadata={"chars": length},
    )


@scorer
def contains_hello(outputs: dict) -> Feedback:
    """Score a row by whether its response mentions "hello".

    Args:
        outputs: Outputs of one evaluation row; the text is read from its
            ``"response"`` key.

    Returns:
        A ``Feedback`` whose value is ``True`` when the lower-cased response
        contains ``"hello"``. A missing, ``None`` or empty response is
        scored as ``False``.
    """
    # `or ""` also covers a key that is present but holds None, which
    # `.get("response", "")` would return as-is and crash on `.lower()`.
    text = (outputs.get("response") or "").lower()
    found = "hello" in text
    return Feedback(
        value=found,
        rationale=f"'hello' present: {found}",
        metadata={},
    )

📊 4. Create Dummy Evaluation Data


In [13]:
# ============================================================
# 4. Dummy evaluation dataset
# ============================================================

# One row per (prompt, response) pair. Each row carries the nested
# {"inputs": {"prompt": ...}, "outputs": {"response": ...}} layout that the
# scorers read from.
df = pd.DataFrame(
    [
        {"inputs": {"prompt": prompt}, "outputs": {"response": response}}
        for prompt, response in (
            ("Explain MLflow", "Hello! MLflow helps track experiments."),
            ("2 + 2", "The answer is 4."),
            ("Say hello", "hello world"),
            ("Check scoring", "code scoring example"),
        )
    ]
)

🧪 5. Run MLflow GenAI Evaluation (Scorers)


In [14]:
# ============================================================
# 5. Run evaluation with scorers
# ============================================================

# Evaluate the dummy dataset with both custom scorers inside a dedicated run.
with mlflow.start_run(run_name="scorers-eval"):
    result = mlflow.genai.evaluate(data=df, scorers=[response_length, contains_hello])

# Echo the metrics returned by the evaluation, one per line.
print("Logged metrics:")
for metric_name, metric_value in result.metrics.items():
    print(f"  {metric_name}: {metric_value}")

Logged metrics:
  response_length/mean: 21.25
  contains_hello/mean: 0.5
