In [None]:
# Dependencies. `%pip` (rather than `!pip`) installs into the running kernel's environment.
%pip install openai
# For development: install the Scorecard SDK from a local checkout in the home directory.
%pip install ~/scorecard-python
# For production: install the published package instead (uncomment the line below
# and comment out the local install above).
# %pip install scorecard-ai

In [2]:
import os

# API keys are read from the environment so that secrets are not saved in the notebook.
# Export OPENAI_API_KEY / SCORECARD_API_KEY before starting Jupyter, or replace the
# empty-string fallbacks below (and avoid committing the filled-in notebook).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
SCORECARD_API_KEY = os.environ.get("SCORECARD_API_KEY", "")

In [3]:
from typing_extensions import List

from openai import OpenAI

from scorecard_ai import Scorecard
from scorecard_ai.lib import run_and_evaluate

# API clients used by the rest of the notebook, authenticated with the keys defined above.
scorecard = Scorecard(bearer_token=SCORECARD_API_KEY)
# NOTE: this binds the name `openai` to a client instance, not to the `openai` package.
openai = OpenAI(api_key=OPENAI_API_KEY)

In [4]:
# Fill in your Project ID and Metric IDs.
# PROJECT_ID is passed as `project_id` when creating the Testset and the Run below.
PROJECT_ID: str = ""
# METRIC_IDS is passed as `metric_ids` to `run_and_evaluate` below.
METRIC_IDS: List[str] = []

In [5]:
# The "system under test" -- the AI system that you want to evaluate.
def run_system(input):
    """Rewrite a message in the requested tone using the OpenAI Responses API.

    Args:
        input: Mapping of Testcase input fields. Must contain ``"original"``
            (the message to rewrite) and ``"tone"`` (the target tone); may
            contain ``"recipient"``.

    Returns:
        A dict with one key, ``"rewritten"``, holding the model's text output.

    Raises:
        KeyError: If ``"original"`` or ``"tone"`` is missing from ``input``.
    """
    instructions = f"You are a tone translator that converts a user's message to a different tone ({input['tone']})."
    # "recipient" is optional in the Testset schema. Only mention it when it is
    # present, so the prompt never reads "Address the recipient: None".
    recipient = input.get("recipient")
    if recipient:
        instructions += f" Address the recipient: {recipient}"
    response = openai.responses.create(
        model="gpt-4o-mini",
        instructions=instructions,
        input=input["original"],
    )
    return {"rewritten": response.output_text}

In [None]:
# Describe the Testset's schema for the tone-rewriting use case, then create the Testset.
testset_field_mapping = {
    # Inputs: the fields that represent the input to the AI system.
    "inputs": ["original", "recipient", "tone"],
    # Labels: the fields that represent the expected output of the AI system.
    "labels": ["idealRewritten"],
    # Metadata: fields used for grouping Testcases; the AI system does not see them.
    "metadata": [],
}
testset_json_schema = {
    "type": "object",
    "properties": {
        "original": {"type": "string"},  # The original message.
        "recipient": {"type": "string"},  # The recipient of the message.
        "tone": {"type": "string"},  # The tone to rewrite the message in.
        "idealRewritten": {"type": "string"},  # The ideal AI-generated rewrite.
    },
    # "recipient" is intentionally not listed here, so Testcases may omit it.
    "required": ["original", "tone", "idealRewritten"],
}
testset = scorecard.testsets.create(
    project_id=PROJECT_ID,
    name="Tone rewriter testset",
    description="Testcases about rewriting messages in a different tone.",
    field_mapping=testset_field_mapping,
    json_schema=testset_json_schema,
)

print(testset)  # noqa: T201

In [7]:
# Testcase payloads for the Testset created above; each row is sent as one Testcase's `json_data`.
testcase_rows = [
    {
        "original": "We need your feedback on the new designs ASAP.",
        "tone": "polite",
        "recipient": "Darius",
        "idealRewritten": "Hi Darius, your feedback is crucial to the success of the new designs. Please share your thoughts as soon as possible.",
    },
    {
        "original": "I'll be late to the office because my cat is sleeping on my keyboard.",
        "tone": "funny",
        "recipient": "team",
        # Deliberate typo: the required `idealRewritten` field is missing, so this
        # Testcase should come back with a validation error.
        "fieldNameWithTypo": "Hey team! My cat's napping on my keyboard and I'm just waiting for her to give me permission to leave. I'll be a bit late!",
    },
    {
        # No "recipient" here.
        "original": "Schedule a meeting to discuss this project.",
        "tone": "casual",
        "idealRewritten": "Let's find a time to chat about the project. Coffee or boba?",
    },
]

# Add the Testcases to the Testset.
testcase_response = scorecard.testcases.create(
    testset_id=testset.id,
    items=[{"json_data": row} for row in testcase_rows],
)

In [None]:
# Create a new Run on the Testset with the given Metrics, using `run_system` as the system under test.
run_response = run_and_evaluate(
    client=scorecard,
    project_id=PROJECT_ID,
    testset_id=testset.id,
    metric_ids=METRIC_IDS,
    system=run_system,
)
run_url = run_response["url"]
print(f"Go to {run_url} and click 'Run Scoring' to grade your Records.")  # noqa: T201

In [None]:
# Async version of the above cell.
# The code below is wrapped in a triple-quoted string so that it does not execute.
# To try it, remove the surrounding quotes. It uses top-level `await`.
# NOTE(review): confirm that your kernel supports top-level `await` before enabling it.

"""

from scorecard_ai.lib import async_run_and_evaluate
from scorecard_ai import AsyncScorecard

async_scorecard = AsyncScorecard(
    bearer_token=SCORECARD_API_KEY
)

run_response = await async_run_and_evaluate(
    client=async_scorecard,
    project_id=PROJECT_ID,
    testset_id=testset.id,
    metric_ids=METRIC_IDS,
    system=lambda input: run_system(input)
)
print(f"Go to {run_response['url']} and click 'Run Scoring' to grade your Records.")  # noqa: T201

"""