In [None]:
from vals import Suite, Test, Check, RunParameters, RunStatus, Run
from vals.sdk.auth import configure_credentials
from dotenv import load_dotenv
import json
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fall back to an empty string so configure_credentials always receives a str.
vals_api_key = os.getenv("VALS_API_KEY") or ""

# Never echo the secret itself: notebook outputs are saved with the file and are
# easily committed or shared. Only report whether a key was found.
print("Sourced vals api key: %s" % ("<set>" if vals_api_key else "<missing>"))
print("Sourced vals env=%s" % os.getenv("VALS_ENV"))

configure_credentials(vals_api_key)

In [None]:
"""
Default functionality for running a test suite. Goes over
- Pulling a test suite from an id
- Creating a test suite from scratch
- Starting a run from a test suite
- Waiting for a run to complete
- Fetching the results of a run
- Fetching the test results from a run
- Fetching the qa pairs from a run
- Exporting the results to either JSON or CSV
"""

In [None]:
# Pull an existing test suite by its id.
# The id below looks like a placeholder — replace it with the id of a real suite.
SUITE_ID = "xxxx-xxx-42d4-a341-4b233fb88205"
# `suite` is reused by later cells (e.g. to start a run).
suite = await Suite.from_id(SUITE_ID)

In [None]:
# Or create a test suite
suite = Suite(
    title="Test Suite",
    global_checks=[Check(operator="grammar")],
    tests=[
        Test(
            input_under_test="What is QSBS?",
            checks=[Check(operator="equals", criteria="QSBS")],
        ),
        Test(
            input_under_test="What is an 83 election?",
            checks=[Check(operator="equals", criteria="QSBS")],
        ),
    ],
)

await suite.create()

In [None]:
# Specify the parameters you would like to use for the run
parameters = RunParameters(
    parallelism=3,
    max_output_tokens=2048,
    custom_parameters={
        "top_p": 0.5,
        "text": {"verbosity": "low"},
        "reasoning": {"effort": "low", "summary": "auto"},
    },
)

# Model thats going to be under test
model = "openai/gpt-5-mini-2025-08-07"

run = await suite.run(model=model, parameters=parameters)

# Wait for the run to complete before returning the results
completed_status = await run.wait_for_run_completion()

if completed_status == RunStatus.SUCCESS:
    print("Run completed successfully. Status=%s" % completed_status)
else:
    print("Run failed. Status=%s" % completed_status)

In [None]:
# Pulling the run using the id is supported
RUN_ID = "xxxx-xxx-4709-924e-64d3ed837774"
run = await Run.from_id(RUN_ID)
run_status = run.status


print("Run status: %s" % run_status)
print("Can also visit the run result page directly at %s" % run.url)


print("===Averaged metadata inside of the run===")
print("Average duration: %s" % run.average_duration)
print("Average input tokens: %s" % run.average_input_tokens)
print("Average output tokens: %s" % run.average_output_tokens)

In [None]:
# Pull the test results from the run
test_results = await run.test_results

for test_result in test_results[:2]:
    print(
        "Question=%s, Answer=%s"
        % (test_result.input_under_test[0:50], test_result.llm_output[0:100])
    )

In [None]:
# Filter the results by tags, operators and string
tags_to_filter = ["tag1", "tag2"]
operators_to_filter = ["INCLUDES_ANY"]
search_string = "What is burden shifting under Title VII?"

filtered_test_results = await run.fetch_test_results(
    operators=operators_to_filter, tags=tags_to_filter, search=search_string
)

print("Fetched %s test results" % len(filtered_test_results))
print(filtered_test_results)


In [None]:
# Can also retrieve just the question answer pairs
qa_pairs = await run.qa_pairs

for qa_pair in qa_pairs[:2]:
    print(
        "Question=%s, Answer=%s"
        % (qa_pair.input_under_test[0:50], qa_pair.llm_output[0:100])
    )

In [None]:
# Supports two file format types
os.makedirs("data_files", exist_ok=True)

# Json
json_data = await run.fetch_json()
with open("data_files/run.json", "w") as f:
    json.dump(json_data, f, indent=2)

# Csv
run_result_df, test_results_df = await run.fetch_csv()
combined = (
    run_result_df.to_csv(index=False) + "\n" + test_results_df.to_csv(index=False)
)

with open("data_files/run.csv", "w") as f:
    f.write(combined)

In [None]:
# List runs using search params

SUITE_ID = "xxxx-xxx-42d4-a341-4b233fb88205"
status = RunStatus.SUCCESS
model_under_test = "openai/gpt-4o-mini"

results = await Run.list_runs(
    suite_id=SUITE_ID,
    model_under_test=model_under_test,
    status=status,
)

for result in results[:10]:
    print(str(result))

In [None]:
"""
Extra functionality for runs; use these when the default options are not sufficient

- Custom model functions
- Starting a run from question answer pairs
"""

In [None]:
from vals.sdk.types import QuestionAnswerPair
from typing import Any
from io import BytesIO
from openai import OpenAI

In [None]:
# Helper that, given an input, returns a response from a model.
# Not every parameter is needed; the full signature shows everything that can be passed in.
async def query_model(
    input: str, files: dict[str, BytesIO], context: dict[str, Any]
) -> str:
    """Send ``input`` as a single user message to ``gpt-4o-mini`` and return the reply text.

    Args:
        input: Prompt text, sent as the content of one user message.
        files: Accepted as part of the signature; not used by this implementation.
        context: Accepted as part of the signature; not used by this implementation.

    Returns:
        The message content of the first choice in the completion.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is not set.
        ValueError: If the completion comes back with empty or missing content.
    """
    # Local import keeps this cell self-contained.
    import asyncio

    openai_api_key = os.getenv("OPENAI_API_KEY")

    # Explicit raise rather than `assert`: assertions are stripped under `python -O`.
    if openai_api_key is None:
        raise RuntimeError("OPENAI_API_KEY is not set")

    openai_client = OpenAI(api_key=openai_api_key)

    # The call below is made synchronously, so run it in a worker thread.
    # Calling it directly inside this coroutine would block the event loop and
    # stall every other task running concurrently.
    response = await asyncio.to_thread(
        openai_client.chat.completions.create,
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": input}],
    )

    content = response.choices[0].message.content
    if not content:
        raise ValueError("No response from model")

    return content

In [None]:
# Custom model function; it is the callable later passed as `model=` when running the suite
async def custom_model(
    input: str, files: dict[str, BytesIO], context: dict[str, Any]
) -> str:
    """
    Produce the answer for one input by delegating to ``query_model``.

    ``files`` and ``context`` are forwarded unchanged, so anything attached to the
    test you created earlier is available when generating the response.
    """

    # Hand all three arguments to the query_model helper and return its reply
    answer = await query_model(input, files, context)
    return answer

In [None]:
# Create the suite, specifying custom operators inside the checks if needed
suite = Suite(
    title="Test Suite with custom operators",
    tests=[
        Test(
            input_under_test="What is QSBS?",
            checks=[Check(operator="equals", criteria="QSBS")],
            context={
                "message_history": [
                    {"role": "user", "content": "What is QSBS?"},
                    {"role": "assistant", "content": "QSBS is a company."},
                ]
            },
            files_under_test=["../data_files/postmoney_safe.docx"],
        ),
    ],
)

await suite.create()

In [None]:
# Pass in the custom model function we created earlier
run = await suite.run(
    model=custom_model,
    wait_for_completion=True,
    parameters=RunParameters(parallelism=3),
)

print(f"Run URL: {run.url}")
print(f"Pass rate: {run.pass_rate}")

for test_result in await run.test_results:
    print(
        f"Question={test_result.input_under_test[0:50]}, Answer={test_result.llm_output[0:100]}"
    )

In [None]:
# You can also start a run with previous data by constructing a question answer pair
# This also supports metadata, output context, and more
qa_pairs = [
    QuestionAnswerPair(
        input_under_test="What is QSBS?",
        llm_output="QSBS is a company.",
        # If there are duplicated questions with unique files, provide the file ids here to match the question answer pair to the test
    )
]

# model name can be arbitrary, evaluation will be done inside of the platform using the question and answer pair set that you uploaded
run = await suite.run(
    model=qa_pairs, model_name="gpt-4o-mini", wait_for_completion=True
)

print(f"Run URL: {run.url}")
print(f"Pass rate: {run.pass_rate}")