# File: docs/examples/evals-sdk/coherence_evaluator.py
"""Example: scoring agent responses with the CoherenceEvaluator."""

from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import CoherenceEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Setup telemetry: record agent spans in memory so they can be mapped back
# into evaluable sessions after each task run.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter


# 1. Define a task function
def user_task_function(case: Case) -> dict:
    """Run the agent on a single case and return its output and trajectory."""
    # IMPORTANT: trace_attributes with session IDs are required when using
    # StrandsInMemorySessionMapper to prevent spans from different test cases
    # from being mixed together in the memory exporter.
    agent = Agent(
        trace_attributes={
            "gen_ai.conversation.id": case.session_id,
            "session.id": case.session_id,
        },
        callback_handler=None,
    )
    response = agent(case.input)

    # Rebuild this case's session from the recorded spans; the mapper keeps
    # only spans tagged with this case's session_id.
    spans = memory_exporter.get_finished_spans()
    session = StrandsInMemorySessionMapper().map_to_session(spans, session_id=case.session_id)

    return {"output": str(response), "trajectory": session}


# 2. Create test cases
test_cases = [
    Case[str, str](
        name="multi-step-reasoning",
        input="Explain how photosynthesis works and why it is important for life on Earth.",
        metadata={"category": "coherence"},
    ),
    Case[str, str](
        name="compare-contrast",
        input="Compare and contrast renewable and non-renewable energy sources.",
        metadata={"category": "coherence"},
    ),
]

# 3. Create evaluators
evaluators = [CoherenceEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(user_task_function)
reports[0].run_display()
# File: docs/examples/evals-sdk/correctness_evaluator.py
"""Example: scoring agent responses with the CorrectnessEvaluator."""

from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import CorrectnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Setup telemetry: record agent spans in memory so they can be mapped back
# into evaluable sessions after each task run.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter


# 1. Define a task function
def user_task_function(case: Case) -> dict:
    """Run the agent on a single case and return its output and trajectory."""
    # IMPORTANT: trace_attributes with session IDs are required when using
    # StrandsInMemorySessionMapper to prevent spans from different test cases
    # from being mixed together in the memory exporter.
    agent = Agent(
        trace_attributes={
            "gen_ai.conversation.id": case.session_id,
            "session.id": case.session_id,
        },
        callback_handler=None,
    )
    response = agent(case.input)

    # Rebuild this case's session from the recorded spans; the mapper keeps
    # only spans tagged with this case's session_id.
    spans = memory_exporter.get_finished_spans()
    session = StrandsInMemorySessionMapper().map_to_session(spans, session_id=case.session_id)

    return {"output": str(response), "trajectory": session}


# 2. Create test cases
test_cases = [
    Case[str, str](
        name="math-1",
        input="What is 25 * 4?",
        metadata={"category": "math"},
    ),
    Case[str, str](
        name="math-2",
        input="Calculate the square root of 144",
        metadata={"category": "math"},
    ),
]

# 3. Create evaluators
evaluators = [CorrectnessEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(user_task_function)
reports[0].run_display()
# File: docs/examples/evals-sdk/correctness_with_assertions_evaluator.py
"""Example: CorrectnessEvaluator in assertion mode via expected_assertion."""

from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import CorrectnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Setup telemetry: record agent spans in memory so they can be mapped back
# into evaluable sessions after each task run.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter


# 1. Define a task function
def user_task_function(case: Case) -> dict:
    """Run the agent on a single case and return its output and trajectory."""
    # IMPORTANT: trace_attributes with session IDs are required when using
    # StrandsInMemorySessionMapper to prevent spans from different test cases
    # from being mixed together in the memory exporter.
    agent = Agent(
        trace_attributes={
            "gen_ai.conversation.id": case.session_id,
            "session.id": case.session_id,
        },
        callback_handler=None,
    )
    response = agent(case.input)

    # Rebuild this case's session from the recorded spans; the mapper keeps
    # only spans tagged with this case's session_id.
    spans = memory_exporter.get_finished_spans()
    session = StrandsInMemorySessionMapper().map_to_session(spans, session_id=case.session_id)

    return {"output": str(response), "trajectory": session}


# 2. Create test cases with expected_assertion
# When expected_assertion is provided, the evaluator uses assertion mode:
# it judges whether the agent's response is correct by comparing it to the
# expected assertion, using a binary CORRECT/INCORRECT rubric rather than
# the 3-level basic rubric.
test_cases = [
    Case[str, str](
        name="math-1",
        input="What is 25 * 4?",
        expected_assertion="The agent should return the correct answer of 100.",
    ),
    Case[str, str](
        name="math-2",
        input="Calculate the square root of 144",
        expected_assertion="The agent should return the correct answer of 12.",
    ),
]

# 3. Create evaluators
evaluators = [CorrectnessEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(user_task_function)
reports[0].run_display()
# File: docs/examples/evals-sdk/goal_success_rate_with_assertions_evaluator.py
"""Example: GoalSuccessRateEvaluator in assertion mode via expected_assertion."""

from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import GoalSuccessRateEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Setup telemetry: record agent spans in memory so they can be mapped back
# into evaluable sessions after each task run.
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter


# 1. Define a task function
def user_task_function(case: Case) -> dict:
    """Run the agent on a single case and return its output and trajectory."""
    # IMPORTANT: trace_attributes with session IDs are required when using
    # StrandsInMemorySessionMapper to prevent spans from different test cases
    # from being mixed together in the memory exporter.
    agent = Agent(
        trace_attributes={
            "gen_ai.conversation.id": case.session_id,
            "session.id": case.session_id,
        },
        callback_handler=None,
    )
    response = agent(case.input)

    # Rebuild this case's session from the recorded spans; the mapper keeps
    # only spans tagged with this case's session_id.
    spans = memory_exporter.get_finished_spans()
    session = StrandsInMemorySessionMapper().map_to_session(spans, session_id=case.session_id)

    return {"output": str(response), "trajectory": session}


# 2. Create test cases with expected_assertion
# When expected_assertion is provided, the evaluator uses assertion mode:
# it judges whether the agent's behavior satisfies the specified success
# assertions rather than inferring goals from the conversation.
test_cases = [
    Case[str, str](
        name="math-1",
        input="What is 25 * 4?",
        expected_assertion="The agent should return the correct answer of 100.",
    ),
    Case[str, str](
        name="math-2",
        input="Calculate the square root of 144",
        expected_assertion="The agent should return the correct answer of 12.",
    ),
]

# 3. Create evaluators
evaluators = [GoalSuccessRateEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(user_task_function)
reports[0].run_display()