https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk

### Create a custom Python function and register it in the Toolset

In [4]:
from azure.ai.agents.models import FunctionTool, ToolSet
from typing import Set, Callable, Any
import json

# Define a custom Python function.
def fetch_weather(location: str) -> str:
    """
    Fetches the weather information for the specified location.

    :param location (str): The location to fetch weather for.
    :return: Weather information as a JSON string.
    :rtype: str
    """
    # NOTE(review): the docstring above is kept verbatim; FunctionTool presumably
    # derives the tool description from it -- confirm before rewording.
    # Mocked lookup table standing in for a real weather API call.
    known_conditions = {
        "Seattle": "Sunny, 25°C",
        "London": "Cloudy, 18°C",
        "Tokyo": "Rainy, 22°C",
    }
    if location in known_conditions:
        report = known_conditions[location]
    else:
        # Unknown locations get a fixed explanatory message instead of an error.
        report = "Weather data not available for this location."
    return json.dumps({"weather": report})

# Collect every callable the agent is allowed to invoke.
user_functions: Set[Callable[..., Any]] = {fetch_weather}

# Wrap the callables in a FunctionTool and register it on a fresh ToolSet.
toolset = ToolSet()
functions = FunctionTool(user_functions)
toolset.add(functions)

### Create the agent, initiate the thread, add a message, and run

In [5]:
import os
import azure.ai.agents 
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Load environment variables from the .env file two directories up.
load_dotenv('../../.env')

# Endpoint copied from your Azure AI Foundry project.
# The AI_PROJECT_ENDPOINT environment variable must be set; a missing value raises KeyError here.
project_endpoint = os.environ["AI_PROJECT_ENDPOINT"]

# Create an AIProjectClient instance
project_client = AIProjectClient(
    endpoint=project_endpoint,
    credential=DefaultAzureCredential(),  # Use Azure Default Credential for authentication
)

# Create an agent with the toolset
agent = project_client.agents.create_agent(
    model="gpt-4o",  # Model deployment name
    name="my-agent-to-be-evaluated",  # Name of the agent
    instructions="You are a helpful agent",  # Instructions for the agent
    toolset=toolset,
)
print(f"Created agent, ID: {agent.id}")

# Enable auto function calls for the agent
project_client.agents.enable_auto_function_calls(toolset)

# Create a thread for communication
thread = project_client.agents.threads.create()
print(f"Created thread, ID: {thread.id}")

# Add a message to the thread
message = project_client.agents.messages.create(
    thread_id=thread.id,
    role="user",  # Role of the message sender
    content="What is the weather in Tokyo today?",  # Message content
)
print(f"Created message, ID: {message['id']}")

# Create and process an agent run
run = project_client.agents.runs.create_and_process(thread_id=thread.id, agent_id=agent.id)
print(f"Run finished with status: {run.status}")

# Check if the run failed
if run.status == "failed":
    print(f"Run failed: {run.last_error}")

# Fetch and log all messages in ascending order.
# A dedicated loop variable keeps `message` bound to the message created above
# instead of silently rebinding it to the last listed message.
messages = project_client.agents.messages.list(thread_id=thread.id, order="asc")
for thread_message in messages:
    print(f"Role: {thread_message['role']}")
    print(f"Content: {thread_message['content']}")
    print("-" * 40)


Created agent, ID: asst_Dijx1EyMEpnp7SsUuOldRTKH
Created thread, ID: thread_6cmoMfK4qazW7MJztGOrSzk8
Created message, ID: msg_mcNTtQoSxXwug3Ju1UrebhZE
Run finished with status: RunStatus.COMPLETED
Role: user
Content: [{'type': 'text', 'text': {'value': 'What is the weather in Tokyo today?', 'annotations': []}}]
----------------------------------------
Role: assistant
Content: [{'type': 'text', 'text': {'value': 'The weather in Tokyo today is rainy with a temperature of 22°C.', 'annotations': []}}]
----------------------------------------


### Get the converted data from the run/thread id

In [6]:
from azure.ai.evaluation import AIAgentConverter

# Build a converter bound to the project client created earlier.
converter = AIAgentConverter(project_client)

# Capture the identifiers of the thread and of the run executed on it.
thread_id, run_id = thread.id, run.id

# Convert that single run; the result is unpacked as evaluator keyword arguments later on.
converted_data = converter.convert(thread_id, run_id)

Class AIAgentConverter: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class FDPAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AIAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


### Evaluate a single agent run

In [7]:
# This is specific to agentic workflows.
from azure.ai.evaluation import IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator

import os
from dotenv import load_dotenv
load_dotenv()

# Model configuration handed to every evaluator; all values come from environment variables.
model_config = dict(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

# Evaluators with standard model support, keyed by their class name.
quality_evaluators = {}
for evaluator_class in (IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator):
    quality_evaluators[evaluator_class.__name__] = evaluator_class(model_config=model_config)

# Only quality evaluators are registered in this cell; this copy is the set that gets run.
quality_and_safety_evaluators = dict(quality_evaluators)

# Run each evaluator on the converted run data and pretty-print its result.
for name, evaluator in quality_and_safety_evaluators.items():
    result = evaluator(**converted_data)
    print(name, json.dumps(result, indent=4), sep="\n")

Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


IntentResolutionEvaluator
{
    "intent_resolution": 5.0,
    "intent_resolution_result": "pass",
    "intent_resolution_threshold": 3,
    "intent_resolution_reason": "User wanted to know the weather in Tokyo today. Agent provided an accurate and complete response, including both the condition (rainy) and temperature (22\u00b0C), fully resolving the intent."
}
TaskAdherenceEvaluator
{
    "task_adherence": 5.0,
    "task_adherence_result": "pass",
    "task_adherence_threshold": 3,
    "task_adherence_reason": "The assistant correctly identified the task, used the appropriate tool, and provided a clear and accurate weather update for Tokyo, fully satisfying the user's request."
}
ToolCallAccuracyEvaluator
{
    "tool_call_accuracy": 5.0,
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_reason": "Let's think step by step: The user asked for the weather in Tokyo today. The agent correctly called the 'fetch_weather' tool with the app

### Evaluate multiple agent runs or threads

First, convert your agent thread data into a file via our converter support:

In [8]:
# Target file for the agent output (evaluation input data), placed in the current working directory.
output_dir = os.getcwd()
filename = os.path.join(output_dir, "evaluation_input_data.jsonl")

# Ask the converter to prepare the evaluation data for the thread, passing the target file path.
evaluation_data = converter.prepare_evaluation_data(thread_ids=thread_id, filename=filename)

print(f"Evaluation data saved to {filename}")

Evaluation data saved to c:\Users\ruplisso\Documents\GitHub\genaiops-exercises2\solutions\06_Agents_Evaluator\evaluation_input_data.jsonl


Leverage the Batch evaluate API for asynchronous evaluation.

In [9]:
import os
from dotenv import load_dotenv
from azure.ai.evaluation import evaluate
from azure.ai.evaluation import (
    ToolCallAccuracyEvaluator,
    IntentResolutionEvaluator,
    TaskAdherenceEvaluator,
)

load_dotenv()
ai_project_endpoint = os.environ["AI_PROJECT_ENDPOINT"]

# One evaluator instance per metric, all sharing the same model configuration.
intent_resolution = IntentResolutionEvaluator(model_config=model_config)
tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)
task_adherence = TaskAdherenceEvaluator(model_config=model_config)

# Evaluators keyed by the name each one is registered under for the batch run.
batch_evaluators = dict(
    tool_call_accuracy=tool_call_accuracy,
    intent_resolution=intent_resolution,
    task_adherence=task_adherence,
)

response = evaluate(
    data=filename,
    evaluation_name="agent demo - batch run",
    evaluators=batch_evaluators,
    # optionally, log your results to your Azure AI Foundry project for rich visualization
    azure_ai_project=ai_project_endpoint,
)

# Inspect the average scores at a high level.
print(response["metrics"])

# Use the URL to inspect the results on the UI.
print(f'AI Foundry URL: {response.get("studio_url")}')

2025-08-05 14:36:57 +0200    2364 execution.bulk     INFO     Finished 1 / 7 lines.
2025-08-05 14:36:57 +0200    2364 execution.bulk     INFO     Average execution time for completed lines: 10.32 seconds. Estimated time for incomplete lines: 61.92 seconds.
2025-08-05 14:36:57 +0200    2364 execution.bulk     INFO     Finished 2 / 7 lines.
2025-08-05 14:36:57 +0200    2364 execution.bulk     INFO     Average execution time for completed lines: 5.17 seconds. Estimated time for incomplete lines: 25.85 seconds.
2025-08-05 14:36:57 +0200    2364 execution.bulk     INFO     Finished 3 / 7 lines.
2025-08-05 14:36:57 +0200    2364 execution.bulk     INFO     Average execution time for completed lines: 3.45 seconds. Estimated time for incomplete lines: 13.8 seconds.
2025-08-05 14:36:57 +0200    2364 execution.bulk     INFO     Finished 4 / 7 lines.
2025-08-05 14:36:57 +0200    2364 execution.bulk     INFO     Average execution time for completed lines: 2.59 seconds. Estimated time for incomplet

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "tool_call_accuracy_20250805_123647_183602"
Run status: "Completed"
Start time: "2025-08-05 12:36:47.183602+00:00"
Duration: "0:00:13.774186"


{
    "tool_call_accuracy": {
        "status": "Completed",
        "duration": "0:00:13.774186",
        "completed_lines": 7,
        "failed_lines": 0,
        "log_path": null
    },
    "intent_resolution": {
        "status": "Completed",
        "duration": "0:00:10.962442",
        "completed_lines": 7,
        "failed_lines": 0,
        "log_path": null
    },
    "task_adherence": {
        "status": "Completed",
        "duration": "0:00:11.922411",
        "completed_lines": 7,
        "failed_lines": 0,
        "log_path": null
    }
}


{'tool_call_accuracy.tool_call_accuracy': 5.0, 'tool_call_accuracy.tool_call_accuracy_threshold': 3.0, 'intent_resolution.intent_resolution': 5.0, 'intent_resolution.intent_resolution_threshold': 3.0, 'task_adherence.task_adherence': 5.0, 'task_adherence.task_adherence_threshold': 3.0, 