

https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk

In [None]:
import os
import json
import time
from azure.identity import DefaultAzureCredential
from azure.ai.projects import AIProjectClient
from azure.ai.agents.models import FunctionTool
from azure.ai.evaluation import AIAgentConverter, IntentResolutionEvaluator
from dotenv import load_dotenv

# Read settings from the .env file located two directories above this notebook.
load_dotenv("../../.env")

# Project endpoint used to build the AIProjectClient; the lookup raises KeyError
# if AI_PROJECT_ENDPOINT is not set.
ai_project_endpoint = os.environ["AI_PROJECT_ENDPOINT"]

# Create a fake fetch_weather function
def fetch_weather(location: str) -> str:
    """
    Return canned weather for ``location`` serialized as JSON.

    This is a stand-in for a real weather service: only a handful of cities
    are known, and every other location yields a fixed fallback message.

    :param location: Name of the place to look up (exact match).
    :return: JSON string of the form ``{"weather": "<description>"}``.
    """
    # Hard-coded demo data; no network call is made.
    known_forecasts = {
        "New York": "Sunny, 25°C",
        "London": "Cloudy, 18°C",
        "Tokyo": "Rainy, 22°C",
    }
    if location in known_forecasts:
        forecast = known_forecasts[location]
    else:
        forecast = "Weather data not available for this location."
    return json.dumps({"weather": forecast})

# Callables the agent is allowed to invoke as tools.
user_functions = {fetch_weather}

# Wrap the callables in a FunctionTool; its definitions are passed to the agent later.
functions = FunctionTool(functions=user_functions)

# Client for the AI project, authenticated with DefaultAzureCredential.
project_client = AIProjectClient(
    credential=DefaultAzureCredential(),
    endpoint=ai_project_endpoint,
)

with project_client:

    # Create an agent that can call the custom function tool(s).
    agent = project_client.agents.create_agent(
        model="gpt-4o",
        name="my-agent-to-be-evaluated",
        instructions="You are a helpful agent",
        tools=functions.definitions,
    )
    print(f"Created agent, ID: {agent.id}")

    # Create a thread for communication
    thread = project_client.agents.threads.create()
    print(f"Created thread, ID: {thread.id}")

    # Send a message to the thread
    message = project_client.agents.messages.create(
        thread_id=thread.id,
        role="user",
        content="Hello, what about the weather in New York?",
    )
    print(f"Created message, ID: {message['id']}")

    run = project_client.agents.runs.create(thread_id=thread.id, agent_id=agent.id)
    print(f"Created run, ID: {run.id}")

    # Poll while the run is in a non-terminal state, answering any tool calls
    # the run asks for along the way.
    while run.status in ["queued", "in_progress", "requires_action"]:
        time.sleep(1)
        run = project_client.agents.runs.get(thread_id=thread.id, run_id=run.id)

        if run.status == "requires_action":
            tool_calls = run.required_action.submit_tool_outputs.tool_calls
            tool_outputs = []
            for tool_call in tool_calls:
                if tool_call.function.name == "fetch_weather":
                    # Honour the arguments the model actually requested instead of a
                    # hard-coded city; the SDK delivers them as a JSON document string.
                    call_args = json.loads(tool_call.function.arguments or "{}")
                    output = fetch_weather(call_args.get("location", ""))
                    tool_outputs.append({"tool_call_id": tool_call.id, "output": output})
            project_client.agents.runs.submit_tool_outputs(thread_id=thread.id, run_id=run.id, tool_outputs=tool_outputs)

    print(f"Run completed with status: {run.status}")
    if run.status == "failed":
        # Surface the reason so a failed run is not silently converted below.
        print(f"Run failed: {run.last_error}")

    # Fetch and log all messages from the thread (oldest first).
    # A distinct loop name keeps `message` bound to the user message created above.
    messages = project_client.agents.messages.list(thread_id=thread.id, order="asc")
    for thread_message in messages:
        print(f"Role: {thread_message['role']}")
        print(f"Content: {thread_message['content']}")
        print("-" * 40)

    # Initialize the converter for Azure AI agents.
    converter = AIAgentConverter(project_client)

    # Convert this single run into evaluator input; `converted_data` is reused by later cells.
    thread_id = thread.id
    run_id = run.id
    converted_data = converter.convert(thread_id, run_id)

    # Write the thread's evaluation input data to a JSONL file in the current
    # working directory; `filename` is reused by the batch evaluation cell.
    filename = os.path.join(os.getcwd(), "evaluation_input_data.jsonl")
    evaluation_data = converter.prepare_evaluation_data(thread_ids=thread_id, filename=filename)
    print(f"Evaluation data saved to {filename}")


Created agent, ID: asst_QNxGQefOZrxb8xknXjGGPcer
Created thread, ID: thread_WZ1lL3DLldHTDLqTX7Q21hUP
Created message, ID: msg_WvDdkXeip6CN6qKxBqDrTKmN
Created run, ID: run_vAhKjAJ75l5ev9loCKcTi50O
Run completed with status: RunStatus.COMPLETED
Role: user
Content: [{'type': 'text', 'text': {'value': 'Hello, what about the weather in New York?', 'annotations': []}}]
----------------------------------------
Role: assistant
Content: [{'type': 'text', 'text': {'value': 'The weather in New York is currently sunny with a temperature of 25°C.', 'annotations': []}}]
----------------------------------------


Class AIAgentConverter: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class FDPAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AIAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Evaluation data saved to c:\Users\ruplisso\Documents\GitHub\genaiops-exercises2\solutions\06_Agents_Evaluator\evaluation_input_data.jsonl


Display Converted Data

In [3]:
# Display the value returned by converter.convert(thread_id, run_id) in the first code cell.
converted_data

{'query': [{'role': 'system', 'content': 'You are a helpful agent'},
  {'createdAt': '2025-08-04T14:28:33Z',
   'role': 'user',
   'content': [{'type': 'text',
     'text': 'Hello, what about the weather in New York?'}]}],
 'response': [{'createdAt': '2025-08-04T14:28:36Z',
   'run_id': 'run_vAhKjAJ75l5ev9loCKcTi50O',
   'role': 'assistant',
   'content': [{'type': 'tool_call',
     'tool_call_id': 'call_t1BOebQ12iMQ7Jd3TqqmZE91',
     'name': 'fetch_weather',
     'arguments': {'location': 'New York'}}]},
  {'createdAt': '2025-08-04T14:28:39Z',
   'run_id': 'run_vAhKjAJ75l5ev9loCKcTi50O',
   'tool_call_id': 'call_t1BOebQ12iMQ7Jd3TqqmZE91',
   'role': 'tool',
   'content': [{'type': 'tool_result',
     'tool_result': {'weather': 'Sunny, 25°C'}}]},
  {'createdAt': '2025-08-04T14:28:40Z',
   'run_id': 'run_vAhKjAJ75l5ev9loCKcTi50O',
   'role': 'assistant',
   'content': [{'type': 'text',
     'text': 'The weather in New York is currently sunny with a temperature of 25°C.'}]}],
 'tool_def

In [4]:
# This is specific to agentic workflows.
from azure.ai.evaluation import IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator

# Other quality, risk, and safety metrics:
from azure.ai.evaluation import RelevanceEvaluator, CoherenceEvaluator, CodeVulnerabilityEvaluator, ContentSafetyEvaluator, IndirectAttackEvaluator, FluencyEvaluator
from azure.identity import DefaultAzureCredential

import os
from dotenv import load_dotenv
# Load environment variables using python-dotenv's default .env lookup.
load_dotenv()

# Azure OpenAI settings handed to every evaluator built in this cell.
# os.getenv yields None for any variable that is not set.
model_config = dict(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

# Settings for an "o4-mini" deployment. NOTE: nothing in this cell references
# this config; the evaluators below are constructed with `model_config`.
reasoning_model_config = dict(
    azure_deployment="o4-mini",
    api_key=os.getenv("AZURE_API_KEY"),
    azure_endpoint=os.getenv("AZURE_ENDPOINT"),
    api_version=os.getenv("AZURE_API_VERSION"),
)

# One instance per agent-specific evaluator, keyed by the evaluator's class name.
quality_evaluators = {
    evaluator_cls.__name__: evaluator_cls(model_config=model_config)
    for evaluator_cls in (IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator)
}

# Azure AI Foundry (non-Hub) project endpoints have the form
# https://your-account.services.ai.azure.com/api/projects/your-project

# Evaluators to run; currently a copy of the quality evaluators only.
quality_and_safety_evaluators = dict(quality_evaluators)

# Run each evaluator on the single converted run and print its result as indented JSON.
# `converted_data` and `json` come from the first code cell.
for name, evaluator in quality_and_safety_evaluators.items():
    result = evaluator(**converted_data)
    print(name, json.dumps(result, indent=4), sep="\n")

Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


IntentResolutionEvaluator
{
    "intent_resolution": 5.0,
    "intent_resolution_result": "pass",
    "intent_resolution_threshold": 3,
    "intent_resolution_reason": "User wanted the current weather in New York. Agent provided accurate and relevant information, including conditions and temperature, fully resolving the intent with clarity and precision."
}
TaskAdherenceEvaluator
{
    "task_adherence": 5.0,
    "task_adherence_result": "pass",
    "task_adherence_threshold": 3,
    "task_adherence_reason": "The assistant correctly identified the task, used the appropriate tool, and provided a clear and accurate weather update for New York."
}
ToolCallAccuracyEvaluator
{
    "tool_call_accuracy": 5.0,
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_reason": "Let's think step by step: The user asked about the weather in New York. The agent correctly called the 'fetch_weather' tool with the parameter 'location' set to 'New York', wh

In [7]:
import os
from dotenv import load_dotenv
from azure.ai.evaluation import evaluate
from azure.ai.evaluation import (
    ToolCallAccuracyEvaluator,
    IntentResolutionEvaluator,
    TaskAdherenceEvaluator,
)

# Load environment variables using python-dotenv's default .env lookup.
load_dotenv()

# Project endpoint passed to evaluate() as `azure_ai_project`,
# e.g. https://your-account.services.ai.azure.com/api/projects/your-project
ai_project_endpoint = os.environ["AI_PROJECT_ENDPOINT"]

# Evaluator instances for the batch run; `model_config` is defined in the previous code cell.
intent_resolution = IntentResolutionEvaluator(model_config=model_config)
tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)
task_adherence = TaskAdherenceEvaluator(model_config=model_config)

# Evaluate the JSONL file written by the first code cell (`filename`).
response = evaluate(
    evaluation_name="agent demo - batch run",
    data=filename,
    evaluators=dict(
        tool_call_accuracy=tool_call_accuracy,
        intent_resolution=intent_resolution,
        task_adherence=task_adherence,
    ),
    # Optional: log the results to the Azure AI Foundry project for rich visualization.
    azure_ai_project=ai_project_endpoint,
)

# Inspect the average scores at a high level.
print(response["metrics"])

# Use the URL to inspect the results on the UI.
print(f"AI Foundry URL: {response.get('studio_url')}")

2025-08-04 16:33:15 +0200   15556 execution.bulk     INFO     Finished 1 / 3 lines.
2025-08-04 16:33:15 +0200   15556 execution.bulk     INFO     Average execution time for completed lines: 3.89 seconds. Estimated time for incomplete lines: 7.78 seconds.
2025-08-04 16:33:15 +0200   15556 execution.bulk     INFO     Finished 2 / 3 lines.
2025-08-04 16:33:15 +0200   15556 execution.bulk     INFO     Average execution time for completed lines: 1.96 seconds. Estimated time for incomplete lines: 1.96 seconds.
2025-08-04 16:33:15 +0200   15556 execution.bulk     INFO     Finished 3 / 3 lines.
2025-08-04 16:33:15 +0200   15556 execution.bulk     INFO     Average execution time for completed lines: 1.36 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-08-04 16:33:16 +0200   11444 execution.bulk     INFO     Finished 1 / 3 lines.
2025-08-04 16:33:16 +0200   11444 execution.bulk     INFO     Average execution time for completed lines: 4.31 seconds. Estimated time for incomplete li

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "tool_call_accuracy_20250804_143311_826648"
Run status: "Completed"
Start time: "2025-08-04 14:33:11.826648+00:00"
Duration: "0:00:07.524565"


{
    "tool_call_accuracy": {
        "status": "Completed",
        "duration": "0:00:07.524565",
        "completed_lines": 3,
        "failed_lines": 0,
        "log_path": null
    },
    "intent_resolution": {
        "status": "Completed",
        "duration": "0:00:04.582877",
        "completed_lines": 3,
        "failed_lines": 0,
        "log_path": null
    },
    "task_adherence": {
        "status": "Completed",
        "duration": "0:00:04.540960",
        "completed_lines": 3,
        "failed_lines": 0,
        "log_path": null
    }
}


{'tool_call_accuracy.tool_call_accuracy': 5.0, 'tool_call_accuracy.tool_call_accuracy_threshold': 3.0, 'intent_resolution.intent_resolution': 5.0, 'intent_resolution.intent_resolution_threshold': 3.0, 'task_adherence.task_adherence': 5.0, 'task_adherence.task_adherence_threshold': 3.0, 