https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk

### Create a custom Python function and register it in the Toolset

In [9]:
import os
import json
import requests
from azure.ai.agents.models import FunctionTool, ToolSet
from typing import Set, Callable, Any
from dotenv import load_dotenv

# Load variables from the .env file two directories up into the process environment.
load_dotenv(dotenv_path='../../.env')

# API key for the weather service; None when WEATHER_API_KEY is not defined.
weather_api_key = os.getenv("WEATHER_API_KEY")

# Define a custom Python function.
def get_weather(city: str) -> str:
    """
    Fetches the current weather information for the specified city.

    :param city: The city to fetch weather for.
    :return: Weather information as a JSON string ("condition" and "temperature" keys, or "error" on failure).
    :rtype: str
    """
    # Fail fast with a readable message instead of sending an unauthenticated request.
    if not weather_api_key:
        return json.dumps({"error": "WEATHER_API_KEY is not set"})

    try:
        # Passing `params` lets requests URL-encode the city (spaces, accents, '&', ...).
        # HTTPS keeps the API key out of clear text; the timeout keeps a stalled
        # connection from blocking the agent run forever.
        response = requests.get(
            "https://api.weatherapi.com/v1/current.json",
            params={"key": weather_api_key, "q": city, "aqi": "no"},
            timeout=10,
        )
        response.raise_for_status()
        current = response.json()["current"]
        # Serialize explicitly so the return value matches the declared `str` contract.
        return json.dumps(
            {
                "condition": current["condition"]["text"],
                "temperature": current["temp_c"],
            }
        )
    except requests.exceptions.RequestException as e:
        # The exception text can contain the request URL; redact the key before it
        # is handed back to the agent and stored in the thread.
        return json.dumps({"error": str(e).replace(weather_api_key, "***")})
    except (KeyError, TypeError, ValueError) as e:
        # The service answered, but not with the expected payload shape.
        return json.dumps({"error": f"Unexpected response from weather API: {e!r}"})
    
# Callables the agent is allowed to invoke as tools.
user_functions: Set[Callable[..., Any]] = set()
user_functions.add(get_weather)

# Wrap the callables in a FunctionTool and register that tool in a fresh ToolSet.
functions = FunctionTool(user_functions)
toolset = ToolSet()
toolset.add(functions)

### Create the agent, initiate the thread, add a message, and run

In [10]:
import os
import azure.ai.agents 
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

load_dotenv('../../.env')

# Endpoint of the Azure AI Foundry project, copied from the project page.
# Read from the AI_PROJECT_ENDPOINT environment variable; raises KeyError if it is not set.
project_endpoint = os.environ["AI_PROJECT_ENDPOINT"]

# Create an AIProjectClient instance
project_client = AIProjectClient(
    endpoint=project_endpoint,
    credential=DefaultAzureCredential(),  # Use Azure Default Credential for authentication
)

# Create an agent with the toolset
agent = project_client.agents.create_agent(
    model="gpt-4o",  # Model deployment name
    name="my-agent-to-be-evaluated",  # Name of the agent
    instructions="You are a helpful agent",  # Instructions for the agent
    toolset=toolset
)
print(f"Created agent, ID: {agent.id}")

# Enable auto function calls so tool calls requested during the run are executed locally
project_client.agents.enable_auto_function_calls(toolset)

# Create a thread for communication
thread = project_client.agents.threads.create()
print(f"Created thread, ID: {thread.id}")

# Add the user's question to the thread
message = project_client.agents.messages.create(
    thread_id=thread.id,
    role="user",  # Role of the message sender
    content="What is the weather in Tokyo today?",  # Message content
)
print(f"Created message, ID: {message['id']}")

# Create and process an agent run
run = project_client.agents.runs.create_and_process(thread_id=thread.id, agent_id=agent.id)
print(f"Run finished with status: {run.status}")

# Check if the run failed
if run.status == "failed":
    print(f"Run failed: {run.last_error}")

# Fetch and log all messages, oldest first.
# A dedicated loop variable keeps `message` bound to the user message created above
# instead of silently rebinding it to the last message in the thread.
messages = project_client.agents.messages.list(thread_id=thread.id, order="asc")
for thread_message in messages:
    print(f"Role: {thread_message['role']}")
    print(f"Content: {thread_message['content']}")
    print("-" * 40)


### Get the converted data from the run/thread id

In [11]:
from azure.ai.evaluation import AIAgentConverter

# Identify the conversation to convert: the thread and the run executed on it.
thread_id = thread.id
run_id = run.id

# Build a converter bound to the project client and convert that run's data
# into the input format consumed by the evaluators.
converter = AIAgentConverter(project_client)
converted_data = converter.convert(thread_id, run_id)

### Evaluate a single agent run

In [12]:
# This is specific to agentic workflows.
from azure.ai.evaluation import IntentResolutionEvaluator, TaskAdherenceEvaluator, ToolCallAccuracyEvaluator

import os
from dotenv import load_dotenv
load_dotenv()

# Azure OpenAI settings handed to every evaluator; each value comes from the environment
# (os.getenv yields None for any variable that is not set).
model_config = dict(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

# One evaluator instance per class, keyed by the class name.
quality_evaluators = {
    evaluator_cls.__name__: evaluator_cls(model_config=model_config)
    for evaluator_cls in (
        IntentResolutionEvaluator,
        TaskAdherenceEvaluator,
        ToolCallAccuracyEvaluator,
    )
}

# Combined mapping of evaluators to run; only the quality evaluators are included here.
quality_and_safety_evaluators = dict(quality_evaluators)

# Run each evaluator on the converted run data and pretty-print its result.
for name, evaluator in quality_and_safety_evaluators.items():
    result = evaluator(**converted_data)
    print(name)
    print(json.dumps(result, indent=4))

### Evaluate multiple agent runs or threads

First, convert your agent thread data into a file using the converter:

In [13]:
# Absolute path, inside the current working directory, of the file that will
# hold the agent output (the evaluation input data).
filename = os.path.abspath("evaluation_input_data.jsonl")

# Convert the thread's data and write it to that file.
evaluation_data = converter.prepare_evaluation_data(thread_ids=thread_id, filename=filename)

print("Evaluation data saved to", filename)

Leverage the Batch evaluate API for asynchronous evaluation.

In [None]:
import os
from dotenv import load_dotenv
from azure.ai.evaluation import evaluate
from azure.ai.evaluation import (
    ToolCallAccuracyEvaluator,
    IntentResolutionEvaluator,
    TaskAdherenceEvaluator,
)

load_dotenv()
ai_project_endpoint = os.environ["AI_PROJECT_ENDPOINT"]

# One instance of each evaluator, all configured with the same model settings.
intent_resolution = IntentResolutionEvaluator(model_config=model_config)
tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)
task_adherence = TaskAdherenceEvaluator(model_config=model_config)

# Run every evaluator over the saved evaluation input file in one batch call.
response = evaluate(
    data=filename,
    evaluation_name="agent demo - batch run",
    evaluators=dict(
        tool_call_accuracy=tool_call_accuracy,
        intent_resolution=intent_resolution,
        task_adherence=task_adherence,
    ),
    # Optional: log the results to the Azure AI Foundry project for rich visualization.
    azure_ai_project=ai_project_endpoint,
)

# Aggregate metrics give a high-level view of the scores.
print(response["metrics"])

# Link for inspecting the results in the UI.
print("AI Foundry URL:", response.get("studio_url"))

Created agent, ID: asst_72W0slNHmZUlo0rpoGDrjI5y
Created thread, ID: thread_UzwYxcUKqf7GND5eYtl2Fd3H
Created message, ID: msg_eLeoQ6QYU1Wfa3aEc0yEVN9l
Run finished with status: RunStatus.COMPLETED
Role: user
Content: [{'type': 'text', 'text': {'value': 'What is the weather in Tokyo today?', 'annotations': []}}]
----------------------------------------
Role: assistant
Content: [{'type': 'text', 'text': {'value': 'The weather in Tokyo today is clear, with a temperature of 28.0°C.', 'annotations': []}}]
----------------------------------------
IntentResolutionEvaluator
{
    "intent_resolution": 5.0,
    "intent_resolution_result": "pass",
    "intent_resolution_threshold": 3,
    "intent_resolution_reason": "User wanted the current weather in Tokyo. Agent provided a clear, accurate response with temperature and conditions, fully resolving the intent without any gaps or issues."
}
TaskAdherenceEvaluator
{
    "task_adherence": 5.0,
    "task_adherence_result": "pass",
    "task_adherence_