### Initializing Project Client

In [None]:
import os
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from azure.ai.agents.models import ListSortOrder
from dotenv import load_dotenv

# Load settings from a local .env file (if present) before reading os.environ.
load_dotenv()

# Client for the Azure AI project identified by AZURE_AI_PROJECT.
project = AIProjectClient(
    credential=DefaultAzureCredential(),
    endpoint=os.environ["AZURE_AI_PROJECT"],
)

# Fetch the existing agent whose ID is supplied through AGENT_ID.
agent_id = os.environ["AGENT_ID"]
agent = project.agents.get_agent(agent_id)
print(f"Connected to agent, ID: {agent.id}")

# Open a new thread; the conversation cell posts its message here.
thread = project.agents.threads.create()
print(f"Created thread, ID: {thread.id}")

Connected to agent, ID: asst_GHYn52a8aVGYJehtyWYjuDGw
Created thread, ID: thread_EqEPlob7PsZqPGMIgZ0Z1nw0


### Conversation with Agent

In [2]:
# Question sent to the agent; change this to try a different prompt.
MESSAGE = "What are the approved email domains for MS Teams verification when resetting a password?"

# Post the question to the thread as the user.
message = project.agents.messages.create(
    thread_id=thread.id,
    role="user",
    content=MESSAGE,
)
print(f"Created message, ID: {message.id}")

# Create a run for this agent on the thread and let the SDK process it.
run = project.agents.runs.create_and_process(
    thread_id=thread.id,
    agent_id=agent.id,
)

if run.status == "failed":
    print(f"Run failed: {run.last_error}")
else:
    print(f"Run finished with status: {run.status}")

    # Ascending order so the transcript prints in conversation order.
    messages = project.agents.messages.list(
        thread_id=thread.id,
        order=ListSortOrder.ASCENDING,
    )

    # A distinct loop variable keeps `message` bound to the user message
    # created above instead of rebinding it to the last entry in the thread.
    for thread_message in messages:
        # Only entries that carry text content are printed.
        if thread_message.text_messages:
            print(f"{thread_message.role}: {thread_message.text_messages[-1].text.value}")

# The run ID is reused later when converting this run into evaluation data.
print(f"Run ID: {run.id}")


Created message, ID: msg_DO06rmom28Bs4qlHLIdZk9wq
Run finished with status: RunStatus.COMPLETED
MessageRole.USER: What are the approved email domains for MS Teams verification when resetting a password?
MessageRole.AGENT: Okay, no worries, I’ll just check our system for the approved email domains you can use for MS Teams verification during a password reset. Give me a sec while I look that up.
MessageRole.AGENT: All good, I've found the info for you. The only approved email domains for MS Teams verification when resetting a password are:

- @asahi.com.au
- @asahi.com.nz
- @asahibeverages.com

So, any Teams message used for verification must come from one of those internal domains. Let me know if you need anything else!【5:13†KB0013699 Password and MFA Reset - Process Changes and Secure Verification 1.md】【5:2†KB0013703 Password & MFA Reset Process - End User Article.md】
Run ID: run_skCnqavcps6lT3rgfKQDCKXY


### Get data from agent

In [18]:
from azure.ai.evaluation import AIAgentConverter

# IDs of the conversation and run produced above, plus the JSONL file name
# used when exporting evaluation data.
thread_id = thread.id
run_id = run.id
file_name = "evaluation_data.jsonl"

# Converter backed by the project client.
converter = AIAgentConverter(project)

# Convert the data of that single agent run.
evaluation_data_single_run = converter.convert(
    thread_id=thread_id,
    run_id=run_id,
)

In [None]:
# Export the agent thread's data to a JSONL file for the evaluation step.

import json

evaluation_data = converter.prepare_evaluation_data(
    thread_ids=thread_id,
    filename=file_name,
)
# Uncomment to inspect the exported records:
# print(json.dumps(evaluation_data, indent=4))

### Setting up evaluator

We will select the following evaluators to assess the aspects most relevant to agent quality (the RAG and general-purpose evaluators configured below complement them):

- [Intent resolution](https://aka.ms/intentresolution-sample): measures the extent to which an agent identifies the correct intent from a user query. Scale: integer 1-5. Higher is better.
- [Tool call accuracy](https://aka.ms/toolcallaccuracy-sample): evaluates the agent’s ability to select the appropriate tools, and process correct parameters from previous steps. Scale: float 0-1. Higher is better.
- [Task adherence](https://aka.ms/taskadherence-sample): measures the extent to which an agent’s final response adheres to the task based on its system message and a user query. Scale: integer 1-5. Higher is better.


In [None]:
from azure.ai.evaluation import (
    ToolCallAccuracyEvaluator,
    AzureOpenAIModelConfiguration,
    IntentResolutionEvaluator,
    TaskAdherenceEvaluator,
    RetrievalEvaluator,
    DocumentRetrievalEvaluator,
    GroundednessEvaluator,
    RelevanceEvaluator,
    ResponseCompletenessEvaluator,
    CoherenceEvaluator,
    FluencyEvaluator,
    QAEvaluator,
    HateUnfairnessEvaluator,
    SexualEvaluator,
    ViolenceEvaluator,
    SelfHarmEvaluator,
    ProtectedMaterialEvaluator,
    IndirectAttackEvaluator,
    DirectAttackEvaluator
)
from pprint import pprint

# Azure OpenAI deployment that the evaluators below are configured with;
# every setting comes from the environment.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["MODEL_DEPLOYMENT_NAME"],
)

# Project endpoint; required by the content safety evaluators and passed to
# evaluate() in the run cell.
azure_ai_project = os.environ["AZURE_AI_PROJECT"]

# Layer 1: agent evaluators
intent_resolution = IntentResolutionEvaluator(model_config=model_config)
tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)
task_adherence = TaskAdherenceEvaluator(model_config=model_config)

# Layer 2: RAG evaluators
retrieval = RetrievalEvaluator(model_config=model_config)
groundedness = GroundednessEvaluator(model_config=model_config)
relevance = RelevanceEvaluator(model_config=model_config)
response_completeness = ResponseCompletenessEvaluator(model_config=model_config)

# Layer 3: general-purpose evaluators
coherence = CoherenceEvaluator(model_config=model_config)
fluency = FluencyEvaluator(model_config=model_config)
qa = QAEvaluator(model_config=model_config)



In [31]:
# Range of the ground-truth relevance labels; the document retrieval
# evaluator takes both bounds as inputs.
ground_truth_label_min = 0
ground_truth_label_max = 4

document_retrieval = DocumentRetrievalEvaluator(
    # Ground-truth label range
    ground_truth_label_min=ground_truth_label_min,
    ground_truth_label_max=ground_truth_label_max,
    # Optional overrides of the binarization thresholds for pass/fail output
    ndcg_threshold=0.5,
    xdcg_threshold=50.0,
    fidelity_threshold=0.5,
    top1_relevance_threshold=50.0,
    top3_max_relevance_threshold=50.0,
    total_retrieved_documents_threshold=50,
    total_ground_truth_documents_threshold=50,
)


### Run Evaluator

In [None]:
from azure.ai.evaluation import evaluate

# JSONL file exported from the agent thread; restated here so this cell can be
# re-run without re-running the export cells.
file_name = "evaluation_data.jsonl"

# Run every configured evaluator over the exported data. Passing
# azure_ai_project associates the run with the Azure AI project.
# NOTE(review): some evaluators below (e.g. document_retrieval, qa,
# response_completeness) usually expect ground-truth or retrieved-document
# columns; confirm the exported JSONL provides them or supply a column mapping.
response = evaluate(
    data=file_name,
    evaluators={
        # Layer 1: Agent evaluators
        "intent_resolution": intent_resolution,
        "task_adherence": task_adherence,
        "tool_call_accuracy": tool_call_accuracy,

        # Layer 2: RAG evaluators
        "retrieval": retrieval,
        "document_retrieval": document_retrieval,
        "groundedness": groundedness,
        "relevance": relevance,
        "response_completeness": response_completeness,

        # Layer 3: General purpose
        "coherence": coherence,
        "fluency": fluency,
        "qa": qa,
    },
    azure_ai_project=azure_ai_project,
)

# print() rather than pprint(): pprint would show the string's quoted repr.
print(f'AI Foundry URL: {response.get("studio_url")}')



## Inspect results on Azure AI Foundry

Open the AI Foundry URL printed above to explore the evaluation results in Azure AI Foundry. Its rich data visualization lets you inspect the evaluation scores and reasoning, so you can quickly identify bugs and issues in your agent and fix them.

In [29]:
# Alternative to the portal: read the evaluation results from memory.

# Scores averaged across all runs.
aggregate_metrics = response["metrics"]
pprint(aggregate_metrics)