### Initializing Project Client

In [None]:
import os
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from azure.ai.agents.models import ListSortOrder
from dotenv import load_dotenv

# Load environment variables (python-dotenv) before any os.environ lookups below.
load_dotenv()

# Project client: the credential is built first, then the endpoint is read
# from the AZURE_AI_PROJECT environment variable.
project = AIProjectClient(
    credential=DefaultAzureCredential(),
    endpoint=os.environ["AZURE_AI_PROJECT"],
)

# Fetch the agent identified by AGENT_ID and report its id as a connection check.
agent_id = os.environ["AGENT_ID"]
agent = project.agents.get_agent(agent_id)
print(f"Connected to agent, ID: {agent.id}")

# Optional: create a fresh thread instead of reusing an existing one.
# thread = project.agents.threads.create()
# print(f"Created thread, ID: {thread.id}")

Connected to agent, ID: asst_GHYn52a8aVGYJehtyWYjuDGw


### Get data from agent

In [None]:
# Placeholder: replace with the ID of the agent thread whose conversation
# should be converted into evaluation input data.
thread_id = "thread_id"

In [3]:
from azure.ai.evaluation import AIAgentConverter

# Converter backed by the project client created earlier.
converter = AIAgentConverter(project)

# The converted agent output (the evaluation input data) is written to this
# JSONL file in the current working directory.
filename = os.path.join(os.getcwd(), "evaluation_input_data.jsonl")

evaluation_data = converter.prepare_evaluation_data(
    thread_ids=thread_id,
    filename=filename,
)

print(f"Evaluation data saved to {filename}")

# To inspect the converted records inline (requires `import json`):
# print(json.dumps(evaluation_data, indent=4))

Class AIAgentConverter: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class FDPAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AIAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Evaluation data saved to /Users/rivyesch/Dev/eval-azure-foundry/evaluation_input_data.jsonl


## Post-process the converter output data

In [8]:
import importlib
# pprint is used at the end of this cell; import it here so the cell does not
# depend on a later cell having been executed first (Restart & Run All safe).
from pprint import pprint

import postprocess_evaluation_jsonl as pp

# Reload so edits to the local helper module are picked up without a kernel
# restart, then bind `postprocess` from the freshly reloaded module.
pp = importlib.reload(pp)
postprocess = pp.postprocess

# Input: the JSONL written by the converter cell; outputs go to the working directory.
in_path = os.path.join(os.getcwd(), "evaluation_input_data.jsonl")
out_dir = os.getcwd()

postprocessed = postprocess(in_path, out_dir, rag_snippets_k=5, snippet_max_chars=2000, debug=True)
pprint(postprocessed)


=== Processing item 0 ===
Top-level keys: ['query', 'response', 'tool_definitions']
Query extracted: True
Response extracted: True
Query preview: hi...
Response preview: Hi there, you're through to ZenBot. What can I help you with today?...
Explicit context found: False
Context from tool results: True
Number of context snippets: 5
First snippet preview: [Source: KB0013005 Printing using uniFLOW Printer.md]
# Printing using uniFLOW Printer

This article guides you to print a file using uniFLOW and release print jobs at the physical uniFLOW-enabled p...
✓ Added to RAG rows

=== Processing item 1 ===
Top-level keys: ['query', 'response', 'tool_definitions']
Query extracted: True
Response extracted: True
Query preview: How do I print a file using uniFLOW from my computer?...
Response preview: No worries, I can walk you through printing with uniFLOW. First up, just open the file you want to p...
Explicit context found: False
Context from tool results: True
Number of context snippets: 5
Fir

### Setting up evaluator

We will select the following evaluators to assess the different aspects relevant for agent quality: 

- [Intent resolution](https://aka.ms/intentresolution-sample): measures the extent to which an agent identifies the correct intent from a user query. Scale: integer 1-5. Higher is better.
- [Tool call accuracy](https://aka.ms/toolcallaccuracy-sample): evaluates the agent’s ability to select the appropriate tools, and process correct parameters from previous steps. Scale: float 0-1. Higher is better.
- [Task adherence](https://aka.ms/taskadherence-sample): measures the extent to which an agent’s final response adheres to the task based on its system message and a user query. Scale: integer 1-5. Higher is better.


In [None]:
import time
from pprint import pprint

from azure.ai.evaluation import (
    ToolCallAccuracyEvaluator,
    AzureOpenAIModelConfiguration,
    IntentResolutionEvaluator,
    TaskAdherenceEvaluator,
    RetrievalEvaluator,
    DocumentRetrievalEvaluator,
    GroundednessEvaluator,
    RelevanceEvaluator,
    ResponseCompletenessEvaluator,
    CoherenceEvaluator,
    FluencyEvaluator,
    QAEvaluator,
    HateUnfairnessEvaluator,
    SexualEvaluator,
    ViolenceEvaluator,
    SelfHarmEvaluator,
    ProtectedMaterialEvaluator,
    IndirectAttackEvaluator,
    ContentSafetyEvaluator,
    CodeVulnerabilityEvaluator,
)
from azure.ai.evaluation.simulator import DirectAttackSimulator

# Judge-model configuration passed to every evaluator that takes `model_config`.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["MODEL_DEPLOYMENT_NAME"],
)
# Needed to use content safety evaluators
azure_ai_project = os.environ["AZURE_AI_PROJECT"]
credential = DefaultAzureCredential()

# Agent evaluators
intent_resolution_eval = IntentResolutionEvaluator(model_config=model_config)
tool_call_accuracy_eval = ToolCallAccuracyEvaluator(model_config=model_config)
task_adherence_eval = TaskAdherenceEvaluator(model_config=model_config)

# RAG evaluators
retrieval_eval = RetrievalEvaluator(model_config=model_config)
groundedness_eval = GroundednessEvaluator(model_config=model_config)
relevance_eval = RelevanceEvaluator(model_config=model_config)
# response_completeness_eval = ResponseCompletenessEvaluator(model_config=model_config)

# General-purpose quality evaluators
coherence_eval = CoherenceEvaluator(model_config=model_config)
fluency_eval = FluencyEvaluator(model_config=model_config)
# qa_eval = QAEvaluator(model_config=model_config)

# Safety & security: every class below takes the same project + credential pair,
# so build the keyword arguments once instead of repeating them on each line.
SAFETY_THRESHOLD = 3
_safety_kwargs = {"azure_ai_project": azure_ai_project, "credential": credential}

hate_unfairness_eval = HateUnfairnessEvaluator(**_safety_kwargs, threshold=SAFETY_THRESHOLD)
sexual_eval = SexualEvaluator(**_safety_kwargs, threshold=SAFETY_THRESHOLD)
violence_eval = ViolenceEvaluator(**_safety_kwargs, threshold=SAFETY_THRESHOLD)
self_harm_eval = SelfHarmEvaluator(**_safety_kwargs, threshold=SAFETY_THRESHOLD)
content_safety_eval = ContentSafetyEvaluator(**_safety_kwargs, threshold=SAFETY_THRESHOLD)
protected_material_eval = ProtectedMaterialEvaluator(**_safety_kwargs)
# NOTE(review): DirectAttackSimulator comes from azure.ai.evaluation.simulator and
# is an adversarial simulator, not a row-level evaluator. The name is kept so that
# existing references still resolve, but it should not be passed to evaluate().
direct_attack_simulator_eval = DirectAttackSimulator(**_safety_kwargs)
indirect_attack_eval = IndirectAttackEvaluator(**_safety_kwargs)
code_vulnerability_eval = CodeVulnerabilityEvaluator(**_safety_kwargs)


# # The min and max of the label scores are inputs to document retrieval evaluator
# ground_truth_label_min = 0
# ground_truth_label_max = 4

# document_retrieval_eval = DocumentRetrievalEvaluator(
#     # Specify the ground truth label range
#     ground_truth_label_min=ground_truth_label_min, 
#     ground_truth_label_max=ground_truth_label_max,
#     # Optionally override the binarization threshold for pass/fail output
#     ndcg_threshold = 0.5,
#     xdcg_threshold = 50.0,
#     fidelity_threshold = 0.5,
#     top1_relevance_threshold = 50.0,
#     top3_max_relevance_threshold = 50.0,
#     total_retrieved_documents_threshold = 50,
#     total_ground_truth_documents_threshold = 50
# )

Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ResponseCompletenessEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class HateUnfairnessEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


In [None]:
# Evaluate by group using trimmed inputs (keeps runs small and targeted)
from azure.ai.evaluation import evaluate

out_dir = os.getcwd()

def eval_group(file_path: str, evaluators: dict, name: str):
    """Run a named group of evaluators against one JSONL file.

    Args:
        file_path: Path to the JSONL data file handed to ``evaluate``.
        evaluators: Mapping of evaluator label to evaluator instance.
        name: Human-readable label for this run; only used in the banner line.

    Returns:
        Whatever ``evaluate`` returns for this run.
    """
    dataset_name = os.path.basename(file_path)
    print(f"\n=== Running {name} on {dataset_name} ===")

    result = evaluate(data=file_path, evaluators=evaluators, azure_ai_project=azure_ai_project)

    # The key may be absent, in which case None is printed.
    studio_url = result.get('studio_url')
    print(f"Studio URL: {studio_url}")
    return result

def _run_one_at_a_time(file_path: str, named_evaluators: list, prefix: str, pause_s: float = 1.0) -> None:
    """Run each (label, evaluator) pair as its own eval_group call, pausing after each.

    Running evaluators sequentially reduces concurrency, and the pause between
    runs is there to avoid rate limits.
    """
    for eval_name, evaluator in named_evaluators:
        _ = eval_group(
            file_path,
            {eval_name: evaluator},
            name=f"{prefix}_{eval_name}",
        )
        time.sleep(pause_s)


# General Purpose and Safety & Security (uses only query/response)
general_file = os.path.join(out_dir, "general_qa.jsonl")
if os.path.exists(general_file):
    # The direct-attack simulator is intentionally not listed here: it is a
    # simulator, not an evaluator, and cannot be invoked per data row by evaluate().
    _ = eval_group(
        general_file,
        {
            "fluency": fluency_eval,
            "coherence": coherence_eval,
            "hate_unfairness": hate_unfairness_eval,
            "sexual": sexual_eval,
            "violence": violence_eval,
            "self_harm": self_harm_eval,
            "content_safety": content_safety_eval,
            "protected_material": protected_material_eval,
            "indirect_attack": indirect_attack_eval,
            "code_vulnerability": code_vulnerability_eval,
        },
        name="general_qa",
    )

# Agent basics (query, response, tool_definitions/tool_calls)
agent_file = os.path.join(out_dir, "agent_basic.jsonl")
if os.path.exists(agent_file):
    agent_evaluators = [
        ("intent_resolution", intent_resolution_eval),
        ("task_adherence", task_adherence_eval),
        ("tool_call_accuracy", tool_call_accuracy_eval),
    ]
    _run_one_at_a_time(agent_file, agent_evaluators, prefix="agent_basic")

# RAG core (query, response, context)
rag_file = os.path.join(out_dir, "rag_core.jsonl")
if os.path.exists(rag_file):
    rag_evaluators = [
        ("retrieval", retrieval_eval),
        ("groundedness", groundedness_eval),
        ("relevance", relevance_eval),
    ]
    _run_one_at_a_time(rag_file, rag_evaluators, prefix="rag_core")

# Document Retrieval (needs ground truth and retrieved documents)
docret_file = os.path.join(out_dir, "document_retrieval.jsonl")
if os.path.exists(docret_file):
    # Built here, only when the input file exists, because the setup cell keeps
    # this evaluator commented out. The label range mirrors the values in that
    # commented-out configuration (0..4).
    document_retrieval_eval = DocumentRetrievalEvaluator(
        ground_truth_label_min=0,
        ground_truth_label_max=4,
    )
    _ = eval_group(
        docret_file,
        {"document_retrieval": document_retrieval_eval},
        name="document_retrieval",
    )

# Response Completeness (needs ground truth)
respcomp_file = os.path.join(out_dir, "response_completeness.jsonl")
if os.path.exists(respcomp_file):
    # Built here for the same reason: the setup cell leaves it commented out.
    response_completeness_eval = ResponseCompletenessEvaluator(model_config=model_config)
    _ = eval_group(
        respcomp_file,
        {"response_completeness": response_completeness_eval},
        name="response_completeness",
    )


=== Running safety_security on safety_security.jsonl ===
2025-10-30 11:26:09 +0800 6296760320 execution.bulk     INFO     Finished 8 / 18 lines.
2025-10-30 11:26:09 +0800 6296760320 execution.bulk     INFO     Average execution time for completed lines: 5.02 seconds. Estimated time for incomplete lines: 50.2 seconds.
2025-10-30 11:26:11 +0800 6296760320 execution.bulk     INFO     Finished 16 / 18 lines.
2025-10-30 11:26:11 +0800 6296760320 execution.bulk     INFO     Average execution time for completed lines: 2.63 seconds. Estimated time for incomplete lines: 5.26 seconds.
2025-10-30 11:26:16 +0800 6296760320 execution.bulk     INFO     Finished 17 / 18 lines.
2025-10-30 11:26:16 +0800 6296760320 execution.bulk     INFO     Average execution time for completed lines: 2.74 seconds. Estimated time for incomplete lines: 2.74 seconds.
2025-10-30 11:26:16 +0800 6296760320 execution.bulk     INFO     Finished 18 / 18 lines.
2025-10-30 11:26:16 +0800 6296760320 execution.bulk     INFO     

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "hate_unfairness_20251030_032529_750345"
Run status: "Completed"
Start time: "2025-10-30 03:25:29.750345+00:00"
Duration: "0:00:47.577615"


{
    "hate_unfairness": {
        "status": "Completed",
        "duration": "0:00:47.577615",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    }
}


Studio URL: None

=== Running agent_basic on agent_basic.jsonl ===
2025-10-30 11:26:24 +0800 13069578240 execution.bulk     INFO     Finished 18 / 18 lines.
2025-10-30 11:26:24 +0800 13069578240 execution.bulk     INFO     Average execution time for completed lines: 0.0 seconds. Estimated time for incomplete lines: 0.0 seconds.

Are you seeing that printer in your list?

Are you seeing that printer in your list?

Once you've done that, you'll need to head over to any King Street Studios printer. Are you near one now, or do you want to know what to do when you get there?

Once you've done that, you'll need to head over to any King Street Studios printe

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "intent_resolution_20251030_032624_097495"
Run status: "Completed"
Start time: "2025-10-30 03:26:24.097495+00:00"
Duration: "0:00:04.004214"

2025-10-30 11:26:28 +0800 13052751872 execution.bulk     INFO     Finished 11 / 18 lines.
2025-10-30 11:26:28 +0800 13052751872 execution.bulk     INFO     Average execution time for completed lines: 0.38 seconds. Estimated time for incomplete lines: 2.66 seconds.
2025-10-30 11:26:28 +0800 13052751872 execution.bulk     INFO     Finished 12 / 18 lines.
2025-10-30 11:26:28 +0800 13052751872 execution.bulk     INFO     Average execution time for completed lines: 0.36 seconds. Estimated time for incomplete lines: 2.16 seconds.
2025-10-30 11:26:28 +0800 13052751872 execution.bulk     INFO     Finished 13 / 18 lines.
2025-10-30 11:26:28 +0800 13052751872 execution.bulk     INFO     Average execution time for completed lines: 0.33 seconds. Estimated time for incomplete lines: 1.65 seconds.
2025-10-30 11:26:28 +0800 13052751872 execution.bulk

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "task_adherence_20251030_032624_099802"
Run status: "Completed"
Start time: "2025-10-30 03:26:24.099802+00:00"
Duration: "0:00:05.007529"


{
    "intent_resolution": {
        "status": "Completed",
        "duration": "0:00:04.004214",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    },
    "task_adherence": {
        "status": "Completed",
        "duration": "0:00:05.007529",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    },
    "tool_call_accuracy": {
        "status": "Completed",
        "duration": "0:00:01.001499",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    }
}


Studio URL: None

=== Running rag_core on rag_core.jsonl ===
2025-10-30 11:26:36 +0800 13103230976 execution.bulk     INFO     Finished 3 / 18 lines.
2025-10-30 11:26:36 +0800 13103230976 execution.bulk     INFO     Average execution time for completed lines: 0.59 seconds. Estimated time

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "retrieval_20251030_032634_738033"
Run status: "Completed"
Start time: "2025-10-30 03:26:34.738033+00:00"
Duration: "0:00:55.081521"


{
    "retrieval": {
        "status": "Completed",
        "duration": "0:00:55.081521",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    },
    "groundedness": {
        "status": "Completed",
        "duration": "0:00:54.078665",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    },
    "relevance": {
        "status": "Completed",
        "duration": "0:00:05.006573",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    }
}


Studio URL: None


### Run Evaluator

In [6]:
from azure.ai.evaluation import evaluate

# Raw converter output: one JSONL record per evaluated item.
file_name = "evaluation_input_data.jsonl"

# Use the evaluator instances created in the setup cell (the *_eval names).
# The un-suffixed names are never defined in this notebook and raise NameError
# on a fresh kernel.
response = evaluate(
    data=file_name,
    evaluators={
        "tool_call_accuracy": tool_call_accuracy_eval,
        "intent_resolution": intent_resolution_eval,
        "task_adherence": task_adherence_eval,
    },
    azure_ai_project=azure_ai_project,
)
# print rather than pprint: pprint renders the string's repr (quoted and split
# across lines), which makes the URL awkward to copy.
print(f'AI Foundary URL: {response.get("studio_url")}')



2025-10-29 01:02:01 +0800 6238679040 execution.bulk     INFO     Finished 14 / 18 lines.
2025-10-29 01:02:01 +0800 6238679040 execution.bulk     INFO     Average execution time for completed lines: 0.01 seconds. Estimated time for incomplete lines: 0.04 seconds.
Traceback (most recent call last):
  File "/Users/rivyesch/Dev/eval-azure-foundry/.venv/lib/python3.13/site-packages/azure/ai/evaluation/_legacy/prompty/_prompty.py", line 382, in _send_with_retries
    response = await client.chat.completions.create(**params)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rivyesch/Dev/eval-azure-foundry/.venv/lib/python3.13/site-packages/azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py", line 50, in async_wrapper
    result: _WithUsage = await method(*args, **kwargs)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rivyesch/Dev/eval-azure-foundry/.venv/lib/python3.13/site-packages/openai/resources/chat/completions/completions.py",

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "tool_call_accuracy_20251028_170201_034418"
Run status: "Completed"
Start time: "2025-10-28 17:02:01.034418+00:00"
Duration: "0:01:08.072617"


{
    "tool_call_accuracy": {
        "status": "Completed",
        "duration": "0:01:08.072617",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    },
    "intent_resolution": {
        "status": "Completed",
        "duration": "0:01:05.088401",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    },
    "task_adherence": {
        "status": "Completed",
        "duration": "0:01:05.068120",
        "completed_lines": 18,
        "failed_lines": 0,
        "log_path": null
    }
}


('AI Foundary URL: '
 'https://ai.azure.com/resource/build/evaluation/5e6a34ea-78c8-48b0-9557-0c716ac916d2?wsid=/subscriptions/8bc573e3-0006-4c57-99eb-fe732fc4e022/resourceGroups/keyreplyopenai/providers/Microsoft.CognitiveServices/accounts/prabh-mfb885mz-swedencentral/projects/prabh

{'agent_basic': '/Users/rivyesch/Dev/eval-azure-foundry/agent_basic.jsonl'}
{'agent_basic': '/Users/rivyesch/Dev/eval-azure-foundry/agent_basic.jsonl'}
{'agent_basic': '/Users/rivyesch/Dev/eval-azure-foundry/agent_basic.jsonl'}
{'agent_basic': '/Users/rivyesch/Dev/eval-azure-foundry/agent_basic.jsonl'}
{'agent_basic': '/Users/rivyesch/Dev/eval-azure-foundry/agent_basic.jsonl'}
{'agent_basic': '/Users/rivyesch/Dev/eval-azure-foundry/agent_basic.jsonl'}
{'agent_basic': '/Users/rivyesch/Dev/eval-azure-foundry/agent_basic.jsonl'}
{'agent_basic': '/Users/rivyesch/Dev/eval-azure-foundry/agent_basic.jsonl'}


## Inspect results on Azure AI Foundry

Open the AI Foundry URL printed above to explore the results in Azure AI Foundry's rich data visualizations. There you can inspect the evaluation scores and the reasoning behind them, which helps you quickly identify issues with your agent and fix them.

In [29]:
# Alternatively, read the evaluation results straight from the in-memory response.

# Scores averaged across all runs.
aggregate_metrics = response["metrics"]
pprint(aggregate_metrics)