In [None]:
import os
from dotenv import load_dotenv

load_dotenv('../.env')

env_var = {
    "gpt-4o": {
        "endpoint": os.environ.get("AOAI_GPT4O_ENDPOINT"),
        "key": os.environ.get("AZURE_OPENAI_API_KEY"),
    },
    "gpt-4o-mini": {
        "endpoint": os.environ.get("AOAI_GPT4O_MINI_ENDPOINT"),
        "key": os.environ.get("AZURE_OPENAI_API_KEY"),
    },
}

ai_project_endpoint=os.environ["AI_PROJECT_ENDPOINT"]

In [2]:
import pandas as pd

df = pd.read_json("evaluation_dataset.jsonl", lines=True)
print(df.head())

                                               query  \
0               What event started on July 28, 1914?   
1      Who was the first person to walk on the moon?   
2  What was the significance of the year 1776 in ...   
3  Which wall fell in 1989, symbolizing the end o...   
4  What ancient city was buried by the eruption o...   

                                             context  \
0  It involved multiple countries and lasted unti...   
1  The event occurred during the Apollo 11 missio...   
2  A key document was signed declaring independen...   
3       It divided a German city into East and West.   
4  The city's ruins were rediscovered in the 18th...   

                      ground_truth  
0                      World War I  
1                   Neil Armstrong  
2  The Declaration of Independence  
3                  The Berlin Wall  
4                          Pompeii  


In [3]:
from azure.ai.evaluation import AzureOpenAIModelConfiguration

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION"),
)

In [4]:
from azure.ai.evaluation import evaluate
from azure.ai.evaluation import QAEvaluator
from model_endpoints import ModelEndpoints
import random

qa_evaluator = QAEvaluator(model_config)

models = [
    "gpt-4o",
    "gpt-4o-mini",
]

path = "./evaluation_dataset.jsonl"
randomNum = random.randint(1111, 9999)

for model in models:
    results = evaluate(
        evaluation_name="Eval-Run" + str(randomNum) + "-" + model.title(),
        data=path,
        target=ModelEndpoints(env_var, model),
        evaluators={
            "qa": qa_evaluator,
        },
        azure_ai_project=ai_project_endpoint,
        evaluator_config={
            "qa": {
                "column_mapping": {
                    "query": "${data.query}",
                    "context": "${data.context}",
                    "ground_truth": "${data.ground_truth}",
                    "response": "${target.response}",
                },
            },
        },
    )

# Use the URL to inspect the results on the UI.
print(f'AI Foundry URL: {results.get("studio_url")}')

2025-08-01 20:26:30 +0200   44404 execution.bulk     INFO     Finished 1 / 10 lines.
2025-08-01 20:26:30 +0200   44404 execution.bulk     INFO     Average execution time for completed lines: 1.69 seconds. Estimated time for incomplete lines: 15.21 seconds.
2025-08-01 20:26:31 +0200   44404 execution.bulk     INFO     Finished 2 / 10 lines.
2025-08-01 20:26:31 +0200   44404 execution.bulk     INFO     Average execution time for completed lines: 0.96 seconds. Estimated time for incomplete lines: 7.68 seconds.
2025-08-01 20:26:31 +0200   44404 execution.bulk     INFO     Finished 5 / 10 lines.
2025-08-01 20:26:31 +0200   44404 execution.bulk     INFO     Average execution time for completed lines: 0.44 seconds. Estimated time for incomplete lines: 2.2 seconds.
2025-08-01 20:26:31 +0200   44404 execution.bulk     INFO     Finished 6 / 10 lines.
2025-08-01 20:26:31 +0200   44404 execution.bulk     INFO     Average execution time for completed lines: 0.4 seconds. Estimated time for incomplet

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "qa_20250801_182634_346587"
Run status: "Completed"
Start time: "2025-08-01 18:26:34.346587+00:00"
Duration: "0:00:55.253418"


{
    "qa": {
        "status": "Completed",
        "duration": "0:00:55.253418",
        "completed_lines": 10,
        "failed_lines": 0,
        "log_path": null
    }
}


2025-08-01 20:27:37 +0200   26416 execution.bulk     INFO     Finished 1 / 10 lines.
2025-08-01 20:27:37 +0200   26416 execution.bulk     INFO     Average execution time for completed lines: 1.16 seconds. Estimated time for incomplete lines: 10.44 seconds.
2025-08-01 20:27:37 +0200   26416 execution.bulk     INFO     Finished 2 / 10 lines.
2025-08-01 20:27:37 +0200   26416 execution.bulk     INFO     Average execution time for completed lines: 0.69 seconds. Estimated time for incomplete lines: 5.52 seconds.
2025-08-01 20:27:37 +0200   26416 execution.bulk     INFO     Finished 3 / 10 lines.
2025-08-01 20:27:37 +0200   26416 execution.bulk     INFO     Average execution time fo

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "qa_20250801_182741_333195"
Run status: "Completed"
Start time: "2025-08-01 18:27:41.333195+00:00"
Duration: "0:00:42.849446"


{
    "qa": {
        "status": "Completed",
        "duration": "0:00:42.849446",
        "completed_lines": 10,
        "failed_lines": 0,
        "log_path": null
    }
}


AI Foundry URL: https://ai.azure.com/resource/build/evaluation/2e0c1bbb-3b92-482e-96b2-8fcca84627a7?wsid=/subscriptions/8babb7f9-50f7-498f-9e0a-8bef4389331d/resourceGroups/rg-ruplisso-3364/providers/Microsoft.CognitiveServices/accounts/projetagent-resource/projects/projetagent&tid=16b3c013-d300-468d-ac64-7eda0820b6d3


In [6]:
pd.DataFrame(results["rows"])

Unnamed: 0,inputs.query,inputs.context,inputs.ground_truth,inputs.line_number,outputs.query,outputs.response,outputs.qa.f1_score,outputs.qa.f1_result,outputs.qa.f1_threshold,outputs.qa.similarity,...,outputs.qa.gpt_fluency,outputs.qa.fluency_reason,outputs.qa.fluency_result,outputs.qa.fluency_threshold,outputs.qa.groundedness,outputs.qa.gpt_groundedness,outputs.qa.groundedness_reason,outputs.qa.groundedness_result,outputs.qa.groundedness_threshold,line_number
0,"What event started on July 28, 1914?",It involved multiple countries and lasted unti...,World War I,0,"What event started on July 28, 1914?","The event that started on July 28, 1914, was t...",0.125,pass,3,5.0,...,4.0,"The RESPONSE is well-articulated, coherent, an...",pass,3,4.0,4.0,The RESPONSE is accurate and directly answers ...,pass,3,0
1,Who was the first person to walk on the moon?,The event occurred during the Apollo 11 missio...,Neil Armstrong,1,Who was the first person to walk on the moon?,The first person to walk on the moon was Neil ...,0.086957,pass,3,5.0,...,4.0,"The RESPONSE is well-articulated, with good gr...",pass,3,5.0,5.0,"The RESPONSE is fully correct and complete, di...",pass,3,1
2,What was the significance of the year 1776 in ...,A key document was signed declaring independen...,The Declaration of Independence,2,What was the significance of the year 1776 in ...,The year 1776 is highly significant in America...,0.043478,pass,3,3.0,...,5.0,The RESPONSE reflects exceptional fluency due ...,pass,3,5.0,5.0,"The RESPONSE is fully correct and complete, th...",pass,3,2
3,"Which wall fell in 1989, symbolizing the end o...",It divided a German city into East and West.,The Berlin Wall,3,"Which wall fell in 1989, symbolizing the end o...","The Berlin Wall fell in 1989, symbolizing the ...",0.111111,pass,3,5.0,...,4.0,"The RESPONSE is well-articulated, with good gr...",pass,3,5.0,5.0,The RESPONSE fully answers the QUERY with prec...,pass,3,3
4,What ancient city was buried by the eruption o...,The city's ruins were rediscovered in the 18th...,Pompeii,4,What ancient city was buried by the eruption o...,The ancient city that was buried by the erupti...,0.125,pass,3,5.0,...,3.0,The response is clear and grammatically correc...,pass,3,3.0,3.0,The RESPONSE provides accurate information but...,pass,3,4
5,Who was the British Prime Minister during Worl...,"He is famous for his leadership and speeches, ...",Winston Churchill,5,Who was the British Prime Minister during Worl...,"During World War II, there were two British Pr...",0.061538,pass,3,5.0,...,4.0,"The RESPONSE is well-written, with varied voca...",pass,3,5.0,5.0,The RESPONSE thoroughly answers the QUERY with...,pass,3,5
6,What was the name of the ship that sank on its...,It was deemed 'unsinkable' before it hit an ic...,RMS Titanic,6,What was the name of the ship that sank on its...,The ship that sank on its maiden voyage in 191...,0.285714,pass,3,5.0,...,3.0,The response is clear and grammatically correc...,pass,3,5.0,5.0,"The RESPONSE is fully correct and complete, di...",pass,3,6
7,Which empire was ruled by Genghis Khan?,This empire became the largest contiguous land...,The Mongol Empire,7,Which empire was ruled by Genghis Khan?,"Genghis Khan ruled the Mongol Empire, which be...",0.097561,pass,3,4.0,...,4.0,"The RESPONSE is well-articulated, coherent, an...",pass,3,5.0,5.0,"The RESPONSE is fully correct and complete, di...",pass,3,7
8,What was the primary cause of the American Civ...,The conflict between the Northern and Southern...,Slavery,8,What was the primary cause of the American Civ...,The primary cause of the American Civil War wa...,0.01626,pass,3,5.0,...,5.0,The RESPONSE deserves a high score because it ...,pass,3,5.0,5.0,"The RESPONSE is fully correct and complete, di...",pass,3,8
9,Which ancient wonder was located in Egypt and ...,It is the only one of the Seven Wonders of the...,The Great Pyramid of Giza,9,Which ancient wonder was located in Egypt and ...,The ancient wonder located in Egypt that serve...,0.148148,pass,3,5.0,...,4.0,"The RESPONSE is well-articulated, coherent, an...",pass,3,5.0,5.0,"The RESPONSE is fully correct and complete, di...",pass,3,9
