In [1]:
from IPython.display import clear_output

In [2]:
# One-time setup: uncomment the next line to install the evaluation SDK into
# the kernel's environment.
#%pip install azure-ai-evaluation
# Wipe this cell's output area (e.g. installer logs when the line above is enabled).
clear_output()

In [3]:
import os
from dotenv import load_dotenv

# Load settings from the .env file one directory above the working directory
# so the os.environ lookups below can read them.
load_dotenv('../.env')

# Connection settings per model, keyed by model name. Every entry has the same
# shape: "endpoint" comes from that model's own environment variable, "key"
# from AZURE_OPENAI_API_KEY for both models. Unset variables come back as None.
env_var = {
    model_name: {
        "endpoint": os.environ.get(endpoint_variable),
        "key": os.environ.get("AZURE_OPENAI_API_KEY"),
    }
    for model_name, endpoint_variable in (
        ("gpt-4o", "AOAI_GPT4O_ENDPOINT"),
        ("gpt-4o-mini", "AOAI_GPT4O_MINI_ENDPOINT"),
    )
}

# Identifies the Azure AI project by subscription, resource group and project
# name, each read from the environment (None when a variable is unset).
azure_ai_project = dict(
    subscription_id=os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name=os.environ.get("RG_NAME"),
    project_name=os.environ.get("PROJECT_NAME"),
)

In [4]:
import pandas as pd

# Load the evaluation dataset (JSON Lines: one JSON record per line).
df = pd.read_json("evaluation_dataset.jsonl", lines=True)

# Preview the first rows. Ending the cell with the bare expression lets the
# notebook render a single rich table; print() would fall back to wrapped
# plain text that splits the columns across several chunks.
df.head()

                                               query  \
0               What event started on July 28, 1914?   
1      Who was the first person to walk on the moon?   
2  What was the significance of the year 1776 in ...   
3  Which wall fell in 1989, symbolizing the end o...   
4  What ancient city was buried by the eruption o...   

                                             context  \
0  It involved multiple countries and lasted unti...   
1  The event occurred during the Apollo 11 missio...   
2  A key document was signed declaring independen...   
3       It divided a German city into East and West.   
4  The city's ruins were rediscovered in the 18th...   

                      ground_truth  
0                      World War I  
1                   Neil Armstrong  
2  The Declaration of Independence  
3                  The Berlin Wall  
4                          Pompeii  


In [5]:
from azure.ai.evaluation import AzureOpenAIModelConfiguration

# Azure OpenAI settings for the evaluator model. Each constructor argument is
# read from the environment variable paired with it below (None when unset).
model_config = AzureOpenAIModelConfiguration(
    **{
        parameter: os.environ.get(variable)
        for parameter, variable in (
            ("azure_endpoint", "AZURE_OPENAI_ENDPOINT"),
            ("azure_deployment", "AZURE_OPENAI_DEPLOYMENT"),
            ("api_key", "AZURE_OPENAI_API_KEY"),
            ("api_version", "AZURE_OPENAI_API_VERSION"),
        )
    }
)

In [6]:
from azure.ai.evaluation import evaluate
from azure.ai.evaluation import QAEvaluator
from model_endpoints import ModelEndpoints
import random
from IPython.display import clear_output

# Evaluator applied to every row; it is configured with the Azure OpenAI
# settings held in `model_config`.
qa_evaluator = QAEvaluator(model_config)

# Models to evaluate; each name must also be a key of `env_var`.
models = [
    "gpt-4o",
    "gpt-4o-mini",
]

path = "./evaluation_dataset.jsonl"
# Random 4-digit tag shared by all runs launched from this cell execution, so
# their evaluation names can be told apart from earlier executions.
randomNum = random.randint(1111, 9999)

# Results of every run, keyed by model name. Without this, each iteration would
# overwrite the previous model's result and only the last one would survive.
results_by_model = {}

for model in models:
    results = evaluate(
        evaluation_name="Eval-Run" + str(randomNum) + "-" + model.title(),
        data=path,
        # Target under test: produces the `response` column referenced below.
        target=ModelEndpoints(env_var, model),
        evaluators={
            "qa": qa_evaluator,
        },
        azure_ai_project=azure_ai_project,
        evaluator_config={
            "qa": {
                # Wire dataset columns and the target's output to the
                # evaluator's inputs.
                "column_mapping": {
                    "query": "${data.query}",
                    "context": "${data.context}",
                    "ground_truth": "${data.ground_truth}",
                    "response": "${target.response}",
                },
            },
        },
    )
    results_by_model[model] = results

# `results` is intentionally left bound to the last model's result so cells
# that read it keep working; use `results_by_model[<model>]` for the others.
# Clear whatever the runs wrote to this cell's output area.
clear_output()

In [7]:
# Per-row evaluation output as a table. Note: `results` is reassigned on every
# iteration of the evaluation loop, so this shows only the last model evaluated.
pd.DataFrame(results["rows"])

Unnamed: 0,outputs.query,outputs.response,inputs.query,inputs.context,inputs.ground_truth,outputs.qa.groundedness,outputs.qa.gpt_groundedness,outputs.qa.groundedness_reason,outputs.qa.relevance,outputs.qa.gpt_relevance,...,outputs.qa.coherence,outputs.qa.gpt_coherence,outputs.qa.coherence_reason,outputs.qa.fluency,outputs.qa.gpt_fluency,outputs.qa.fluency_reason,outputs.qa.similarity,outputs.qa.gpt_similarity,outputs.qa.f1_score,line_number
0,"What event started on July 28, 1914?","The event that started on July 28, 1914, was t...","What event started on July 28, 1914?",It involved multiple countries and lasted unti...,World War I,5,5,The response accurately identifies the event a...,5,5,...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE is well-articulated with good con...,5,5,0.125,0
1,Who was the first person to walk on the moon?,The first person to walk on the moon was Neil ...,Who was the first person to walk on the moon?,The event occurred during the Apollo 11 missio...,Neil Armstrong,5,5,The response accurately and completely answers...,5,5,...,4,4,The response is coherent and effectively addre...,4,4,"The RESPONSE is well-articulated, with good co...",4,4,0.086957,1
2,What was the significance of the year 1776 in ...,The year 1776 is highly significant in America...,What was the significance of the year 1776 in ...,A key document was signed declaring independen...,The Declaration of Independence,5,5,The RESPONSE is fully grounded in the CONTEXT ...,5,5,...,5,5,The RESPONSE is coherent and effectively addre...,4,4,"The RESPONSE is well-articulated, with good co...",3,3,0.040816,2
3,"Which wall fell in 1989, symbolizing the end o...","The Berlin Wall fell in 1989, symbolizing the ...","Which wall fell in 1989, symbolizing the end o...",It divided a German city into East and West.,The Berlin Wall,5,5,The response accurately and completely answers...,5,5,...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE is well-articulated with good gra...,5,5,0.114286,3
4,What ancient city was buried by the eruption o...,The ancient city that was buried by the erupti...,What ancient city was buried by the eruption o...,The city's ruins were rediscovered in the 18th...,Pompeii,3,3,The response correctly identifies Pompeii as t...,4,4,...,4,4,The response is coherent as it directly answer...,3,3,The RESPONSE is clear and grammatically correc...,5,5,0.125,4
5,Who was the British Prime Minister during Worl...,"During World War II, there were two British Pr...",Who was the British Prime Minister during Worl...,"He is famous for his leadership and speeches, ...",Winston Churchill,5,5,"The response is fully correct and complete, ac...",5,5,...,4,4,The response is coherent and effectively addre...,4,4,"The RESPONSE is well-articulated, with good co...",4,4,0.074074,5
6,What was the name of the ship that sank on its...,The ship that sank on its maiden voyage in 191...,What was the name of the ship that sank on its...,It was deemed 'unsinkable' before it hit an ic...,RMS Titanic,5,5,"The response is fully correct and complete, di...",4,4,...,4,4,The RESPONSE is coherent as it directly answer...,3,3,The response is clear and grammatically correc...,5,5,0.285714,6
7,Which empire was ruled by Genghis Khan?,"Genghis Khan ruled the Mongol Empire, which be...",Which empire was ruled by Genghis Khan?,This empire became the largest contiguous land...,The Mongol Empire,5,5,The response accurately and completely answers...,5,5,...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE is well-articulated with good con...,4,4,0.097561,7
8,What was the primary cause of the American Civ...,The primary cause of the American Civil War wa...,What was the primary cause of the American Civ...,The conflict between the Northern and Southern...,Slavery,5,5,The RESPONSE accurately and comprehensively ad...,5,5,...,4,4,The RESPONSE is coherent and effectively addre...,4,4,"The RESPONSE is well-structured, coherent, and...",3,3,0.015748,8
9,Which ancient wonder was located in Egypt and ...,The ancient wonder located in Egypt that serve...,Which ancient wonder was located in Egypt and ...,It is the only one of the Seven Wonders of the...,The Great Pyramid of Giza,5,5,"The response is fully correct and complete, pr...",5,5,...,4,4,The RESPONSE is coherent and effectively addre...,4,4,"The RESPONSE is well-articulated, with good co...",5,5,0.148148,9
