In [1]:
from IPython.display import clear_output

In [4]:
# Uncomment the next line to install the evaluation SDK into this kernel's environment.
#%pip install azure-ai-evaluation
# Clear this cell's output (e.g. the installer log when the line above is enabled).
clear_output()

In [3]:
import os
from dotenv import load_dotenv

# Load variables from the .env file one directory above this notebook so that
# the os.environ lookups below can see them.
load_dotenv('../.env')

# Connection settings per target model, keyed by model name. Each entry holds
# the endpoint read from that model's own environment variable and the API key
# read from AZURE_OPENAI_API_KEY (the same key variable for every model).
# A variable that is not set yields None.
env_var = {
    model_name: {
        "endpoint": os.getenv(endpoint_variable),
        "key": os.getenv("AZURE_OPENAI_API_KEY"),
    }
    for model_name, endpoint_variable in (
        ("gpt-4o", "AOAI_GPT4O_ENDPOINT"),
        ("gpt-4o-mini", "AOAI_GPT4O_MINI_ENDPOINT"),
    )
}

# Identifies the Azure AI project handed to evaluate() via its
# `azure_ai_project` argument. All three values come from the environment;
# a variable that is not set yields None.
azure_ai_project = dict(
    subscription_id=os.getenv("SUBSCRIPTION_ID"),
    resource_group_name=os.getenv("RG_NAME"),
    project_name=os.getenv("PROJECT_NAME"),
)

In [5]:
import pandas as pd

# Read the evaluation dataset (JSON Lines: one JSON record per line) and
# print its first five rows as a quick sanity check.
df = pd.read_json(path_or_buf="evaluation_dataset.jsonl", lines=True)
print(df.head(n=5))

                                               query  \
0               What event started on July 28, 1914?   
1      Who was the first person to walk on the moon?   
2  What was the significance of the year 1776 in ...   
3  Which wall fell in 1989, symbolizing the end o...   
4  What ancient city was buried by the eruption o...   

                                             context  \
0  It involved multiple countries and lasted unti...   
1  The event occurred during the Apollo 11 missio...   
2  A key document was signed declaring independen...   
3       It divided a German city into East and West.   
4  The city's ruins were rediscovered in the 18th...   

                      ground_truth  
0                      World War I  
1                   Neil Armstrong  
2  The Declaration of Independence  
3                  The Berlin Wall  
4                          Pompeii  


In [6]:
from azure.ai.evaluation import AzureOpenAIModelConfiguration

# Azure OpenAI settings for the model behind QAEvaluator (see the evaluation
# cell). Every field is read from the environment; a variable that is not set
# yields None.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

In [7]:
from azure.ai.evaluation import evaluate
from azure.ai.evaluation import QAEvaluator
from model_endpoints import ModelEndpoints
import random
from IPython.display import clear_output

# Evaluator applied to each model's responses; built from the Azure OpenAI
# configuration defined in `model_config`.
qa_evaluator = QAEvaluator(model_config)

# Target models to evaluate; each name must be a key of `env_var`.
models = [
    "gpt-4o",
    "gpt-4o-mini",
]

path = "./evaluation_dataset.jsonl"
# Random four-digit number shared by all runs started by this cell execution,
# so the runs of one execution carry the same number in their names.
randomNum = random.randint(1111, 9999)

# Every model's evaluation result, keyed by model name. Without this, each
# iteration would overwrite the previous result and only the last model's
# evaluation would remain inspectable after the loop.
results_by_model = {}

for model in models:
    results = evaluate(
        evaluation_name=f"Eval-Run{randomNum}-{model.title()}",
        data=path,
        target=ModelEndpoints(env_var, model),
        evaluators={
            "qa": qa_evaluator,
        },
        azure_ai_project=azure_ai_project,
        evaluator_config={
            "qa": {
                # Evaluator inputs: `data.*` refers to dataset columns,
                # `target.response` to the output produced by the target.
                "column_mapping": {
                    "query": "${data.query}",
                    "context": "${data.context}",
                    "ground_truth": "${data.ground_truth}",
                    "response": "${target.response}",
                },
            },
        },
    )
    # `results` stays bound to the most recent run (as before), while the
    # dictionary keeps the earlier runs available as well.
    results_by_model[model] = results
# Drop whatever the evaluation runs wrote to this cell's output area.
clear_output()

In [8]:
# Tabulate the per-row results. `results` is whatever the evaluation loop
# assigned last, i.e. the run for the final entry in `models`.
pd.DataFrame(results["rows"])

Unnamed: 0,outputs.query,outputs.response,inputs.query,inputs.context,inputs.ground_truth,outputs.qa.f1_score,outputs.qa.similarity,outputs.qa.gpt_similarity,outputs.qa.relevance,outputs.qa.gpt_relevance,...,outputs.qa.fluency,outputs.qa.gpt_fluency,outputs.qa.fluency_reason,outputs.qa.groundedness,outputs.qa.gpt_groundedness,outputs.qa.groundedness_reason,outputs.qa.coherence,outputs.qa.gpt_coherence,outputs.qa.coherence_reason,line_number
0,"What event started on July 28, 1914?","The event that started on July 28, 1914, was t...","What event started on July 28, 1914?",It involved multiple countries and lasted unti...,World War I,0.125,5,5,5,5,...,4,4,"The RESPONSE is well-articulated, coherent, an...",5,5,The RESPONSE directly answers the QUERY with p...,4,4,"The RESPONSE is coherent, directly answers the...",0
1,Who was the first person to walk on the moon?,The first person to walk on the moon was Neil ...,Who was the first person to walk on the moon?,The event occurred during the Apollo 11 missio...,Neil Armstrong,0.086957,5,5,5,5,...,4,4,The RESPONSE deserves a high score because it ...,5,5,The RESPONSE fully and accurately answers the ...,4,4,"The RESPONSE is coherent, well-structured, and...",1
2,What was the significance of the year 1776 in ...,The year 1776 is highly significant in America...,What was the significance of the year 1776 in ...,A key document was signed declaring independen...,The Declaration of Independence,0.043165,3,3,5,5,...,5,5,"The RESPONSE is sophisticated, cohesive, and d...",5,5,The RESPONSE thoroughly and accurately address...,5,5,The RESPONSE is coherent and effectively addre...,2
3,"Which wall fell in 1989, symbolizing the end o...","The Berlin Wall fell in 1989, symbolizing the ...","Which wall fell in 1989, symbolizing the end o...",It divided a German city into East and West.,The Berlin Wall,0.111111,5,5,5,5,...,4,4,"The response is well-articulated, coherent, an...",5,5,"The RESPONSE is accurate, complete, and direct...",5,5,"The RESPONSE is coherent, well-organized, and ...",3
4,What ancient city was buried by the eruption o...,The ancient city that was buried by the erupti...,What ancient city was buried by the eruption o...,The city's ruins were rediscovered in the 18th...,Pompeii,0.125,5,5,4,4,...,3,3,"The response is clear and accurate, with no gr...",3,3,The RESPONSE is accurate and relevant to the Q...,4,4,The RESPONSE is coherent and effectively answe...,4
5,Who was the British Prime Minister during Worl...,"During World War II, there were two British Pr...",Who was the British Prime Minister during Worl...,"He is famous for his leadership and speeches, ...",Winston Churchill,0.074074,5,5,4,4,...,4,4,"The RESPONSE is well-articulated, coherent, an...",5,5,The RESPONSE thoroughly and accurately answers...,4,4,"The RESPONSE is coherent, effectively addresse...",5
6,What was the name of the ship that sank on its...,The ship that sank on its maiden voyage in 191...,What was the name of the ship that sank on its...,It was deemed 'unsinkable' before it hit an ic...,RMS Titanic,0.285714,5,5,4,4,...,3,3,The response is clear and grammatically correc...,5,5,The RESPONSE fully and accurately answers the ...,4,4,The RESPONSE is coherent and effectively addre...,6
7,Which empire was ruled by Genghis Khan?,"Genghis Khan ruled the Mongol Empire, which be...",Which empire was ruled by Genghis Khan?,This empire became the largest contiguous land...,The Mongol Empire,0.097561,4,4,5,5,...,4,4,"The RESPONSE is well-articulated, coherent, an...",5,5,"The RESPONSE is fully correct and complete, di...",5,5,"The RESPONSE is coherent, logically structured...",7
8,What was the primary cause of the American Civ...,The primary cause of the American Civil War wa...,What was the primary cause of the American Civ...,The conflict between the Northern and Southern...,Slavery,0.014599,5,5,5,5,...,4,4,"The RESPONSE is well-articulated, with good gr...",5,5,"The RESPONSE is fully correct and complete, ad...",5,5,"The RESPONSE is coherent, well-organized, and ...",8
9,Which ancient wonder was located in Egypt and ...,The ancient wonder located in Egypt that serve...,Which ancient wonder was located in Egypt and ...,It is the only one of the Seven Wonders of the...,The Great Pyramid of Giza,0.148148,5,5,5,5,...,4,4,"The RESPONSE is well-articulated, coherent, an...",5,5,The RESPONSE fully answers the QUERY with prec...,4,4,"The RESPONSE is coherent, directly addresses t...",9
