In [1]:
from IPython.display import clear_output

In [2]:
# Uncomment the next line on first run to install the SDK into the kernel's environment.
#%pip install azure-ai-evaluation
# Clear this cell's output area (e.g. installer logs when the line above is enabled).
clear_output()

In [3]:
import os
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from dotenv import load_dotenv

# Pull the Azure OpenAI settings from the repository-level .env file into os.environ.
load_dotenv('../../.env')

# Shared model configuration handed to every AI-assisted evaluator below.
# Each field is read with os.environ[...], so a missing variable raises KeyError
# here rather than surfacing later inside an evaluator call.
model_config = AzureOpenAIModelConfiguration(
    **{
        field: os.environ[variable]
        for field, variable in (
            ("azure_endpoint", "AZURE_OPENAI_ENDPOINT"),
            ("api_key", "AZURE_OPENAI_API_KEY"),
            ("api_version", "AZURE_OPENAI_API_VERSION"),
            ("azure_deployment", "AZURE_OPENAI_DEPLOYMENT"),
        )
    }
)

# Evaluating Response Quality with RelevanceEvaluator

In [4]:
from azure.ai.evaluation import RelevanceEvaluator

# Relevance grader backed by the shared Azure OpenAI model configuration.
relevance_evaluator = RelevanceEvaluator(model_config)

# Evaluate a single query-response pair; the inputs are gathered first so they
# are easy to tweak when experimenting.
relevance_inputs = dict(
    query="What is the capital of France?",
    response="The capital of France is Paris.",
)
result = relevance_evaluator(**relevance_inputs)

print(result)

{'relevance': 4.0, 'gpt_relevance': 4.0, 'relevance_reason': 'The response fully and accurately answers the question with all essential details, but does not provide any extra insights.', 'relevance_result': 'pass', 'relevance_threshold': 3}


# Evaluating Response Quality with CoherenceEvaluator

In [4]:
from azure.ai.evaluation import CoherenceEvaluator

# Coherence grader backed by the shared Azure OpenAI model configuration.
coherence_evaluator = CoherenceEvaluator(model_config)

# Score a deliberately terse answer to see how a one-word response is rated.
coherence_inputs = dict(
    query="What's the capital of France?",
    response="Paris.",
)
result = coherence_evaluator(**coherence_inputs)
print(result)

{'coherence': 4.0, 'gpt_coherence': 4.0, 'coherence_reason': 'The RESPONSE is coherent and effectively answers the QUERY with logical organization and clarity, though it is minimalistic.'}


# Evaluating Response Quality with FluencyEvaluator

In [5]:
from azure.ai.evaluation import FluencyEvaluator

# Fluency grader backed by the shared Azure OpenAI model configuration.
fluency_evaluator = FluencyEvaluator(model_config)

# Only the response text is supplied here; no query is passed.
result = fluency_evaluator(response="Paris.")
print(result)

{'fluency': 1.0, 'gpt_fluency': 1.0, 'fluency_reason': 'The RESPONSE is a single word without any grammatical structure or context, making it incomprehensible and indicative of minimal command of the language.'}


# Evaluating Response Quality with GroundednessEvaluator

In [6]:
from azure.ai.evaluation import GroundednessEvaluator

# Groundedness grader backed by the shared Azure OpenAI model configuration.
groundedness_evaluator = GroundednessEvaluator(model_config)

# The response is scored together with the query and the supporting context passage.
groundedness_inputs = dict(
    query="Who discovered penicillin?",
    context="Alexander Fleming discovered penicillin in 1928 while working at St. Mary's Hospital in London.",
    response="Alexander Fleming discovered penicillin in 1928.",
)
result = groundedness_evaluator(**groundedness_inputs)
print(result)

{'groundedness': 4.0, 'gpt_groundedness': 4.0, 'groundedness_reason': 'The response is accurate and directly answers the query but does not include all the details from the context, making it partially correct.'}


# Creating custom evaluators

## Code-based evaluator

### Function-based evaluator

In [7]:
# Custom function-based evaluator: reports the length of a response
def response_length_evaluator(response, **kwargs):
    """Measure the size of a response.

    Args:
        response: The value to measure; anything that supports ``len()``.
        **kwargs: Extra keyword arguments are accepted and ignored.

    Returns:
        dict: A single-entry mapping ``{"response_length": len(response)}``.
    """
    length = len(response)
    return dict(response_length=length)


# Example usage
result = response_length_evaluator(response="Hello, world!")
print(result)

{'response_length': 13}


### Class-based evaluator

In [8]:
import re


# Custom class-based evaluator to check for blocked words
class BlocklistEvaluator:
    """Flag responses that contain any entry of a blocklist as a whole word.

    Matching is case-insensitive and anchored on word boundaries, so "bad"
    matches "Bad idea" but not "badge". A plain substring test would report
    false positives for words that merely contain a blocked entry.
    """

    def __init__(self, blocklist):
        """Remember the blocklist.

        Args:
            blocklist: Iterable of words or phrases that must not appear in a
                response.
        """
        self.blocklist = blocklist

    def __call__(self, *, response: str, **kwargs):
        """Check a response against the blocklist.

        Args:
            response: The text to inspect (keyword-only).
            **kwargs: Extra keyword arguments are accepted and ignored.

        Returns:
            dict: ``{"contains_blocked_word": bool}``.
        """
        contains_blocked_word = any(
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character (e.g. "c++") are still matched correctly.
            re.search(rf"(?<!\w){re.escape(word)}(?!\w)", response, flags=re.IGNORECASE)
            for word in self.blocklist
            if word  # an empty entry would otherwise flag every response
        )
        return {"contains_blocked_word": contains_blocked_word}


# Example usage
blocklist_evaluator = BlocklistEvaluator(blocklist=["bad", "evil", "worst"])
result = blocklist_evaluator(response="This is the worst response ever!")
print(result)

{'contains_blocked_word': True}


## Prompt-based evaluators

### Helpfulness evaluator

In [9]:
from helpfulness import HelpfulnessEvaluator

# Prompt-based custom grader imported from the local helpfulness module.
helpfulness_evaluator = HelpfulnessEvaluator(model_config)

# A deliberately unhelpful answer, scored together with its query and context.
helpfulness_inputs = dict(
    query="What's the meaning of life?",
    context="Arthur Schopenhauer was the first to explicitly ask the question, in an essay entitled 'Character'.",
    response="The answer is 42.",
)
helpfulness_score = helpfulness_evaluator(**helpfulness_inputs)
print(helpfulness_score)

{'helpfulness': 1.0, 'helpfulness_reason': 'The RESPONSE is entirely unhelpful as it does not address the philosophical question or provide any meaningful or relevant information based on the CONTEXT.'}


### JSON accuracy evaluator

In [10]:
import json
from json_schema import JSONSchemaEvaluator

# Load the JSON schema that is passed to the evaluator as `schema`.
# Context managers close the files deterministically, and the explicit encoding
# avoids depending on the platform default when decoding the JSON text.
with open('jsons/example_schema.json', 'r', encoding='utf-8') as schema_file:
    example_json_schema = json.load(schema_file)

# Load the example JSON object that is passed to the evaluator as `json_output`.
with open('jsons/poor_output.json', 'r', encoding='utf-8') as output_file:
    sample_json_data = json.load(output_file)

# Prompt-based custom evaluator imported from the local json_schema module.
json_schema_evaluator = JSONSchemaEvaluator(model_config)
json_schema_score = json_schema_evaluator(json_output=sample_json_data, schema=example_json_schema)
print(json_schema_score)

{'json_schema': 0.5, 'json_schema_reason': 'The JSON output is partially correct but contains a critical error: the required "companyName" field is missing in the "companyInfo" object.'}


# Evaluating a dataset

In [12]:
import pandas as pd
from azure.ai.evaluation import evaluate, RetrievalEvaluator, RelevanceEvaluator
from pprint import pprint
from model_endpoint import ModelEndpoint
from IPython.display import clear_output

# Project coordinates handed to evaluate() as azure_ai_project. Values are read
# with os.environ.get, so an unset variable yields None instead of raising.
azure_ai_project = dict(
    subscription_id=os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name=os.environ.get("RG_NAME"),
    project_name=os.environ.get("PROJECT_NAME"),
)

# Define your evaluators
relevance_evaluator = RelevanceEvaluator(model_config)
retrieval_evaluator = RetrievalEvaluator(model_config)

# Column references used in the mappings: the response comes from the target's
# output, while query and context come from the input dataset.
target_response = "${target.response}"
data_context = "${data.context}"
data_query = "${data.query}"

# Mapping shared by the evaluators that receive response, context and query.
full_mapping = {"response": target_response, "context": data_context, "query": data_query}

# coherence_evaluator, fluency_evaluator, groundedness_evaluator and
# helpfulness_evaluator are the instances created in the earlier cells.
evaluators = {
    # Performance and quality evaluators (AI-assisted)
    "relevance": relevance_evaluator,
    "coherence": coherence_evaluator,
    "fluency": fluency_evaluator,
    "groundedness": groundedness_evaluator,
    "retrieval": retrieval_evaluator,
    # Custom evaluators (code and prompt based)
    "helpfulness": helpfulness_evaluator,
}

# One column mapping per evaluator; each gets its own dict object.
evaluator_config = {
    "relevance": {"column_mapping": dict(full_mapping)},
    "coherence": {"column_mapping": {"response": target_response, "query": data_query}},
    "fluency": {"column_mapping": {"response": target_response}},
    "groundedness": {"column_mapping": dict(full_mapping)},
    "retrieval": {"column_mapping": {"context": data_context, "query": data_query}},
    "helpfulness": {"column_mapping": dict(full_mapping)},
}

# Evaluate the dataset
result = evaluate(
    data="evaluation_dataset.jsonl",
    target=ModelEndpoint(model_config),
    evaluators=evaluators,
    evaluator_config=evaluator_config,
    azure_ai_project=azure_ai_project,
    output_path="./evaluation_results.json",
)
# Drop the run's console output from the cell; the results stay in `result`.
clear_output()

In [13]:
# Tabulate the entries stored under result["rows"] so they render as a rich table.
evaluation_rows = result["rows"]
pd.DataFrame(evaluation_rows)

Unnamed: 0,outputs.query,outputs.response,inputs.query,inputs.context,inputs.ground_truth,outputs.relevance.relevance,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,outputs.coherence.coherence,outputs.coherence.gpt_coherence,...,outputs.fluency.fluency_reason,outputs.groundedness.groundedness,outputs.groundedness.gpt_groundedness,outputs.groundedness.groundedness_reason,outputs.retrieval.retrieval,outputs.retrieval.gpt_retrieval,outputs.retrieval.retrieval_reason,outputs.helpfulness.helpfulness,outputs.helpfulness.helpfulness_reason,line_number
0,"What event started on July 28, 1914?","On **July 28, 1914**, **World War I** official...","What event started on July 28, 1914?",It involved multiple countries and lasted unti...,World War I,5,5,The RESPONSE fully addresses the QUERY with ac...,5,5,...,"The RESPONSE is well-articulated, coherent, an...",5,5,"The RESPONSE is fully correct and complete, di...",3,3,The context is relevant to the query but is mi...,5,The RESPONSE fully answers the QUERY with accu...,0
1,Who was the first person to walk on the moon?,The first person to walk on the Moon was **Nei...,Who was the first person to walk on the moon?,The event occurred during the Apollo 11 missio...,Neil Armstrong,5,5,The RESPONSE fully addresses the QUERY with ac...,4,4,...,"The RESPONSE is well-articulated, coherent, an...",5,5,"The RESPONSE is fully correct and complete, ad...",2,2,The context is partially relevant as it mentio...,5,The RESPONSE is entirely helpful as it answers...,1
2,What was the significance of the year 1776 in ...,The year 1776 is one of the most significant i...,What was the significance of the year 1776 in ...,A key document was signed declaring independen...,The Declaration of Independence,5,5,The RESPONSE fully addresses the QUERY with ac...,5,5,...,"The RESPONSE is highly coherent, grammatically...",5,5,The RESPONSE thoroughly and accurately answers...,5,5,"The context is highly relevant, well-ranked, a...",5,The RESPONSE is entirely helpful as it fully a...,2
3,"Which wall fell in 1989, symbolizing the end o...","The **Berlin Wall** fell in 1989, symbolizing ...","Which wall fell in 1989, symbolizing the end o...",It divided a German city into East and West.,The Berlin Wall,5,5,The RESPONSE fully answers the QUERY with accu...,5,5,...,"The RESPONSE is well-articulated, coherent, an...",5,5,The RESPONSE fully answers the QUERY with prec...,2,2,The context is partially relevant to the query...,5,The RESPONSE fully answers the QUERY with accu...,3
4,What ancient city was buried by the eruption o...,The ancient city of **Pompeii** was buried by ...,What ancient city was buried by the eruption o...,The city's ruins were rediscovered in the 18th...,Pompeii,5,5,The RESPONSE fully answers the QUERY with accu...,4,4,...,"The RESPONSE is well-articulated, coherent, an...",2,2,The RESPONSE is accurate and relevant to the Q...,1,1,The context does not provide relevant informat...,5,The RESPONSE fully answers the QUERY and provi...,4
5,Who was the British Prime Minister during Worl...,The British Prime Minister during most of Worl...,Who was the British Prime Minister during Worl...,"He is famous for his leadership and speeches, ...",Winston Churchill,5,5,The RESPONSE fully answers the QUERY with accu...,5,5,...,"The RESPONSE is well-articulated, coherent, an...",4,4,The RESPONSE is mostly correct and relevant bu...,2,2,The context is partially relevant as it hints ...,5,The RESPONSE fully answers the QUERY with accu...,5
6,What was the name of the ship that sank on its...,The ship that sank on its maiden voyage in 191...,What was the name of the ship that sank on its...,It was deemed 'unsinkable' before it hit an ic...,RMS Titanic,5,5,The RESPONSE fully answers the QUERY with accu...,4,4,...,"The RESPONSE is well-articulated, coherent, an...",5,5,"The RESPONSE is fully correct and complete, di...",2,2,The context is partially relevant as it hints ...,5,The RESPONSE is entirely helpful as it accurat...,6
7,Which empire was ruled by Genghis Khan?,"Genghis Khan ruled the **Mongol Empire**, whic...",Which empire was ruled by Genghis Khan?,This empire became the largest contiguous land...,The Mongol Empire,5,5,The RESPONSE fully answers the QUERY with accu...,5,5,...,"The RESPONSE is well-articulated, coherent, an...",5,5,The RESPONSE fully answers the QUERY with prec...,2,2,The context is partially relevant to the query...,5,The RESPONSE fully answers the QUERY and provi...,7
8,What was the primary cause of the American Civ...,The primary cause of the American Civil War wa...,What was the primary cause of the American Civ...,The conflict between the Northern and Southern...,Slavery,4,4,The RESPONSE fully addresses the QUERY with ac...,4,4,...,"The RESPONSE is highly proficient, with clear,...",5,5,The RESPONSE fully addresses the QUERY with pr...,5,5,"The context is highly relevant to the query, d...",5,"The RESPONSE fully addresses the QUERY, accura...",8
9,Which ancient wonder was located in Egypt and ...,The **Great Pyramid of Giza** was the ancient ...,Which ancient wonder was located in Egypt and ...,It is the only one of the Seven Wonders of the...,The Great Pyramid of Giza,5,5,The RESPONSE fully addresses the QUERY with ac...,4,4,...,"The RESPONSE is well-written, with clear and c...",5,5,The RESPONSE fully answers the QUERY with prec...,2,2,The context is partially relevant to the query...,5,The RESPONSE fully answers the QUERY and provi...,9


In [14]:
!pip install --upgrade azure-ai-evaluation




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
from azure.ai.evaluation import DocumentRetrievalEvaluator

# these query_relevance_label are given by your human- or LLM-judges.
retrieval_ground_truth = [
    {"document_id": doc_id, "query_relevance_label": label}
    for doc_id, label in (("1", 4), ("2", 2), ("3", 3), ("4", 1), ("5", 0))
]
# the min and max of the label scores are inputs to document retrieval evaluator
ground_truth_label_min = 0
ground_truth_label_max = 4

# these relevance scores come from your search retrieval system
retrieved_documents = [
    {"document_id": doc_id, "relevance_score": score}
    for doc_id, score in (("2", 45.1), ("6", 35.8), ("3", 29.2), ("5", 25.4), ("7", 18.8))
]

# Pass/fail thresholds for the individual metrics reported by the evaluator.
document_retrieval_evaluator = DocumentRetrievalEvaluator(
    ground_truth_label_min=ground_truth_label_min,
    ground_truth_label_max=ground_truth_label_max,
    ndcg_threshold=0.5,
    xdcg_threshold=50.0,
    fidelity_threshold=0.5,
    top1_relevance_threshold=50.0,
    top3_max_relevance_threshold=50.0,
    total_retrieved_documents_threshold=50,
    total_ground_truth_documents_threshold=50,
)
# Last expression of the cell, so the metrics dict is rendered as the cell output.
document_retrieval_evaluator(
    retrieval_ground_truth=retrieval_ground_truth,
    retrieved_documents=retrieved_documents,
)

{'ndcg@3': 0.31075932533963707,
 'xdcg@3': 39.285714285714285,
 'fidelity': 0.39285714285714285,
 'top1_relevance': 2,
 'top3_max_relevance': 3,
 'holes': 2,
 'holes_ratio': 0.4,
 'total_retrieved_documents': 5,
 'total_ground_truth_documents': 5,
 'ndcg@3_result': 'fail',
 'ndcg@3_threshold': 0.5,
 'ndcg@3_higher_is_better': True,
 'xdcg@3_result': 'fail',
 'xdcg@3_threshold': 50.0,
 'xdcg@3_higher_is_better': True,
 'fidelity_result': 'fail',
 'fidelity_threshold': 0.5,
 'fidelity_higher_is_better': True,
 'top1_relevance_result': 'fail',
 'top1_relevance_threshold': 50.0,
 'top1_relevance_higher_is_better': True,
 'top3_max_relevance_result': 'fail',
 'top3_max_relevance_threshold': 50.0,
 'top3_max_relevance_higher_is_better': True,
 'holes_result': 'fail',
 'holes_threshold': 0,
 'holes_higher_is_better': False,
 'holes_ratio_result': 'fail',
 'holes_ratio_threshold': 0,
 'holes_ratio_higher_is_better': False,
 'total_retrieved_documents_result': 'fail',
 'total_retrieved_document