In [1]:
import os
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from dotenv import load_dotenv

# Load environment variables from the .env file at the relative path '../../.env'
# so the os.environ lookups below can find them.
load_dotenv('../../.env')

# Shared Azure OpenAI connection settings; this object is passed to every
# model-backed evaluator constructed later in the notebook.
# Each os.environ[...] lookup raises KeyError if the variable is not set.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT"],
)

# Evaluating Response Quality with RelevanceEvaluator

In [2]:
from azure.ai.evaluation import RelevanceEvaluator

# Build the relevance evaluator on the shared Azure OpenAI model configuration.
relevance_evaluator = RelevanceEvaluator(model_config)

# Evaluate a single query-response pair
result = relevance_evaluator(
    query="What is the capital of France?",
    response="The capital of France is Paris.",
)

# The printed dict carries the score, a textual reason, and a pass/fail result
# against a threshold (see the recorded output below).
print(result)

{'relevance': 4.0, 'gpt_relevance': 4.0, 'relevance_reason': 'The response directly and accurately answers the query by stating that Paris is the capital of France. It fully satisfies the question without omissions or additional context.', 'relevance_result': 'pass', 'relevance_threshold': 3}


# Evaluating Response Quality with CoherenceEvaluator

In [3]:
from azure.ai.evaluation import CoherenceEvaluator

# Build the coherence evaluator on the shared Azure OpenAI model configuration.
coherence_evaluator = CoherenceEvaluator(model_config)

# Score one query/response pair. In the recorded run below, the one-word answer
# scores 4.0 against a threshold of 3 and is reported as 'pass'.
result = coherence_evaluator(
    query="What's the capital of France?", 
    response="Paris."
)
print(result)

{'coherence': 4.0, 'gpt_coherence': 4.0, 'coherence_reason': 'The response is coherent and directly answers the query, but it is minimal and does not showcase advanced organization or flow.', 'coherence_result': 'pass', 'coherence_threshold': 3}


# Evaluating Response Quality with FluencyEvaluator

In [4]:
from azure.ai.evaluation import FluencyEvaluator

# Build the fluency evaluator on the shared Azure OpenAI model configuration.
fluency_evaluator = FluencyEvaluator(model_config)

# Only the response is passed here (no query). In the recorded run below, the
# one-word answer scores 1.0 against a threshold of 3 and is reported as 'fail'.
result = fluency_evaluator(
    response="Paris."
)
print(result)

{'fluency': 1.0, 'gpt_fluency': 1.0, 'fluency_reason': 'The RESPONSE is a single word without any grammatical structure or context, making it incomprehensible and indicative of minimal command of the language.', 'fluency_result': 'fail', 'fluency_threshold': 3}


# Evaluating Response Quality with GroundednessEvaluator

In [5]:
from azure.ai.evaluation import GroundednessEvaluator

# Build the groundedness evaluator on the shared Azure OpenAI model configuration.
groundedness_evaluator = GroundednessEvaluator(model_config)

# Score a response against the supplied context. Note that the response says
# 1938 while the context says 1928; the recorded run below still scores 3.0,
# which meets the threshold of 3 and is reported as 'pass'.
result = groundedness_evaluator(
    query="Who discovered penicillin?",
    context="Alexander Fleming discovered penicillin in 1928 while working at St. Mary's Hospital in London.",
    response="Alexander Fleming discovered penicillin in 1938.",
)
print(result)

{'groundedness': 3.0, 'gpt_groundedness': 3.0, 'groundedness_reason': 'The RESPONSE provides the correct name of the discoverer but includes an incorrect year, which makes it an attempt to respond but with erroneous details.', 'groundedness_result': 'pass', 'groundedness_threshold': 3}


# Creating custom evaluators

## Code-based evaluators

### Function-based evaluator

In [6]:
# Code-based custom evaluator: reports how long a response is
def response_length_evaluator(response, **kwargs):
    """Return the length of ``response`` under the ``response_length`` key.

    Any additional keyword arguments are accepted and ignored.
    """
    length = len(response)
    return {"response_length": length}


# Try it on a sample string
result = response_length_evaluator(response="Hello, world!")
print(result)

{'response_length': 13}


### Class-based evaluator

In [7]:
import re


# Custom class-based evaluator to check for blocked words
class BlocklistEvaluator:
    """Flag responses that contain any word or phrase from a blocklist.

    Matching is case-insensitive and anchored on word boundaries, so a blocked
    "worst" matches "WORST", while a blocked "bad" does not match "badge".
    """

    def __init__(self, blocklist):
        """Store the blocked words.

        Args:
            blocklist: Iterable of strings (words or phrases) to look for.
        """
        self.blocklist = blocklist

    def __call__(self, *, response: str, **kwargs):
        """Check ``response`` for blocked words.

        Args:
            response: Text to scan.
            **kwargs: Accepted and ignored.

        Returns:
            ``{"contains_blocked_word": bool}`` - True when at least one
            blocklist entry occurs in ``response`` as a whole word or phrase.
        """
        contains_blocked_word = any(
            # Lookarounds instead of \b so entries that start or end with a
            # non-word character (e.g. "c++") are still matched correctly.
            re.search(rf"(?<!\w){re.escape(word)}(?!\w)", response, re.IGNORECASE)
            for word in self.blocklist
            if word  # an empty entry would otherwise match every response
        )
        return {"contains_blocked_word": contains_blocked_word}


# Example usage
blocklist_evaluator = BlocklistEvaluator(blocklist=["bad", "evil", "worst"])
result = blocklist_evaluator(response="This is the worst response ever!")
print(result)

{'contains_blocked_word': True}


## Prompt-based evaluators

### Helpfulness evaluator

In [8]:
from helpfulness import HelpfulnessEvaluator

# HelpfulnessEvaluator comes from the local `helpfulness` module (not shown in
# this notebook); it is constructed from the same model configuration as the
# built-in evaluators.
helpfulness_evaluator = HelpfulnessEvaluator(model_config)

# Score one query/context/response triple. The recorded output below contains
# a 'helpfulness' score and a 'helpfulness_reason' explanation.
helpfulness_score = helpfulness_evaluator(
    query="What's the meaning of life?", 
    context="Arthur Schopenhauer was the first to explicitly ask the question, in an essay entitled 'Character'.", 
    response="The answer is 42."
)
print(helpfulness_score)

{'helpfulness': 1.0, 'helpfulness_reason': 'The RESPONSE is entirely unhelpful as it does not address the philosophical question or provide any useful information related to the CONTEXT or QUERY.'}


### JSON accuracy evaluator

In [9]:
import json
from json_schema import JSONSchemaEvaluator

# Load the JSON schema the output is checked against (jsons/example_schema.json).
# The context manager closes the file handle; JSON text is read as UTF-8
# rather than the platform's locale encoding.
with open('jsons/example_schema.json', 'r', encoding='utf-8') as schema_file:
    example_json_schema = json.load(schema_file)

# Load the sample JSON output to evaluate (jsons/poor_output.json)
with open('jsons/poor_output.json', 'r', encoding='utf-8') as output_file:
    sample_json_data = json.load(output_file)

# JSONSchemaEvaluator comes from the local `json_schema` module (not shown in
# this notebook); it receives the parsed output and the parsed schema.
json_schema_evaluator = JSONSchemaEvaluator(model_config)
json_schema_score = json_schema_evaluator(json_output=sample_json_data, schema=example_json_schema)
print(json_schema_score)

{'json_schema': 0.5, 'json_schema_reason': 'The JSON output is partially correct but contains a missing required field ("companyName") in "companyInfo". All other fields, types, and formats are correct.'}


# Evaluating a dataset

In [None]:
import pandas as pd
from azure.ai.evaluation import evaluate, RetrievalEvaluator, RelevanceEvaluator, FluencyEvaluator, CoherenceEvaluator, GroundednessEvaluator
# Imported in this cell as well (not only in the earlier helpfulness cell) so
# that HelpfulnessEvaluator below resolves without relying on that cell having run.
from helpfulness import HelpfulnessEvaluator
from pprint import pprint
from model_endpoint import ModelEndpoint

# Azure AI project the evaluation run is associated with; raises KeyError if unset.
ai_project_endpoint = os.environ["AI_PROJECT_ENDPOINT"]

# Define your evaluators
relevance_evaluator = RelevanceEvaluator(model_config)
coherence_evaluator = CoherenceEvaluator(model_config)
fluency_evaluator = FluencyEvaluator(model_config)
groundedness_evaluator = GroundednessEvaluator(model_config)
retrieval_evaluator = RetrievalEvaluator(model_config)
helpfulness_evaluator = HelpfulnessEvaluator(model_config)

# Evaluate the dataset.
# Column mappings use "${data.<column>}" for fields of the input dataset and
# "${target.<field>}" for fields produced by the target (ModelEndpoint).
result = evaluate(
    evaluation_name="Quality Evaluators - Single Model",
    data="evaluation_dataset.jsonl",
    target=ModelEndpoint(model_config),
    evaluators={
        # Performance and quality evaluators (AI-assisted)
        "relevance": relevance_evaluator,
        "coherence": coherence_evaluator,
        "fluency": fluency_evaluator,
        "groundedness": groundedness_evaluator,
        "retrieval": retrieval_evaluator,
        # Custom evaluators (code and prompt based)
        "helpfulness": helpfulness_evaluator,
    },
    evaluator_config={
        "relevance": {
            "column_mapping": {"response": "${target.response}", "context": "${data.context}", "query": "${data.query}"}
        },
        "coherence": {
            "column_mapping": {"response": "${target.response}", "query": "${data.query}"}
        },
        "fluency": {
            "column_mapping": {"response": "${target.response}"}
        },
        "groundedness": {
            "column_mapping": {"response": "${target.response}", "context": "${data.context}", "query": "${data.query}"}
        },
        "retrieval": {
            "column_mapping": {"context": "${data.context}", "query": "${data.query}"}
        },
        "helpfulness": {
            "column_mapping": {"response": "${target.response}", "context": "${data.context}", "query": "${data.query}"}
        },
    },
    azure_ai_project=ai_project_endpoint,
    # The results are also written to this local JSON file.
    output_path="./evaluation_results.json",
)

# .get() is used so a result without a "studio_url" entry prints None instead of raising.
print(f'AI Foundry URL: {result.get("studio_url")}')

In [11]:
# Render the "rows" entry of the evaluate() result as a DataFrame; as the last
# expression in the cell it is displayed as the cell output.
pd.DataFrame(result["rows"])

Unnamed: 0,inputs.query,inputs.context,inputs.ground_truth,inputs.line_number,outputs.query,outputs.response,outputs.relevance.relevance,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,outputs.relevance.relevance_result,...,outputs.groundedness.groundedness_result,outputs.groundedness.groundedness_threshold,outputs.retrieval.retrieval,outputs.retrieval.gpt_retrieval,outputs.retrieval.retrieval_reason,outputs.retrieval.retrieval_result,outputs.retrieval.retrieval_threshold,outputs.helpfulness.helpfulness,outputs.helpfulness.helpfulness_reason,line_number
0,"What event started on July 28, 1914?",It involved multiple countries and lasted unti...,World War I,0,"What event started on July 28, 1914?","On **July 28, 1914**, **World War I** official...",5.0,5.0,The response directly answers the query by sta...,pass,...,pass,3,2.0,2.0,The context is partially relevant as it hints ...,fail,3,5.0,The RESPONSE is entirely helpful as it accurat...,0
1,Who was the first person to walk on the moon?,The event occurred during the Apollo 11 missio...,Neil Armstrong,1,Who was the first person to walk on the moon?,The first person to walk on the moon was **Nei...,5.0,5.0,The response directly answers the query by ide...,pass,...,pass,3,2.0,2.0,The context is partially relevant as it mentio...,fail,3,5.0,The RESPONSE fully answers the QUERY with accu...,1
2,What was the significance of the year 1776 in ...,A key document was signed declaring independen...,The Declaration of Independence,2,What was the significance of the year 1776 in ...,The year 1776 is one of the most significant i...,5.0,5.0,The response thoroughly addresses the query by...,pass,...,pass,3,5.0,5.0,The context fully addresses the query and plac...,pass,3,5.0,The RESPONSE is entirely helpful as it fully a...,2
3,"Which wall fell in 1989, symbolizing the end o...",It divided a German city into East and West.,The Berlin Wall,3,"Which wall fell in 1989, symbolizing the end o...","The **Berlin Wall** fell in 1989, symbolizing ...",5.0,5.0,The response directly answers the query by ide...,pass,...,pass,3,2.0,2.0,The context is partially relevant but does not...,fail,3,5.0,"The RESPONSE fully answers the QUERY, provides...",3
4,What ancient city was buried by the eruption o...,The city's ruins were rediscovered in the 18th...,Pompeii,4,What ancient city was buried by the eruption o...,The ancient city of **Pompeii** was famously b...,5.0,5.0,The response directly answers the query by ide...,pass,...,fail,3,2.0,2.0,The context is partially relevant but does not...,fail,3,5.0,The RESPONSE fully answers the QUERY and provi...,4
5,Who was the British Prime Minister during Worl...,"He is famous for his leadership and speeches, ...",Winston Churchill,5,Who was the British Prime Minister during Worl...,The British Prime Minister during most of Worl...,5.0,5.0,The response fully answers the query by identi...,pass,...,pass,3,2.0,2.0,The context is partially relevant as it hints ...,fail,3,5.0,The RESPONSE is entirely helpful as it accurat...,5
6,What was the name of the ship that sank on its...,It was deemed 'unsinkable' before it hit an ic...,RMS Titanic,6,What was the name of the ship that sank on its...,The ship that sank on its maiden voyage in 191...,5.0,5.0,The response directly answers the query by nam...,pass,...,pass,3,2.0,2.0,The context is partially relevant as it provid...,fail,3,5.0,The RESPONSE is entirely helpful as it accurat...,6
7,Which empire was ruled by Genghis Khan?,This empire became the largest contiguous land...,The Mongol Empire,7,Which empire was ruled by Genghis Khan?,"Genghis Khan ruled the **Mongol Empire**, whic...",5.0,5.0,The response directly answers the query by ide...,pass,...,pass,3,2.0,2.0,The context is partially relevant to the query...,fail,3,5.0,The RESPONSE is entirely helpful as it accurat...,7
8,What was the primary cause of the American Civ...,The conflict between the Northern and Southern...,Slavery,8,What was the primary cause of the American Civ...,The primary cause of the American Civil War wa...,5.0,5.0,The response thoroughly addresses the query by...,pass,...,pass,3,5.0,5.0,"The context is highly relevant to the query, d...",pass,3,5.0,The RESPONSE is entirely helpful as it compreh...,8
9,Which ancient wonder was located in Egypt and ...,It is the only one of the Seven Wonders of the...,The Great Pyramid of Giza,9,Which ancient wonder was located in Egypt and ...,The ancient wonder located in Egypt that serve...,5.0,5.0,The response fully answers the query by identi...,pass,...,pass,3,2.0,2.0,The context is partially relevant as it indire...,fail,3,5.0,"The RESPONSE fully answers the QUERY, providin...",9


# Document Retrieval Evaluator

In [12]:
from azure.ai.evaluation import DocumentRetrievalEvaluator

# Ground-truth relevance judgments, as supplied by your human or LLM judges,
# written as (document_id, query_relevance_label) pairs.
retrieval_ground_truth = [
    {"document_id": doc_id, "query_relevance_label": label}
    for doc_id, label in (("1", 4), ("2", 2), ("3", 3), ("4", 1), ("5", 0))
]
# Lower and upper bound of the label scale; both are inputs to the document
# retrieval evaluator.
ground_truth_label_min, ground_truth_label_max = 0, 4

# Results from your search retrieval system with its own relevance scores,
# listed in descending score order. Documents "6" and "7" have no entry in the
# ground truth above.
retrieved_documents = [
    {"document_id": doc_id, "relevance_score": score}
    for doc_id, score in (("2", 45.1), ("6", 35.8), ("3", 29.2), ("5", 25.4), ("7", 18.8))
]

# NOTE(review): top1_relevance_threshold and top3_max_relevance_threshold are
# 50.0 while the label scale tops out at 4, and the recorded output marks both
# metrics as higher-is-better and 'fail' - confirm these thresholds are intended.
document_retrieval_evaluator = DocumentRetrievalEvaluator(
    ground_truth_label_min=ground_truth_label_min,
    ground_truth_label_max=ground_truth_label_max,
    ndcg_threshold=0.5,
    xdcg_threshold=50.0,
    fidelity_threshold=0.5,
    top1_relevance_threshold=50.0,
    top3_max_relevance_threshold=50.0,
    total_retrieved_documents_threshold=50,
    total_ground_truth_documents_threshold=50,
)
# Last expression of the cell, so the metrics dict is shown as the cell output.
document_retrieval_evaluator(
    retrieval_ground_truth=retrieval_ground_truth,
    retrieved_documents=retrieved_documents,
)

{'ndcg@3': 0.31075932533963707,
 'xdcg@3': 39.285714285714285,
 'fidelity': 0.39285714285714285,
 'top1_relevance': 2,
 'top3_max_relevance': 3,
 'holes': 2,
 'holes_ratio': 0.4,
 'total_retrieved_documents': 5,
 'total_ground_truth_documents': 5,
 'ndcg@3_result': 'fail',
 'ndcg@3_threshold': 0.5,
 'ndcg@3_higher_is_better': True,
 'xdcg@3_result': 'fail',
 'xdcg@3_threshold': 50.0,
 'xdcg@3_higher_is_better': True,
 'fidelity_result': 'fail',
 'fidelity_threshold': 0.5,
 'fidelity_higher_is_better': True,
 'top1_relevance_result': 'fail',
 'top1_relevance_threshold': 50.0,
 'top1_relevance_higher_is_better': True,
 'top3_max_relevance_result': 'fail',
 'top3_max_relevance_threshold': 50.0,
 'top3_max_relevance_higher_is_better': True,
 'holes_result': 'fail',
 'holes_threshold': 0,
 'holes_higher_is_better': False,
 'holes_ratio_result': 'fail',
 'holes_ratio_threshold': 0,
 'holes_ratio_higher_is_better': False,
 'total_retrieved_documents_result': 'fail',
 'total_retrieved_document