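# Custom evaluator

This notebook builds two custom evaluators: a code-based one that measures the length of an answer, and a prompt-based one that uses an LLM to rate the friendliness of a response.

> Reference: [Custom evaluators (Azure AI Foundry documentation)](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/custom-evaluators)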

In [1]:
import datetime
import json
import os
import sys

from azure.ai.evaluation import AzureOpenAIModelConfiguration
from dotenv import load_dotenv
from promptflow.client import load_flow

In [2]:
# Record the interpreter version this notebook was run with.
sys.version

'3.10.14 (main, May  6 2024, 19:42:50) [GCC 11.2.0]'

In [3]:
# Timestamp the run so the recorded outputs can be dated.
print(f"Today is {datetime.datetime.now():%d-%b-%Y %H:%M:%S}")

Today is 26-Jun-2025 12:48:56


In [4]:
# Deployment name and API version for the Azure OpenAI model.
api_version = "2024-10-21"
azure_deployment = "gpt-4.1"

# The endpoint and key are read from the local env file rather than hardcoded.
load_dotenv("azure.env")
key = os.environ.get("key")
endpoint = os.environ.get("endpoint")

In [5]:
# Bundle the connection settings into the model configuration object that is
# later handed to the prompt-based evaluator.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=endpoint,
    api_key=key,
    azure_deployment=azure_deployment,
    api_version=api_version,
)

## Code-based evaluator example

In [6]:
class Answer_Length_Evaluator:
    """Code-based evaluator that measures how long an answer is."""

    def __init__(self):
        """Create the evaluator; there is nothing to configure."""
        pass

    def __call__(self, *, answer: str, **kwargs):
        """Return the length of ``answer``.

        Args:
            answer: The answer to measure (keyword-only).
            **kwargs: Accepted and ignored.

        Returns:
            A dict with the single key ``"answer_length"`` holding ``len(answer)``.
        """
        size = len(answer)
        return {"answer_length": size}

In [7]:
# Instantiate the code-based evaluator; it takes no configuration.
answerlength_evaluator = Answer_Length_Evaluator()

In [8]:
# Sanity check on a five-character answer.
answerlength_evaluator(answer="Hello")

{'answer_length': 5}

In [9]:
# Second check with a shorter answer.
answerlength_evaluator(answer="Hi")

{'answer_length': 2}

## Prompt-based evaluators

In [10]:
prompty_file = "test.prompty"

In [11]:
!ls $prompty_file -lh

-rwxrwxrwx 1 root root 1.4K Jun 26 12:37 test.prompty


In [12]:
# Display the prompty definition. The encoding is set explicitly so the result
# does not depend on the platform's default locale encoding.
with open(prompty_file, encoding="utf-8") as f:
    print(f.read())

---
name: Friendliness Evaluator
description: Friendliness Evaluator to measure warmth and approachability of answers.
model:
  api: chat
  configuration:
    type: azure_openai
    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    azure_deployment: gpt-4o-mini
  parameters:
    model:
    temperature: 0.1
inputs:
  response:
    type: string
outputs:
  score:
    type: int
  explanation:
    type: string
---

system:
Friendliness assesses the warmth and approachability of the answer. Rate the friendliness of the response between one to five stars using the following scale:

One star: the answer is unfriendly or hostile

Two stars: the answer is mostly unfriendly

Three stars: the answer is neutral

Four stars: the answer is mostly friendly

Five stars: the answer is very friendly

Please assign a rating between 1 and 5 based on the tone and demeanor of the response.

**Example 1**
generated_query: I just don't feel like helping you! Your questions are getting very annoying.
output:
{"s

In [13]:
class Friendliness_Evaluator:
    """Prompt-based evaluator that runs a prompty flow on a response.

    The prompty file is loaded once, at construction time; calling the
    instance runs the loaded flow on a single response.
    """

    def __init__(self, model_config, prompty_path=None):
        """Load the prompty file as a flow.

        Args:
            model_config: Model configuration, passed to ``load_flow`` as the
                ``configuration`` entry of its ``model`` argument.
            prompty_path: Optional path to the prompty file. When omitted, the
                notebook-level ``prompty_file`` variable is used.
        """
        if prompty_path is None:
            prompty_path = prompty_file
        self._flow = load_flow(source=prompty_path, model={"configuration": model_config})

    def __call__(self, *, response: str, **kwargs):
        """Run the flow on ``response`` and return its result.

        Args:
            response: The text to evaluate (keyword-only).
            **kwargs: Accepted and ignored.

        Returns:
            The flow output parsed as JSON when it can be parsed; otherwise
            the raw flow output, unchanged.
        """
        llm_response = self._flow(response=response)
        try:
            return json.loads(llm_response)
        except (TypeError, ValueError):
            # TypeError: the flow output is not str/bytes (e.g. already parsed).
            # ValueError: covers json.JSONDecodeError for non-JSON text.
            # In both cases fall back to the raw output instead of failing.
            return llm_response

In [14]:
# Build the prompt-based evaluator; construction loads the prompty file as a flow.
friendliness_eval = Friendliness_Evaluator(model_config)

In [15]:
# Evaluate a first sample response and display the result.
friendliness_score = friendliness_eval(response="Not my fault")
friendliness_score

{'score': 2,
 'reason': 'The response is dismissive and lacks warmth or approachability. It does not engage with the user in a friendly or helpful manner.'}

In [16]:
# Evaluate a second sample response; this overwrites the previous friendliness_score.
friendliness_score = friendliness_eval(response="I am sorry. My fault")
friendliness_score

{'score': 4,
 'reason': 'The response is polite and acknowledges responsibility, which is friendly and considerate, though it could be warmer with additional reassurance or support.'}