# Clinia API Entities Agent Evaluation

This notebook evaluates the performance of `clinia_api_entities_agent_ai.py`.

It measures:
1. Runtime for the agent run
2. Total tokens consumed (prompt + completion)
3. Accuracy of the generated entity map compared to the ground-truth `entities_map.json`

In [26]:
# Imports and setup
import asyncio
import csv
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path

import nest_asyncio

# Extend sys.path BEFORE importing the project-local modules below: on a fresh
# kernel the path entry has to exist at import time to have any effect.
# NOTE(review): presumably `clinia_api_entities_agent_ai` and `utils` live in
# <repo_root>/src and the notebook is run from a sibling directory — confirm.
repo_root = Path.cwd().parent
src_dir = str(repo_root / 'src')
if src_dir not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(src_dir)

from clinia_api_entities_agent_ai import CliniaDocAgentsDeps, clinia_docs_agent, clinia_docs_agent_prompt  # noqa: E402
from utils import get_clients  # noqa: E402

# Allow nested asyncio event loops (needed in Jupyter, where a loop is already running)
nest_asyncio.apply()



# Method definitions for evaluation


In [27]:
def prepare_dependencies():
    """Build the dependency object handed to the agent on each run.

    Returns:
        A ``CliniaDocAgentsDeps`` instance populated with the embedding client
        and the Supabase client returned by ``get_clients()``.
    """
    clients = get_clients()
    # get_clients() yields the pair in this order: (embedding client, supabase client)
    embedding_client, supabase = clients
    return CliniaDocAgentsDeps(supabase=supabase, embedding_client=embedding_client)

In [28]:
async def run_agent(query, deps):
    """Run the agent once on ``query`` and measure how long the run takes.

    Args:
        query: Prompt passed to ``clinia_docs_agent.run``.
        deps: Dependency object forwarded to the agent as ``deps``.

    Returns:
        A ``(response, runtime)`` tuple, where ``runtime`` is the elapsed
        wall-clock time in seconds measured with ``time.perf_counter``.
    """
    started_at = time.perf_counter()
    agent_response = await clinia_docs_agent.run(query, deps=deps)
    elapsed = time.perf_counter() - started_at
    return agent_response, elapsed

# Synchronous variant that relies on nest_asyncio
def run_agent_sync(query, deps):
    """Blocking wrapper around ``run_agent``.

    Args:
        query: Prompt forwarded to ``run_agent``.
        deps: Dependency object forwarded to ``run_agent``.

    Returns:
        Whatever ``run_agent`` returns: the ``(response, runtime)`` tuple.
    """
    pending_run = run_agent(query, deps)
    return asyncio.run(pending_run)

In [29]:
def append_results_to_csv(csv_path, row, header):
    """Append one result row to a CSV file, writing the header when needed.

    The header is written first when the file does not exist yet or exists but
    is empty (for example after a run that created the file and then failed).
    Otherwise only ``row`` is appended.

    Args:
        csv_path: Path of the CSV file to append to.
        row: Sequence of values for the new row.
        header: Sequence of column names written as the first line.
    """
    # Decide before opening: opening in append mode creates the file.
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(header)
        writer.writerow(row)

# Evaluation methods
This function evaluates the accuracy of the agent by running it on a set of sample questions. For each question, we check that every expected keyword appears in the agent's answer to validate that the agent is working correctly.

In [30]:
# Evaluate the agent against the question/answer pairs stored in sample_data.json

def run_sample_data_evaluation(sample_path='../evals/data/sample_data.json', csv_path='results_sample_data.csv'):
    """Run the agent on every sample question and log keyword-match results to a CSV.

    Each sample's expected answer is split on commas into terms, and each term
    is searched for (case-insensitively, as a substring) in the agent's answer.
    One row per sample is appended to ``csv_path``.

    Args:
        sample_path: Path to a JSON file containing a list of objects with
            ``question`` and ``answer`` keys.
        csv_path: Path of the CSV file that result rows are appended to.
    """
    eval_launch_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    deps = prepare_dependencies()
    with open(sample_path, 'r', encoding='utf-8') as f:
        samples = json.load(f)
    header = [
        'eval_launch_time', 'question', 'expected_answer', 'agent_response', 'runtime_seconds', 'all_terms_found', 'missing_terms'
    ]
    for sample in samples:
        question = sample['question']
        expected_answer = sample['answer']
        response, runtime_seconds = run_agent_sync(question, deps)
        # Pull the answer off the result object. `output` is tried first, then
        # `data`; the string form of the whole response is the last resort.
        # NOTE(review): assumes newer agent-result objects expose `output`
        # instead of `data` — confirm against the installed agent library.
        if hasattr(response, 'output'):
            agent_output = response.output
        elif hasattr(response, 'data'):
            agent_output = response.data
        else:
            agent_output = response
        # The answer may be a non-string (e.g. structured) value; the keyword
        # check below needs text, so coerce instead of failing on `.lower()`.
        if not isinstance(agent_output, str):
            agent_output = str(agent_output)
        # Split the expected answer into terms (comma-separated)
        terms = [t.strip().lower() for t in expected_answer.split(',')]
        # Check that each term appears in the agent's answer (case-insensitive)
        agent_output_lower = agent_output.lower()
        missing_terms = [t for t in terms if t and t not in agent_output_lower]
        all_terms_found = len(missing_terms) == 0
        row = [
            eval_launch_time,
            question,
            expected_answer,
            agent_output,
            f'{runtime_seconds:.2f}',
            all_terms_found,
            ';'.join(missing_terms)
        ]
        append_results_to_csv(csv_path, row, header)
    print(f"Évaluation terminée. Résultats enregistrés dans {csv_path}")

In [31]:
# Run the evaluation on the sample data and save the results
run_sample_data_evaluation()

15:01:41.516 clinia_docs_agent run prompt=What is the module used for data management
15:01:41.517   preparing model and tools run_step=1
15:01:41.518   model request
15:01:41.521     Chat Completion with 'gpt-4.1-mini' [LLM]
15:01:42.299   handle model response
15:01:42.300     running tools=['retrieve_relevant_documentation']
15:01:42.300     create embedding for search_query=module used for data management
15:01:42.301       Embedding Creation with 'text-embedding-3-small' [LLM]
15:01:42.299   handle model response
15:01:42.300     running tools=['retrieve_relevant_documentation']
15:01:42.300     create embedding for search_query=module used for data management
15:01:42.301       Embedding Creation with 'text-embedding-3-small' [LLM]
15:01:43.300   preparing model and tools run_step=2
15:01:43.301   model request
15:01:43.306     Chat Completion with 'gpt-4.1-mini' [LLM]
15:01:43.300   preparing model and tools run_step=2
15:01:43.301   model request
15:01:43.306     Chat Completio