# Clinia API Entities Agent Evaluation

This notebook evaluates the performance of `clinia_api_entities_agent_ai.py`.

It measures:
1. Runtime for the agent run
2. Total tokens consumed (prompt + completion), estimated locally with `tiktoken`
3. Accuracy of the generated entity map compared to the ground-truth `entities_map.json`

In [44]:
# Imports and setup
import asyncio
import csv
import json
import os
import re
import sys
import time
from pathlib import Path

import nest_asyncio
import tiktoken

# Put the project sources on sys.path *before* importing the agent module:
# extending sys.path after the import has no effect on that import.
repo_root = Path.cwd().parent
_src_dir = str(repo_root / 'src')
if _src_dir not in sys.path:  # keep sys.path clean when the cell is re-run
    sys.path.append(_src_dir)

from clinia_api_entities_agent_ai import (  # noqa: E402  (must follow the sys.path setup)
    CliniaModuleAgentDeps,
    get_clients,
    module_agent_prompt,
    tools_refiner_agent,
)

# Allow nested asyncio event loops (required when running inside Jupyter)
nest_asyncio.apply()



In [45]:
# Helper to count tokens
def count_tokens(text: str, model: str = 'gpt-4o-mini') -> int:
    """Return the number of tokens tiktoken produces for *text*.

    The encoding registered for *model* is used; if tiktoken raises
    ``KeyError`` for the model name, the ``cl100k_base`` encoding is used
    instead.
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # Model name not known to tiktoken: fall back to a fixed encoding.
        encoder = tiktoken.get_encoding('cl100k_base')
    token_ids = encoder.encode(text)
    return len(token_ids)

In [46]:
def prepare_dependencies():
    """Create the clients via ``get_clients()`` and wrap them in ``CliniaModuleAgentDeps``."""
    embedder, db_client = get_clients()
    return CliniaModuleAgentDeps(supabase=db_client, embedding_client=embedder)

In [47]:
async def run_agent(query, deps):
    """Await ``tools_refiner_agent.run`` for *query* and time the call.

    Returns:
        A ``(response, runtime)`` tuple where *runtime* is the elapsed
        wall-clock time in seconds measured with ``time.perf_counter``.
    """
    started_at = time.perf_counter()
    result = await tools_refiner_agent.run(query, deps=deps)
    elapsed = time.perf_counter() - started_at
    return result, elapsed

# Synchronous variant that relies on nest_asyncio
def run_agent_sync(query, deps):
    """Run ``run_agent`` to completion with ``asyncio.run`` and return its ``(response, runtime)`` result."""
    pending = run_agent(query, deps)
    return asyncio.run(pending)

In [48]:
def compute_tokens(prompt, completion, model='gpt-4o-mini'):
    """Count tokens for the prompt and the completion with ``count_tokens``.

    Returns:
        ``(prompt_tokens, completion_tokens, total_tokens)`` where the total
        is the sum of the first two.
    """
    n_prompt, n_completion = (
        count_tokens(part, model) for part in (prompt, completion)
    )
    return n_prompt, n_completion, n_prompt + n_completion


In [49]:
def evaluate_accuracy(markdown_output, gt_path=None):
    """Compare entities found in the generated markdown with the ground truth.

    Entities are read from lines of the form ``# Entity: <name>`` in
    *markdown_output*; ground-truth entities are the ``id`` values of the
    ``nodes`` list in the JSON file at *gt_path*.

    Args:
        markdown_output: Markdown text produced by the agent.
        gt_path: Optional path to the ground-truth JSON file. Defaults to
            ``<cwd>/../evals/data/entities_map.json``.

    Returns:
        ``(truth_entities, found_entities, correct, accuracy, missing, extra)``
        where the entity collections are sets and *accuracy* is the fraction
        of ground-truth entities that were found (0 when there is no ground
        truth).
    """
    if gt_path is None:
        gt_path = Path.cwd().parent / 'evals' / 'data' / 'entities_map.json'
    with open(gt_path, 'r', encoding='utf-8') as f:
        gt = json.load(f)
    truth_entities = {n['id'] for n in gt['nodes']}

    raw_names = re.findall(r'^#\s*Entity:\s*(.+)$', markdown_output, flags=re.MULTILINE)
    # `.+` also captures trailing spaces and the '\r' of CRLF line endings,
    # which would make an otherwise correct name fail the set comparison.
    found_entities = {name.strip() for name in raw_names}
    found_entities.discard('')  # whitespace-only heading carries no entity name

    correct = truth_entities & found_entities
    accuracy = len(correct) / len(truth_entities) if truth_entities else 0.0
    missing = truth_entities - found_entities
    extra = found_entities - truth_entities
    return truth_entities, found_entities, correct, accuracy, missing, extra

In [50]:
def append_results_to_csv(csv_path, row, header):
    """Append *row* to the CSV file at *csv_path*, writing *header* first if needed.

    The header is written when the file does not exist yet or exists but is
    empty, so a pre-created empty file still ends up with column names.

    Args:
        csv_path: Path of the CSV file to append to (created if missing).
        row: Sequence of values for the new data row.
        header: Sequence of column names.
    """
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(header)
        writer.writerow(row)

In [51]:
def run_full_evaluation(query, csv_path='results.csv'):
    """Run the agent synchronously on *query*, score the output and log it to CSV.

    Token counts are computed locally from ``module_agent_prompt + query``
    and from the agent's returned ``data``; accuracy comes from
    ``evaluate_accuracy``. One row is appended to *csv_path*.

    Returns:
        A dict with the runtime, token counts, entity counts, accuracy and
        the sets of missing and extra entities.
    """
    agent_deps = prepare_dependencies()

    # Use the synchronous wrapper so this can be called from a Jupyter cell
    result, elapsed = run_agent_sync(query, agent_deps)

    generated_md = result.data
    full_prompt = module_agent_prompt + query
    n_prompt, n_completion, n_total = compute_tokens(full_prompt, generated_md)
    expected, produced, matched, score, absent, surplus = evaluate_accuracy(generated_md)

    columns = [
        'query',
        'runtime_seconds',
        'prompt_tokens',
        'completion_tokens',
        'total_tokens',
        'ground_truth_entities',
        'generated_entities',
        'correctly_generated',
        'accuracy',
        'missing',
        'extra',
    ]
    record = [
        query,
        f'{elapsed:.2f}',
        n_prompt,
        n_completion,
        n_total,
        len(expected),
        len(produced),
        len(matched),
        f'{score:.2%}',
        ';'.join(sorted(absent)),
        ';'.join(sorted(surplus)),
    ]
    append_results_to_csv(csv_path, record, columns)

    # The returned summary keeps raw values (float runtime/accuracy, sets),
    # unlike the CSV row which stores formatted strings.
    return {
        'runtime_seconds': elapsed,
        'prompt_tokens': n_prompt,
        'completion_tokens': n_completion,
        'total_tokens': n_total,
        'ground_truth_entities': len(expected),
        'generated_entities': len(produced),
        'correctly_generated': len(matched),
        'accuracy': score,
        'missing': absent,
        'extra': surplus,
    }

In [52]:
# If nest_asyncio is not installed, uncomment and run this cell
# !pip install nest_asyncio

# Example of asynchronous execution directly in a Jupyter cell
# To run this cell, uncomment the code and use the %%cell_magic magic command
# NOTE(review): `%%cell_magic` looks like a placeholder rather than an actual magic name — confirm

'''
%%cell_magic

async def main():
    deps = prepare_dependencies()
    response, runtime = await run_agent("votre requête ici", deps)
    return response, runtime

result = await main()
print(result)
'''

'\n%%cell_magic\n\nasync def main():\n    deps = prepare_dependencies()\n    response, runtime = await run_agent("votre requête ici", deps)\n    return response, runtime\n\nresult = await main()\nprint(result)\n'

In [53]:
# Asynchronous version of the evaluation function
async def run_full_evaluation_async(query, csv_path='results.csv'):
    """Await the agent on *query*, score its output and append a row to *csv_path*.

    Token counts are computed locally from ``module_agent_prompt + query``
    and from the agent's returned ``data``; accuracy comes from
    ``evaluate_accuracy``.

    Returns:
        A dict with the runtime, token counts, entity counts, accuracy and
        the sets of missing and extra entities.
    """
    agent_deps = prepare_dependencies()
    result, elapsed = await run_agent(query, agent_deps)

    generated_md = result.data
    full_prompt = module_agent_prompt + query
    n_prompt, n_completion, n_total = compute_tokens(full_prompt, generated_md)
    expected, produced, matched, score, absent, surplus = evaluate_accuracy(generated_md)

    columns = [
        'query', 'runtime_seconds', 'prompt_tokens', 'completion_tokens', 'total_tokens',
        'ground_truth_entities', 'generated_entities', 'correctly_generated', 'accuracy',
        'missing', 'extra',
    ]
    # CSV row: runtime and accuracy are stored as formatted strings,
    # entity sets as ';'-joined sorted names.
    record = [query, f'{elapsed:.2f}', n_prompt, n_completion, n_total]
    record += [len(expected), len(produced), len(matched), f'{score:.2%}']
    record += [';'.join(sorted(absent)), ';'.join(sorted(surplus))]
    append_results_to_csv(csv_path, record, columns)

    summary = {
        'runtime_seconds': elapsed,
        'prompt_tokens': n_prompt,
        'completion_tokens': n_completion,
        'total_tokens': n_total,
        'ground_truth_entities': len(expected),
        'generated_entities': len(produced),
        'correctly_generated': len(matched),
        'accuracy': score,
        'missing': absent,
        'extra': surplus,
    }
    return summary

In [54]:
# Run the full evaluation and record the results in the CSV
query = "generate the markdown file containing the different entities from the clinia api documentation"

# Use the synchronous version of the evaluation
eval_results = run_full_evaluation(query)
print("Evaluation Results:")
print(eval_results)

01:57:42.419 tools_refiner_agent run prompt=generate the markdown file containing the different entities from the clinia api documentation
01:57:42.433   preparing model and tools run_step=1
01:57:42.434   model request
01:57:42.443     Chat Completion with 'gpt-4.1-mini' [LLM]
01:57:43.797   handle model response
01:57:43.798     running tools=['retrieve_relevant_documentation']
01:57:43.799     Embedding Creation with 'text-embedding-3-small' [LLM]
01:57:43.797   handle model response
01:57:43.798     running tools=['retrieve_relevant_documentation']
01:57:43.799     Embedding Creation with 'text-embedding-3-small' [LLM]
01:57:45.958   preparing model and tools run_step=2
01:57:45.959   model request
01:57:45.962     Chat Completion with 'gpt-4.1-mini' [LLM]
01:57:45.958   preparing model and tools run_step=2
01:57:45.959   model request
01:57:45.962     Chat Completion with 'gpt-4.1-mini' [LLM]
01:57:47.003   handle model response
01:57:47.004     running tools=['retrieve_relevant_d