In [1]:
import os
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.metrics import GEval
from deepeval.models import GeminiModel
from deepeval.test_case import LLMTestCaseParams, LLMTestCase
from deepeval.evaluate import CacheConfig
from dataclasses import dataclass, asdict
from typing import List, Optional, Dict, Any
import glob
import json

In [2]:
@dataclass
class EvaluationResult:
    """One scored evaluation: an identifier, the score and the stated reason.

    Attributes:
        eval_id: Identifier of the evaluated item.
        score: Numeric score assigned to the item.
        reason: Textual justification that accompanies the score.
    """

    eval_id: str
    score: float
    reason: str

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain, JSON-serializable dict."""
        fields_as_dict = asdict(self)
        return fields_as_dict

    def save_to_json(self, filename: str) -> None:
        """Write this result to ``filename`` as JSON with 4-space indentation.

        An existing file at ``filename`` is overwritten.
        """
        with open(filename, "w") as out_file:
            payload = self.to_dict()
            json.dump(payload, out_file, indent=4)

@dataclass
class ExperimentResults:
    """Collection of every ``EvaluationResult`` gathered during an experiment.

    Attributes:
        results: The individual evaluation results, in the order they were
            collected.
    """

    results: List[EvaluationResult]

    def to_dict(self) -> dict:
        """Return ``{'results': [...]}`` with each entry converted via ``to_dict``."""
        serialized = []
        for entry in self.results:
            serialized.append(entry.to_dict())
        return {"results": serialized}

    def save_to_json(self, filename: str) -> None:
        """Write all results to ``filename`` as JSON with 4-space indentation.

        An existing file at ``filename`` is overwritten.
        """
        with open(filename, "w") as out_file:
            payload = self.to_dict()
            json.dump(payload, out_file, indent=4)

In [3]:
def read_file_contents(filepath):
    """Return the full text of ``filepath``, decoded as UTF-8."""
    with open(filepath, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    return text

def file_content_iterator(root_dir, whitelist):
    """Yield the generation results stored for every whitelisted model.

    The directory layout walked is
    ``root_dir/<provider>/<model_name>/generation_result_*.txt``.

    Providers, models and result files are each visited in sorted name order.
    ``os.listdir`` and ``glob.glob`` return entries in arbitrary,
    filesystem-dependent order, so sorting makes repeated runs produce results
    in the same sequence.

    Args:
        root_dir: Directory whose immediate sub-directories are providers.
        whitelist: Container of ``"<provider>/<model_name>"`` keys; only models
            whose key is in it are visited.

    Yields:
        Tuples ``(provider, model_name, filename, content)`` where ``filename``
        is the base name of the result file and ``content`` is its text.
    """
    for provider in sorted(os.listdir(root_dir)):
        provider_path = os.path.join(root_dir, provider)
        if not os.path.isdir(provider_path):
            continue

        for model_name in sorted(os.listdir(provider_path)):
            if f"{provider}/{model_name}" not in whitelist:
                continue

            model_path = os.path.join(provider_path, model_name)
            if not os.path.isdir(model_path):
                continue

            # Escape the directory part so glob metacharacters in a provider or
            # model name (e.g. "[", "*", "?") are matched literally; only the
            # file-name part is meant to act as a pattern.
            pattern = os.path.join(glob.escape(model_path), "generation_result_*.txt")
            for txt_file in sorted(glob.glob(pattern)):
                filename = os.path.basename(txt_file)
                content = read_file_contents(txt_file)
                yield provider, model_name, filename, content
             
def run_eval(root_dir, whitelist):
    """Score every whitelisted generation result and collect the outcomes.

    Each file produced by ``file_content_iterator`` is wrapped in an
    ``LLMTestCase`` (empty input, file text as actual output, the module-level
    ``ground_truth`` as expected output) and evaluated on its own with the
    module-level ``bdi_plan_correctness`` metric. Both globals must be defined
    before this function is called.

    NOTE(review): ``eval_id`` is ``"<provider>/<model>"`` without the file
    name, so several result files of one model share the same id — confirm
    this is intended.

    Args:
        root_dir: Root directory handed to ``file_content_iterator``.
        whitelist: Whitelisted ``"<provider>/<model>"`` keys.

    Returns:
        A list of ``EvaluationResult``, one per evaluated file.
    """
    collected = []

    for provider, model, filename, content in file_content_iterator(root_dir, whitelist):
        print(f"Evaluating: {provider}/{model}/{filename}")

        # Display options (show_indicator, display, print_results) are not
        # passed, so evaluate() runs with its defaults for them.
        outcome = evaluate(
            test_cases=[
                LLMTestCase(
                    input="",
                    actual_output=content,
                    expected_output=ground_truth,
                )
            ],
            metrics=[bdi_plan_correctness],
        )

        # One test case and one metric were submitted, hence index 0 twice.
        metric_data = outcome.test_results[0].metrics_data[0]
        collected.append(
            EvaluationResult(
                eval_id=f"{provider}/{model}",
                score=metric_data.score,
                reason=metric_data.reason,
            )
        )

    return collected

def load_whitelist(file_path):
    """Read whitelist entries from a UTF-8 text file, one entry per line.

    Surrounding whitespace is stripped from every line. Blank lines and
    comment lines are skipped. A line counts as a comment when its first
    non-whitespace character is ``#``, so indented comments are ignored too
    rather than being returned as entries.

    Args:
        file_path: Path of the whitelist file.

    Returns:
        A ``set`` with the stripped, non-comment, non-empty lines.
    """
    entries = set()
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            entry = raw_line.strip()
            # Test the stripped text so leading whitespace cannot hide a '#'.
            if not entry or entry.startswith('#'):
                continue
            entries.add(entry)
    return entries

In [15]:
# Export the variables defined in the .env file into the process environment
# so the os.environ lookup below can see them.
load_dotenv()

# Reference plan library; run_eval passes it as the expected output of every
# test case.
ground_truth = """
EVENT: achieve reach(Object)
CONDITIONS:
  - there_is(Object, here)
OPERATIONS:
  - <none>

EVENT: achieve reach(Object)
CONDITIONS:
  - there_is(Object, Direction)
OPERATIONS:
  - execute move(Direction)

EVENT: achieve reach(Object)
CONDITIONS:
  - not(there_is(Object, _))
OPERATIONS:
  - execute getDirectionToMove(Direction)
  - execute move(Direction)
  - achieve reach(Object)
"""

# Judge model handed to the G-Eval metric below.
# The original inline note also named "gemini-2.5-flash" next to the model.
judge_deep_eval = GeminiModel(
    api_key=os.getenv("GOOGLE_API_KEY"),
    model_name="gemini-2.5-pro",
)

# Instructions the judge follows, in order, when scoring an output.
plan_evaluation_steps = [
    "Extract invented goals, beliefs, and plans from the 'actual output'.",
    "Compare extracted plans against 'expected output' plans for logical equivalence and coverage.",
    "Assess if invented goals/beliefs are necessary or add needless complexity compared to 'expected output'.",
    "Evaluate plan minimality; penalize unnecessary subgoals, conditions, or operations vs 'expected output'.",
    "Verify that operations correctly use specified prefixes (execute, achieve, add, etc.) and admissible actions.",
    "Check if conditions logically correspond to the intended plan activation scenario.",
    "Score based on plan correctness, necessity of inventions, and adherence to minimality principle.",
]

# Only the actual and expected outputs are exposed to the judge.
plan_evaluation_params = [
    LLMTestCaseParams.ACTUAL_OUTPUT,
    LLMTestCaseParams.EXPECTED_OUTPUT,
]

bdi_plan_correctness = GEval(
    name="BDI Plan Correctness and Minimality",
    evaluation_steps=plan_evaluation_steps,
    evaluation_params=plan_evaluation_params,
    model=judge_deep_eval,
)

In [None]:
# Input locations: the directory scanned for results and the whitelist file
# naming the "<provider>/<model>" entries to evaluate.
root_dir = "../metrics"
whitelist_filename = "whitelist.txt"

whitelist = load_whitelist(whitelist_filename)

# Evaluate every whitelisted result file, then persist all scores at once.
evals = run_eval(root_dir, whitelist)
experiment_results = ExperimentResults(evals)
experiment_results.save_to_json("result.json")