In [3]:
import yaml
from pprint import pprint

with open('../examples/eval_template.yaml') as f:
    eval_template = yaml.safe_load(f)

with open('../examples/result_example.yaml') as f:
    result_example = yaml.safe_load(f)

pprint(eval_template)
print('---')
pprint(result_example)

{'ideal_responses': ['This is the ideal response (to the first prompt).\n'
                     'This field is optional.\n'
                     'It is only used if a test below is of type '
                     '`llm-similarity`, in which case it is used as the ideal '
                     'response and the LLM is asked to evaluate the similarity '
                     'of the response to the ideal answer.\n'
                     'This value could also be used to fine-tune a model.\n',
                     'This is the ideal response to the second prompt.'],
 'metadata': {'description': 'This is an example of an eval. An eval contains '
                             'a set of prompts and tests.',
              'difficulty': 1,
              'name': 'Eval Example',
              'source': 'Anaconda',
              'tags': ['example', 'graphing', 'plotly-express'],
              'uuid': '5f8b1b4e-3b7e-4b0e-8b1b-4e3b7e4b0e8b'},
 'prompts': ['This is a question/prompt.',
             'This

In [4]:
class TestResult:
    def __init__(self, passed, description):
        self.passed = passed
        self.description = description


class Eval:
    def __init__(
            self,
            uuid: str,
            metadata: dict,
            prompts: list[str],
            ideal_responses: list[str],
            tests: list[dict],):
        

SyntaxError: incomplete input (1090252522.py, line 3)

In [5]:
from pydantic import BaseModel, Field

class Metadata(BaseModel):
    name: str | None = None
    description: str | None = None
    difficulty: int | None = None
    tags: list[str] | None = None
    source: str | None = None

In [None]:
from abc import ABC, abstractmethod
from typing import List, Callable, Dict
import functools

# TestResult class to encapsulate the result of each test
class TestResult:
    def __init__(self, passed: bool, description: str):
        self.passed = passed
        self.description = description

# Abstract base class for tests
class Test(ABC):
    @abstractmethod
    def run_test(self, response: str) -> List[TestResult]:
        pass

# Registry for test types
test_registry = {}

# Decorator to register test functions
def register_test(test_type: str):
    def decorator(test_func):
        test_registry[test_type] = test_func
        @functools.wraps(test_func)
        def wrapper(*args, **kwargs):
            return test_func(*args, **kwargs)
        return wrapper
    return decorator

# Specific test implementations
@register_test("match")
class MatchTest(Test):
    def __init__(self, value: str):
        self.value = value

    def run_test(self, response: str) -> List[TestResult]:
        return [TestResult(response == self.value, "Match test")]

@register_test("code_blocks")
class CodeBlockTest(Test):
    # Implementation for code block tests
    ...

@register_test("python")
class PythonFunctionTest(Test):
    # Implementation for python function tests
    ...

@register_test("llm-similarity")
class LLMSimilarityTest(Test):
    # Implementation for LLM similarity tests
    ...

# Test runner
def run_tests(tests_config, response: str):
    results = []
    for test_config in tests_config:
        test_type = test_config['type']
        test_class = test_registry.get(test_type)
        if test_class:
            test = test_class(**test_config)  # Assuming other necessary parameters are passed
            results.extend(test.run_test(response))
    return results

# Example usage
yaml_config = [
    {"type": "match", "value": "Expected response"},
    # Other test configurations
]

response = "Some response to test"
test_results = run_tests(yaml_config, response)


In [None]:
import contextlib
import io
import sys

@register_test("code_blocks")
class PythonCodeBlockTest(Test):
    def __init__(self, setup: str = None, checks: List[str] = None):
        self.setup = setup
        self.checks = checks or []

    def run_test(self, response: str) -> List[TestResult]:
        # Create a separate environment to run the code
        local_env = {}
        results = []

        # Redirect stdout to capture print statements
        stdout = io.StringIO()
        with contextlib.redirect_stdout(stdout):
            try:
                # Execute setup code if present
                if self.setup:
                    exec(self.setup, globals(), local_env)

                # Execute the main code block (response)
                exec(response, globals(), local_env)

                # Execute checks if present
                for check in self.checks:
                    exec(check, globals(), local_env)

                # If no errors, the code block test passes
                results.append(TestResult(True, "Code executed without errors"))
            except Exception as e:
                # If there's an error, the test fails
                results.append(TestResult(False, f"Error executing code: {e}"))

        # Optionally, include captured stdout in the test result
        captured_output = stdout.getvalue()
        if captured_output:
            results.append(TestResult(True, f"Captured stdout: {captured_output}"))

        return results

# Example usage
yaml_config = [
    {"type": "code_blocks", "setup": "import math", "checks": ["assert math.sqrt(4) == 2"]},
]

response = "print('Hello, world!')"
test_results = run_tests(yaml_config, response)


In [None]:
import importlib.util
import types


# TODO this needs to be ran in the same environment as the code block so we need to pass in the local_env


@register_test("python")
class PythonFunctionTest(Test):
    def __init__(self, file: str = None, function: str = None):
        self.file = file
        self.function_code = function

    def run_test(self, response: str) -> List[TestResult]:
        test_function = self._load_function()
        if test_function:
            return test_function(response)
        else:
            return [TestResult(False, "Failed to load test function")]

    def _load_function(self) -> Callable:
        if self.function_code:
            # Execute inline-defined function
            exec(self.function_code)
            return locals()['test_function']
        elif self.file:
            # Dynamically import function from a file
            spec = importlib.util.spec_from_file_location("module.name", self.file)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            return getattr(module, 'test_function')
        else:
            return None

# Example usage
yaml_config = [
    {"type": "python", "function": "def test_function(response): return [TestResult(response == 'expected response', 'Python function test')]"},
]

response = "expected response"
test_results = run_tests(yaml_config, response)
