In [2]:
import yaml
from pprint import pprint

# Parse the eval template YAML file into Python data.
with open('eval_template.yaml') as f:
    eval_template = yaml.safe_load(f)

# Parse the example eval-result YAML file the same way.
with open('eval_result_example.yaml') as f:
    eval_result_example = yaml.safe_load(f)

# Pretty-print both structures, separated by a '---' line so they can be told apart.
pprint(eval_template)
print('---')
pprint(eval_result_example)

{'metadata': {'description': 'This is an example of an eval. An eval contains '
                             'a set of prompts and tests.',
              'difficulty': 1,
              'name': 'Eval Example',
              'source': 'Anaconda',
              'tags': ['example', 'graphing', 'plotly-express'],
              'uuid': '5f8b1b4e-3b7e-4b0e-8b1b-4e3b7e4b0e8b'},
 'prompts': [{'ideal_response': 'This is the ideal response (to the first '
                                'prompt).\n'
                                'This field is optional.\n'
                                'It is only used if a test below is of type '
                                '`llm-similarity`, in which case it is used as '
                                'the ideal response and the LLM is asked to '
                                'evaluate the similarity of the response to '
                                'the ideal answer.\n'
                                'This value could also be used to fine-tune a

In [4]:
# Parse the Fibonacci eval definition and pretty-print it for inspection.
with open('eval_fibonacci.yaml') as f:
    eval_fibonacci = yaml.safe_load(f)
pprint(eval_fibonacci)

{'metadata': {'description': 'This eval tests asks the LLM to create a '
                             'function that returns the nth fibonacci number. '
                             'It then follows up and asks it can create a set '
                             'of assertion states to test the function.',
              'difficulty': 2,
              'name': 'Fibonacci',
              'source': 'Anaconda',
              'tags': ['python'],
              'uuid': 'F392362B-BB18-425B-84F3-385D7B39A0EB'},
 'prompts': [{'ideal_response': "Here's a Python function named `fib` that "
                                'calculates and returns a list of the first '
                                '`n` integers in the Fibonacci sequence. The '
                                'function includes type hints and docstrings '
                                'for clarity.\n'
                                '\n'
                                '```python\n'
                                'def fib(n: int) 

In [10]:
def fib(n: int) -> list[int]:
    """
    Return the first `n` numbers of the Fibonacci sequence, starting from 0.

    Args:
        n (int): How many elements of the sequence to produce. Zero or a negative value yields
            an empty list.

    Returns:
        list[int]: The first `n` elements of the sequence.

    Examples:

    >>> fib(5)
    [0, 1, 1, 2, 3]
    >>> fib(10)
    [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
    """
    # Guard clauses for the inputs that are too short to need the loop.
    if n <= 0:
        return []
    if n == 1:
        return [0]

    # Track the last two values explicitly instead of indexing back into the list.
    previous, current = 0, 1
    sequence = [previous, current]
    for _ in range(2, n):
        previous, current = current, previous + current
        sequence.append(current)
    return sequence

In [9]:
# Typical sizes: the exact leading elements of the sequence are expected.
assert fib(5) == [0, 1, 1, 2, 3], "Test failed for n = 5"
assert fib(10) == [0, 1, 1, 2, 3, 5, 8, 13, 21, 34], "Test failed for n = 10"

# Boundary sizes: nothing requested, a single element, and just the two seed values.
assert fib(0) == [], "Test failed for n = 0 (no elements)"
assert fib(1) == [0], "Test failed for n = 1 (single element)"
assert fib(2) == [0, 1], "Test failed for n = 2 (two elements)"

# Negative sizes are treated like zero and give an empty list.
assert fib(-1) == [], "Test failed for n = -1 (negative input)"
assert fib(-10) == [], "Test failed for n = -10 (negative input)"

# Large size: only the length is checked here, not the individual values.
assert len(fib(100)) == 100, "Test failed for n = 100 (large input)"

# A non-integer argument must be rejected with TypeError. The failing assertion sits in the
# `else` branch, so it only fires when the call returned normally.
try:
    fib("5")
except TypeError:
    pass
else:
    assert False, "Test failed for non-integer input (should raise TypeError)"

In [14]:
from abc import ABC, abstractmethod
from enum import Enum, auto
import time
from typing import Callable
from pydantic import BaseModel

class TestType(Enum):
    """
    Names the kinds of tests that can be attached to an eval.

    Members take their values from `auto()`.

    NOTE(review): no code visible in this notebook reads these members yet; the per-member
    meanings below are inferred from the names only and should be confirmed.
    """

    MATCH = auto()  # presumably: compare a response against an expected value
    PYTHON_CODE = auto()  # presumably: run Python test code against the response
    PYTHON_CODE_BLOCKS = auto()  # presumably: execute code blocks extracted from responses
    LLM = auto()  # presumably: have an LLM judge the response


class TestResult(BaseModel):
    """
    Outcome of running a single test against an eval's responses.

    Instances are what `EvalTest.__call__` is annotated to return and what `EvalResult` collects
    in `test_results`.
    """

    # The outcome value. The union ends in `object`, so the annotation is very permissive.
    result: bool | int | float | object
    # Human-readable description; required.
    description: str
    # Optional extra information about the run; defaults to an empty dict.
    metadata: dict = {}


class Prompt(BaseModel):
    """
    One prompt of an eval, optionally paired with an ideal response.

    `Eval.from_dict` builds one instance per entry of the config's `prompts` list, and
    `Eval.__call__` sends `prompt` to the LLM callable.
    """

    # Text sent to the LLM.
    prompt: str
    # Optional reference answer. The eval template describes it as used by `llm-similarity`
    # tests; nothing in this cell reads it yet.
    ideal_response: str | None = None


class EvalTest(ABC):
    """
    Abstract base class for a test that is run against the responses collected by an `Eval`.

    Subclasses must implement `__call__`. Because `__call__` is abstract, this class itself
    cannot be instantiated.
    """

    def __init__(self, eval_uuid: str) -> None:
        """Store the uuid of the eval this test belongs to."""
        super().__init__()
        self.eval_uuid = eval_uuid
        # Not read or updated anywhere in the code visible here.
        # NOTE(review): presumably meant to hold the most recent result - confirm or remove.
        self._result = None

    @abstractmethod
    def __call__(self, responses: list[str]) -> TestResult:
        """
        Run the test.

        Args:
            responses: The response strings to check. `Eval.__call__` passes one response per
                prompt, in prompt order.

        Returns:
            A TestResult describing the outcome.
        """


class EvalResult(BaseModel):
    """
    Record of one run of an `Eval` against one LLM.

    `Eval.__call__` builds an instance after all prompts have been sent and all tests have run.
    """

    # Identifier of the LLM that was evaluated (the `llm_id` passed to `Eval.__call__`).
    llm_id: str
    # uuid of the Eval that produced this result.
    eval_id: str
    # NOTE(review): `Eval.__call__` currently fills this with the eval's metadata dict; the name
    # suggests system/environment information was intended - confirm.
    system: dict
    # potential duplication of information, but i think we need it on this object
    # One response string per prompt, in prompt order.
    responses: list[str]
    # Seconds spent collecting the responses (tests are not included in the timing).
    total_time: float
    # Total number of characters across all responses.
    response_characters: int
    # response_characters divided by total_time.
    characters_per_second: float
    # this depends on a particular type of test, not sure i like that
    # `Eval.__call__` currently hard-codes both code-block counters to 0.
    num_code_blocks: int
    code_blocks_passed: int
    # One TestResult per test, in the order the tests are defined on the Eval.
    test_results: list[TestResult]

# TODO: register the different test types so that Eval.from_dict can build them from config.

class Eval:
    """
    An Eval defines a set of one or more prompts and tests that can be used to evaluate an LLM. If
    more than one prompt is provided, the intent is to evaluate the conversation and, therefore,
    it's expected that the underlying model/object will maintain state between prompts.

    The Eval object is evaluated by calling it with a single llm_id and a callable (wrapping the
    LLM) that takes a prompt (string) and returns a response (string).

    The tests are run after all the prompts have been evaluated. Each test is passed the list of
    responses (strings) and returns a TestResult object.
    """

    def __init__(
            self,
            uuid: str,
            metadata: dict,
            prompts: list[Prompt],
            tests: list[EvalTest],
            ):
        """
        Args:
            uuid: Unique identifier of this eval; copied into every EvalResult as `eval_id`.
            metadata: Free-form information about the eval.
            prompts: Prompts sent to the LLM, in order.
            tests: Tests run against the collected responses.
        """
        self.uuid = uuid
        self.metadata = metadata
        self.prompts = prompts
        self.tests = tests
        # Populated by __call__: the latest EvalResult and how long the LLM calls took (seconds).
        self.results = None
        self._duration = None

    @classmethod
    def from_dict(cls, config: dict) -> 'Eval':  # noqa: ANN102
        """
        Creates an Eval object from a config/dictionary.

        Args:
            config: Must contain `uuid`, `prompts` (a list of dicts accepted by `Prompt`) and
                `tests`; `metadata` is optional and defaults to an empty dict.

        Raises:
            AssertionError: if `uuid` is missing from `config`.
            KeyError: if `prompts` or `tests` is missing from `config`.
        """
        # NOTE(review): the example YAML files loaded earlier in this notebook keep `uuid` under
        # `metadata`, not at the top level - confirm which layout is intended.
        assert 'uuid' in config, "uuid is a required field when creating an Eval object"
        prompts = [Prompt(**prompt) for prompt in config['prompts']]
        # FIXME: EvalTest is abstract, so this raises TypeError for any non-empty `tests` list.
        # The concrete test classes still need to be registered and looked up by type here.
        tests = [EvalTest(**test) for test in config['tests']]
        return cls(
            uuid=config['uuid'],
            metadata=config.get('metadata', {}),
            prompts=prompts,
            tests=tests,
        )

    def __call__(self, llm_id: str, llm: Callable[[str], str]) -> EvalResult:
        """
        Evaluates the model against the prompts and tests.

        Args:
            llm_id: Identifier recorded on the result for the model being evaluated.
            llm: Callable that takes a prompt string and returns the response string.

        Returns:
            The EvalResult for this run (also stored on `self.results`).
        """
        # perf_counter is a monotonic, high-resolution clock; wall-clock time.time() can report a
        # zero (or even negative) interval for fast calls or when the system clock is adjusted.
        start = time.perf_counter()
        responses = [llm(p.prompt) for p in self.prompts]
        self._duration = time.perf_counter() - start

        # Every test sees the full list of responses, not just the last one.
        results = [test(responses) for test in self.tests]

        response_characters = sum(len(r) for r in responses)
        # Guard against a zero duration (e.g. no prompts, or a stubbed LLM) instead of raising
        # ZeroDivisionError; a rate of 0.0 is reported in that case.
        if self._duration > 0:
            characters_per_second = response_characters / self._duration
        else:
            characters_per_second = 0.0

        self.results = EvalResult(
            llm_id=llm_id,
            eval_id=self.uuid,
            # NOTE(review): `system` receives the eval metadata; confirm this is the intent.
            system=self.metadata,
            responses=responses,
            total_time=self._duration,
            response_characters=response_characters,
            characters_per_second=characters_per_second,
            # TODO: code-block statistics are not computed yet.
            num_code_blocks=0,
            code_blocks_passed=0,
            test_results=results,
        )
        return self.results


In [5]:
from pydantic import BaseModel, Field

class Metadata(BaseModel):
    """
    Descriptive information about an eval; every field is optional and defaults to None.

    The fields mirror keys of the `metadata` mapping in the example eval YAML files.
    NOTE(review): those files also carry a `uuid` key under `metadata`, which is not declared
    here - confirm whether that is intentional.
    """

    # Display name of the eval.
    name: str | None = None
    # Free-text description of what the eval covers.
    description: str | None = None
    # Integer difficulty rating (the example files use 1 and 2).
    difficulty: int | None = None
    # Labels for the eval, e.g. ['python'].
    tags: list[str] | None = None
    # Origin of the eval (the example files use 'Anaconda').
    source: str | None = None

In [None]:
from abc import ABC, abstractmethod
from typing import List, Callable, Dict
import functools

# TestResult class to encapsulate the result of each test
class TestResult:
    """Outcome of one test: a pass/fail flag together with a human-readable description."""

    def __init__(self, passed: bool, description: str):
        # Both values are stored as given; no validation or coercion is applied.
        self.description = description
        self.passed = passed

# Abstract base class for tests
class Test(ABC):
    """
    Abstract base class for the test implementations in this notebook.

    Subclasses must implement `run_test`; until they do, they cannot be instantiated.
    """

    @abstractmethod
    def run_test(self, response: str) -> List[TestResult]:
        """
        Check `response` and return the outcome(s).

        Args:
            response: The response text to test.

        Returns:
            A list of TestResult objects.
        """
        pass

# Registry for test types: maps the `type` string used in a test config to the class (or other
# callable) that builds the test.
test_registry = {}


def register_test(test_type: str):
    """
    Decorator factory that registers a test implementation under `test_type`.

    Registering the same `test_type` again replaces the earlier entry, so a cell can be re-run
    (or a placeholder class superseded) without raising.

    Args:
        test_type: Key under which the decorated object is stored in `test_registry`.

    Returns:
        A decorator that records the decorated object and returns it unchanged.
    """
    def decorator(test_func):
        test_registry[test_type] = test_func
        # Return the original object rather than a wrapper function: wrapping would rebind the
        # decorated class name to a plain function, which breaks isinstance(), issubclass() and
        # subclassing, while calling the object behaves the same either way.
        return test_func
    return decorator

# Specific test implementations
@register_test("match")
class MatchTest(Test):
    """Test that passes only when the response is exactly equal to the expected value."""

    def __init__(self, value: str):
        # The string the response is compared against.
        self.value = value

    def run_test(self, response: str) -> List[TestResult]:
        """Compare `response` with the expected value using `==`; return a one-item list."""
        is_match = response == self.value
        outcome = TestResult(is_match, "Match test")
        return [outcome]

@register_test("code_blocks")
class CodeBlockTest(Test):
    """
    Placeholder for the code-block test, registered under the "code_blocks" type.

    No methods are defined yet, so the abstract `run_test` inherited from `Test` is still
    unimplemented. A later cell registers `PythonCodeBlockTest` under the same "code_blocks" key,
    which replaces this entry in `test_registry`.
    """
    # Implementation for code block tests
    ...

@register_test("python")
class PythonFunctionTest(Test):
    """
    Placeholder for the Python-function test, registered under the "python" type.

    No methods are defined yet, so the abstract `run_test` inherited from `Test` is still
    unimplemented. A later cell defines a class with this same name and registers it under the
    same "python" key, replacing this placeholder.
    """
    # Implementation for python function tests
    ...

@register_test("llm-similarity")
class LLMSimilarityTest(Test):
    """
    Placeholder for the LLM-similarity test, registered under the "llm-similarity" type.

    No methods are defined yet, so the abstract `run_test` inherited from `Test` is still
    unimplemented. The eval template describes this test type as asking an LLM to rate how
    similar a response is to the prompt's `ideal_response`.
    """
    # Implementation for LLM similarity tests
    ...

# Test runner
def run_tests(tests_config, response: str):
    results = []
    for test_config in tests_config:
        test_type = test_config['type']
        test_class = test_registry.get(test_type)
        if test_class:
            test = test_class(**test_config)  # Assuming other necessary parameters are passed
            results.extend(test.run_test(response))
    return results

# Example usage: a test configuration in the shape it would have after being loaded from YAML.
# Each entry names a registered test via "type"; the other keys configure that test.
yaml_config = [
    {"type": "match", "value": "Expected response"},
    # Other test configurations
]

# NOTE(review): despite its name, this variable holds the plain response string given to the
# tests.
code_blocks = "Some response to test"
test_results = run_tests(yaml_config, code_blocks)


In [None]:
import contextlib
import io
import sys


class CodeBlocksTestResult:
    def __init__(
            self,
            code_blocks: list[list[str]],
            ran_successfully: list[bool],
            results: list[list[bool]],
            metadata: dict | None = None):
        self.passed = passed
        self.description = description


@register_test("code_blocks")
class PythonCodeBlockTest(Test):
    """
    Executes Python code taken from LLM responses, then runs optional check snippets against it.

    SECURITY: the code is run with `exec` inside the current process. LLM output is untrusted
    input, so this must only be used in a disposable/sandboxed environment.
    """

    def __init__(self, setup: str | None = None, checks: List[str] | None = None):
        """
        Args:
            setup: Optional code executed before the code blocks (e.g. imports).
            checks: Optional code snippets (typically `assert` statements) executed after the
                code blocks, in the same namespace.
        """
        self.setup = setup
        self.checks = checks or []

    def run_test(self, code_blocks: str | list[str] | list[list[str]]) -> List[TestResult]:
        """
        Run setup, the code blocks and the checks, in that order, in one fresh namespace.

        Args:
            code_blocks: Either a single string of code, a list of code strings, or a list with
                one item per response where each item is a list of code strings. All blocks are
                executed in order.

        Returns:
            One TestResult reporting whether everything executed without raising, followed by a
            second (passing) TestResult carrying the captured stdout when anything was printed.
        """
        # A single dict is used as the globals of every exec call. With separate globals and
        # locals, names bound by the executed code land in the locals dict, where functions
        # defined by that same code cannot see them (recursive calls, helper functions and
        # setup imports all fail with NameError). Using a fresh dict also keeps the executed
        # code away from this module's own globals.
        namespace = {}
        results = []

        # Redirect stdout to capture print statements
        stdout = io.StringIO()
        with contextlib.redirect_stdout(stdout):
            try:
                if self.setup:
                    exec(self.setup, namespace)

                for block in self._flatten_code_blocks(code_blocks):
                    exec(block, namespace)

                for check in self.checks:
                    exec(check, namespace)

                # If no errors, the code block test passes
                results.append(TestResult(True, "Code executed without errors"))
            except Exception as e:
                # Any error from setup, a code block or a check fails the test.
                results.append(TestResult(False, f"Error executing code: {e}"))

        # Include captured stdout (if any) as an additional result.
        captured_output = stdout.getvalue()
        if captured_output:
            results.append(TestResult(True, f"Captured stdout: {captured_output}"))

        return results

    @staticmethod
    def _flatten_code_blocks(code_blocks) -> list[str]:
        """Normalise a string, a list of strings or a list of lists into a flat list of blocks."""
        if isinstance(code_blocks, str):
            return [code_blocks]
        flat = []
        for item in code_blocks:
            if isinstance(item, str):
                flat.append(item)
            else:
                flat.extend(item)
        return flat

# Example usage: one "code_blocks" test whose setup imports `math` and whose single check
# asserts on `math.sqrt`.
yaml_config = [
    {"type": "code_blocks", "setup": "import math", "checks": ["assert math.sqrt(4) == 2"]},
]

# The "response" here is a single string of Python code that prints to stdout.
response = "print('Hello, world!')"
test_results = run_tests(yaml_config, response)


In [None]:
import importlib.util
import types


# TODO: this needs to be run in the same environment as the code block, so we need to pass in the local_env.


@register_test("python")
class PythonFunctionTest(Test):
    """
    Runs a user-supplied Python function named `test_function` against a response.

    The function comes either from inline source code (`function`) or from a Python file
    (`file`); inline code takes precedence when both are given.

    SECURITY: both paths execute arbitrary code in the current process; only use them with
    trusted test definitions.
    """

    def __init__(self, file: str | None = None, function: str | None = None):
        """
        Args:
            file: Path to a Python file that defines `test_function`.
            function: Source code that defines `test_function`.
        """
        self.file = file
        self.function_code = function

    def run_test(self, response: str) -> List[TestResult]:
        """
        Call the loaded `test_function` with `response` and return whatever it returns.

        If no function could be loaded, a single failing TestResult is returned instead.
        """
        test_function = self._load_function()
        if test_function:
            return test_function(response)
        else:
            return [TestResult(False, "Failed to load test function")]

    def _load_function(self) -> Callable | None:
        """Return the `test_function` callable, or None when it cannot be found."""
        if self.function_code:
            # Execute the inline code in an explicit namespace. Relying on exec() writing into
            # the function's locals() and reading the name back from a second locals() call is
            # CPython-specific and no longer works on Python 3.13+.
            # The namespace starts as a copy of this module's globals so the inline code can
            # still reference names such as TestResult without polluting the module itself.
            namespace = dict(globals())
            # Drop any pre-existing name so only a function defined by the inline code counts.
            namespace.pop('test_function', None)
            exec(self.function_code, namespace)
            return namespace.get('test_function')
        elif self.file:
            # Dynamically import function from a file
            spec = importlib.util.spec_from_file_location("module.name", self.file)
            if spec is None or spec.loader is None:
                # No loader is available for this path (e.g. unrecognised file extension).
                return None
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            return getattr(module, 'test_function', None)
        else:
            return None

# Example usage: a "python" test whose inline source defines `test_function`; it returns one
# TestResult that is true when the response equals 'expected response'.
yaml_config = [
    {"type": "python", "function": "def test_function(response): return [TestResult(response == 'expected response', 'Python function test')]"},
]

# NOTE(review): despite its name, this variable holds the plain response string given to the
# test.
code_blocks = "expected response"
test_results = run_tests(yaml_config, code_blocks)
