In [1]:
import yaml
from pprint import pprint

# Load the eval template definition from YAML.
with open('eval_template.yaml') as f:
    eval_template = yaml.safe_load(f)

# Load the example eval result file.
with open('eval_result_example.yaml') as f:
    eval_result_example = yaml.safe_load(f)

# Print both parsed structures, separated by a marker line, to inspect their layout.
pprint(eval_template)
print('---')
pprint(eval_result_example)

{'metadata': {'description': 'This is an example of an eval. An eval contains '
                             'a set of prompts and tests.',
              'difficulty': 1,
              'name': 'Eval Example',
              'source': 'Anaconda',
              'tags': ['example', 'graphing', 'plotly-express'],
              'uuid': '5f8b1b4e-3b7e-4b0e-8b1b-4e3b7e4b0e8b'},
 'prompts': [{'ideal_response': 'This is the ideal response (to the first '
                                'prompt).\n'
                                'This field is optional.\n'
                                'It is only used if a test below is of type '
                                '`llm-similarity`, in which case it is used as '
                                'the ideal response and the LLM is asked to '
                                'evaluate the similarity of the response to '
                                'the ideal answer.\n'
                                'This value could also be used to fine-tune a

In [2]:
# Load the Fibonacci eval definition and print its parsed structure.
with open('eval_fibonacci.yaml') as f:
    eval_fibonacci_config = yaml.safe_load(f)
pprint(eval_fibonacci_config)

{'metadata': {'attribution': 'OpenAI ChatGPT-4 was used to create the initial '
                             'ideal_response values, and were slightly '
                             'modified.',
              'author': 'Anaconda',
              'description': 'Create a function that returns the nth fibonacci '
                             'number. Then, create a set of assertion states '
                             'to test the function.',
              'difficulty': 2,
              'name': 'Fibonacci',
              'tags': ['python']},
 'prompts': [{'ideal_response': "Here's a Python function named `fib` that "
                                'calculates and returns a list of the first '
                                '`n` integers in the Fibonacci sequence. The '
                                'function includes type hints and docstrings '
                                'for clarity.\n'
                                '\n'
                                '```python\n'
         

In [3]:
def fib(n: int) -> list[int]:
    """
    Build the leading portion of the Fibonacci sequence.

    Args:
        n (int): How many Fibonacci numbers to produce.

    Returns:
        list[int]: The first ``n`` Fibonacci numbers, starting at 0. The list is empty
        when ``n`` is zero or negative.

    Examples:

    >>> fib(5)
    [0, 1, 1, 2, 3]
    >>> fib(10)
    [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
    """
    # Guard clauses for the inputs that need no iteration.
    if n <= 0:
        return []
    if n == 1:
        return [0]

    previous, current = 0, 1
    sequence = [previous, current]
    # The first two values are already present, so only n - 2 more are needed.
    for _ in range(n - 2):
        previous, current = current, previous + current
        sequence.append(current)
    return sequence

In [4]:
# Test for typical input values
assert fib(5) == [0, 1, 1, 2, 3], "Test failed for n = 5"
assert fib(10) == [0, 1, 1, 2, 3, 5, 8, 13, 21, 34], "Test failed for n = 10"

# Test for edge cases
assert fib(0) == [], "Test failed for n = 0 (no elements)"
assert fib(1) == [0], "Test failed for n = 1 (single element)"
assert fib(2) == [0, 1], "Test failed for n = 2 (two elements)"

# Test for negative input values
assert fib(-1) == [], "Test failed for n = -1 (negative input)"
assert fib(-10) == [], "Test failed for n = -10 (negative input)"

# Test for large input value
assert len(fib(100)) == 100, "Test failed for n = 100 (large input)"

# Test for non-integer input (should raise a TypeError)
try:
    fib("5")
except TypeError:
    pass
else:
    # Raised explicitly instead of `assert False`: assert statements are removed when Python
    # runs with -O, which would let a missing TypeError go unnoticed.
    raise AssertionError("Test failed for non-integer input (should raise TypeError)")

You're correct to be concerned. The code I provided in the previous example does not inherently prevent the imported module from affecting the current environment. When you import a module using importlib.import_module, it behaves just like a regular import statement. This means that if the module has any code at the top level (outside of function or class definitions), that code will be executed upon import, potentially affecting the global state or the current environment.

To truly isolate the execution of an imported module, you would need a more robust solution, such as running the code in a separate process or using a sandboxing technique. Python's standard library doesn't provide a built-in way to completely sandbox a module, but you can use multiprocessing to achieve a similar effect. Here's an example using multiprocessing:

In [None]:
import multiprocessing
import importlib

def module_function_executor(module_name, function_name, args=(), kwargs=None):
    """
    Import a module by name and call one of its attributes.

    Args:
        module_name: Dotted module name, as accepted by ``importlib.import_module``.
        function_name: Name of the callable to look up on the imported module.
        args: Positional arguments for the call.
        kwargs: Keyword arguments for the call; ``None`` (the default) means no keyword
            arguments.

    Returns:
        Whatever the looked-up callable returns.

    Raises:
        ImportError: If the module cannot be imported.
        AttributeError: If the module has no attribute named ``function_name``.
    """
    module = importlib.import_module(module_name)
    func = getattr(module, function_name)
    # `None` replaces the former mutable `{}` default; treat it as "no keyword arguments".
    return func(*args, **(kwargs or {}))

def _run_and_report(connection, module_name, function_name, args, kwargs):
    """Child-process entry point: run the target and send ``(succeeded, payload)`` to the parent."""
    try:
        try:
            outcome = (True, module_function_executor(module_name, function_name, args, kwargs))
        except Exception as error:
            outcome = (False, error)
        try:
            connection.send(outcome)
        except Exception as error:
            # The result (or the exception) could not be pickled; report that instead of
            # exiting without a message.
            connection.send((False, RuntimeError(f"Could not send result to parent: {error!r}")))
    finally:
        connection.close()


def execute_in_process(module_name, function_name, args=(), kwargs=None):
    """
    Call ``module_name.function_name(*args, **kwargs)`` in a separate process.

    The module is imported inside the child process, so import-time side effects do not
    touch the calling interpreter. The result (or the raised exception) is pickled and
    sent back over a pipe.

    Args:
        module_name: Dotted name of the module to import in the child.
        function_name: Name of the callable inside that module.
        args: Positional arguments for the call (must be picklable).
        kwargs: Keyword arguments for the call (must be picklable); ``None`` means none.

    Returns:
        The value returned by the target function.

    Raises:
        Exception: Whatever exception the target raised in the child is re-raised here.
        RuntimeError: If the child exits without reporting a result (for example it was
            killed or called ``sys.exit``), or if the result could not be pickled.

    Note:
        The child entry point is a module-level function because a nested function cannot be
        pickled by the ``spawn``/``forkserver`` start methods. When this code lives in a
        notebook's ``__main__``, those start methods still cannot import the entry point in
        the child; in that case this function raises ``RuntimeError`` instead of hanging.
    """
    call_kwargs = dict(kwargs) if kwargs else {}
    receiver, sender = multiprocessing.Pipe(duplex=False)
    process = multiprocessing.Process(
        target=_run_and_report,
        args=(sender, module_name, function_name, args, call_kwargs),
    )
    process.start()
    # Close the parent's copy of the write end so recv() sees EOF if the child dies early.
    sender.close()

    try:
        # Receive BEFORE joining: a child blocked on sending a large result would never
        # exit, so joining first could deadlock.
        outcome = receiver.recv()
    except EOFError:
        outcome = None
    finally:
        receiver.close()
        process.join()

    if outcome is None:
        raise RuntimeError(
            f"Child process exited with code {process.exitcode} without returning a result.",
        )
    succeeded, payload = outcome
    if not succeeded:
        raise payload
    return payload

# Example usage
# NOTE(review): `arg1`, `kwarg_value`, and the module `my_module` are placeholders that are not
# defined anywhere in the visible notebook, so this cell raises NameError if run as written.
module_name = 'my_module'  # The name of the Python file without '.py'
function_name = 'my_function'  # The name of the function in the module
result = execute_in_process(module_name, function_name, (arg1,), {'kwarg_name': kwarg_value})
print(result)


In [12]:
import re

# Source text of a function whose name is not known ahead of time.
original_function_str = """
def some_unknown_function(arg1, arg2):
    return arg1 + arg2
"""

# Rewrite every `def <name>` header to a fixed name so the function can be looked up later.
generic_function_name = "____test_function____"
modified_function_str = re.sub(
    pattern=r'def \w+',
    repl='def ' + generic_function_name,
    string=original_function_str,
)
print(modified_function_str)

# Define the renamed function in the current namespace, then fetch it by name and call it.
exec(modified_function_str)
result = locals()[generic_function_name](10, 20)
print(result)



def ____test_function____(arg1, arg2):
    return arg1 + arg2

30


In [95]:
from abc import ABC, abstractmethod
from enum import Enum, auto
import time
from typing import Callable
from pydantic import BaseModel


class TestType(Enum):
    """
    Kinds of test that can be attached to an Eval.

    Members are used as keys when test classes are registered and when a test is created
    from a config entry whose ``type`` names one of the members.
    """

    MATCH = auto()
    PYTHON_FUNCTION = auto()
    PYTHON_CODE_BLOCKS = auto()

    @staticmethod
    def to_enum(name: 'str | TestType') -> 'TestType':
        """
        Get a TestType from its name.

        Args:
            name: A member name (matched case-insensitively), or a TestType, which is
                returned unchanged.

        Returns:
            The matching TestType member.

        Raises:
            ValueError: If ``name`` does not correspond to any member.
        """
        if isinstance(name, TestType):
            return name
        try:
            return TestType[name.upper()]
        except KeyError:
            # `from None` drops the internal KeyError from the traceback; the ValueError
            # message already names the offending value.
            raise ValueError(f"{name.upper()} is not a valid name for a TestType member") from None


class TestResult(BaseModel):
    """Outcome of a single check produced by an EvalTest."""

    # Outcome value; MatchTest stores True/False, or None when no expected value was given.
    result: bool | int | float | object
    # Text describing the check (MatchTest currently passes the placeholder "TODO").
    description: str
    # Additional details about the check; no default is declared here, and MatchTest passes {}.
    metadata: dict | None


class EvalTest(ABC):
    """
    An EvalTest corresponds to a single test defined in an Eval (an Eval can have multiple tests).
    The EvalTest is responsible for evaluating the responses to the prompts.
    """

    # TODO: not sure if i need eval_uuid since it's in the EvalResult object
    def __init__(self, eval_uuid: str, metadata: dict | None = None) -> None:
        super().__init__()
        self.eval_uuid = eval_uuid
        self.metadata = metadata or {}
        self.result = None

    @abstractmethod
    def __call__(self, responses: list[str]) -> None:
        """TODO document."""


class TestRegistry:
    """Registry that maps a test type to the EvalTest class implementing it."""

    def __init__(self):
        self._registry: 'dict[TestType, type[EvalTest]]' = {}

    def register(self, name: 'TestType', cls: 'type[EvalTest]') -> None:
        """
        Register a test class under a key.

        Args:
            name: Key to register under (the `register_test` decorator passes a TestType).
            cls: The class to instantiate when a test of this type is created.

        Raises:
            ValueError: If the key is already registered.
        """
        if name in self._registry:
            raise ValueError(f"A test with name '{name}' is already registered.")
        self._registry[name] = cls

    def create_test(self, test_type: 'TestType', params: dict) -> 'EvalTest':
        """
        Instantiate the class registered for ``test_type``.

        Args:
            test_type: Key the class was registered under.
            params: Keyword arguments passed to the class constructor.

        Raises:
            ValueError: If nothing is registered under ``test_type``.
        """
        if test_type not in self._registry:
            raise ValueError(f"TestType '{test_type}' not found in registry.")
        return self._registry[test_type](**params)

    def __contains__(self, value: 'TestType') -> bool:
        """Check whether a key is registered."""
        return value in self._registry


def register_test(test_type: 'TestType') -> 'Callable[[type[EvalTest]], type[EvalTest]]':
    """
    Decorator to register an EvalTest.

    Args:
        test_type: The key under which the decorated class is added to TEST_REGISTRY.

    Returns:
        A class decorator that registers the class and returns it unchanged.
    """
    def decorator(cls: 'type[EvalTest]') -> 'type[EvalTest]':
        # The check is against EvalTest, so the message names EvalTest as the required base.
        assert issubclass(cls, EvalTest), \
            f"Test '{test_type}' ({cls.__name__}) must extend EvalTest"
        assert (test_type not in TEST_REGISTRY), \
            f"Test '{test_type}' already registered."
        TEST_REGISTRY.register(test_type, cls)
        return cls
    return decorator


# Module-level registry instance that the `register_test` decorator adds test classes to.
TEST_REGISTRY = TestRegistry()


@register_test(TestType.MATCH)
class MatchTest(EvalTest):
    """
    Checks each response for exact equality with an expected value.

    ``values`` is aligned with the responses by position. A ``None`` entry means the
    corresponding response is not checked.
    """

    def __init__(self,
            eval_uuid: str,
            values: list[str | None],
            metadata: dict | None = None) -> None:
        """
        Args:
            eval_uuid: UUID of the Eval this test belongs to.
            values: Expected value per response; ``None`` skips the comparison for that
                position.
            metadata: Optional free-form information about the test.
        """
        super().__init__(eval_uuid=eval_uuid, metadata=metadata)
        self.values = values

    def __call__(self, responses: list[str]) -> list[TestResult]:
        """
        Compare every response with its expected value.

        Args:
            responses: The LLM responses; must have the same length as ``values``.

        Returns:
            One TestResult per response: ``result`` is True/False for a comparison, or None
            when the expected value is None. The same list is stored on ``self.results``.
        """
        assert len(responses) == len(self.values), \
            f"Number of responses ({len(responses)}) does not equal number of match values " \
            f"({len(self.values)})"
        self.results = [
            TestResult(
                result=None if expected is None else response == expected,
                description="TODO",
                metadata={},
            )
            for response, expected in zip(responses, self.values)
        ]
        # Returned as well as stored: Eval.__call__ collects each test's return value.
        return self.results


@register_test(TestType.PYTHON_FUNCTION)
class PythonFunctionTest(EvalTest):
    """
    Runs a Python function using the LLM responses as input. The function is supplied either
    as source text, or as the name of a function together with the path of the file that
    defines it. Such a test could be anything from a simple regex check to using an LLM to
    evaluate the responses.
    """

    def __init__(self,
            eval_uuid: str,
            function: str | None = None,
            function_name: str | None = None,
            function_file: str | None = None,
            metadata: dict | None = None) -> None:
        """
        Args:
            eval_uuid: UUID of the Eval this test belongs to.
            function: Source text of the function.
            function_name: Name of the function inside ``function_file``.
            function_file: Path of the file that defines ``function_name``.
            metadata: Optional free-form information about the test.
        """
        super().__init__(eval_uuid=eval_uuid, metadata=metadata)
        has_inline_source = function is not None
        has_file_reference = function_name is not None and function_file is not None
        assert has_inline_source or has_file_reference, \
            "Either function or function_name and function_file must be provided."  # noqa: PT018
        self._function_str = function
        self._function_name = function_name
        self._function_file = function_file

    def __call__(self, responses: list[str]) -> None:
        """Placeholder: hands the responses back unchanged; no function is executed yet."""
        # TODO: A slightly different requirement is that I have a python file and the name of a
        # function in that file. I need to dynamically import everything in that file and
        # execute the provided function, while passing in arguments. I don't want anything
        # imported to affect the environment that is running it.
        return responses


@register_test(TestType.PYTHON_CODE_BLOCKS)
class PythonCodeBlocksTest(EvalTest):
    """
    This class is responsible for executing Python code blocks returned by the LLM and then
    running the python function(s) defined in the test in the same environment as code blocks.
    For example, if the code blocks define a pandas DataFrame, the function could be used to
    check that the shape or data of the DataFrame matches expectations.

    Unlike PythonFunctionTest, which tests the (string) responses returned by the LLM, this
    class tests the code blocks contained in those responses.
    """  # noqa: D404

    def __init__(self,
            eval_uuid: str,
            code_setup: str | None = None,
            checks: list[dict] | None = None,
            metadata: dict | None = None) -> None:
        """
        Args:
            eval_uuid: UUID of the Eval this test belongs to.
            code_setup: Optional code to run before the code blocks.
            checks: Optional check definitions to run after the code blocks.
            metadata: Optional free-form information about the test.
        """
        super().__init__(eval_uuid=eval_uuid, metadata=metadata)
        # TODO: the `function` / `function_name` / `function_file` options (with the same
        # "either function or function_name and function_file" rule as PythonFunctionTest)
        # are not accepted here yet.
        self._code_setup = code_setup
        self._checks = checks

    def __call__(self, responses: list[str]) -> None:
        """
        Not implemented yet; currently does nothing and returns None.

        Planned steps:
            1. extract code blocks
            2. run code setup if provided
            3. run code blocks
            4. run function in same environment as code blocks
        """


class Prompt(BaseModel):
    """A single prompt of an Eval, with an optional reference answer."""

    # Text passed to the LLM callable (Eval.__call__ calls `llm(p.prompt)`).
    prompt: str
    # Optional reference answer for the prompt; defaults to None.
    ideal_response: str | None = None


class EvalResult(BaseModel):
    """
    An EvalResult is the result of evaluating a specific LLM against a specific Eval, potentially
    using specific hardware. The hardware is not applicable for services like OpenAI's API, but
    would be applicable for running locally or against specific/configurable hardware like Hugging
    Face Endpoints or a custom server.
    """

    # Identifier of the LLM that produced the responses (the `llm_id` passed to Eval.__call__).
    llm_id: str
    # UUID of the Eval that was run (Eval.__call__ passes `self.uuid`).
    eval_id: str
    # NOTE(review): Eval.__call__ currently passes the Eval's metadata here; presumably this is
    # meant for hardware/system information — confirm.
    system: dict
    # potential duplication of information, but i think we need it on this object
    responses: list[str]
    # Elapsed time spent collecting the responses, as measured in Eval.__call__.
    total_time: float
    # Total number of characters across all responses.
    response_characters: int
    # Response characters per unit of total_time.
    characters_per_second: float
    # this depends on a particular type of test, not sure i like that
    num_code_blocks: int
    code_blocks_passed: int
    # Values returned by each test's __call__, in test order.
    test_results: list[object]

# need to Register the different types of Tests

class Eval:
    """
    An Eval defines a set of one or more prompts and tests that can be used to evaluate an LLM. If
    more than one prompt is provided, the intent is evaluate the the conversation and, therefore,
    it's expected that the underlying model/object will maintain state between prompts.

    The Eval object is evaluated by calling it with a single model_id and a callable (wrapping the
    LLM) that takes a prompt (string) and returns a response (string).

    An Eval corresponds to a set of prompts, while the result of the Eval corresponds to the Eval
    and a specific LLM, and potentially specific to the hardware used to run the LLM.

    The tests are ran after all the prompts have been evaluated. Each test is passed a list of
    responses (strings) and returns a TestResult object.
    """

    def __init__(
            self,
            uuid: str,
            metadata: dict,
            prompts: list[Prompt],
            tests: list[EvalTest],
            ):
        self.uuid = uuid
        self.metadata = metadata
        self.prompts = prompts
        self.tests = tests
        self.results = None

    @classmethod
    def from_dict(cls, config: dict, results: dict | None = None) -> 'Eval':  # noqa: ANN102
        """Creates an Eval object from a config/dictionary."""
        assert 'uuid' in config, "uuid is a required field when creating an Eval object"
        prompts = [Prompt(**prompt) for prompt in config['prompts']]
        # need to register the different types of tests
        # tests = [EvalTest(**test) for test in config['tests']]
        tests = []
        for test in config['tests']:
            test['eval_uuid'] = config['uuid']
            tests.append(TEST_REGISTRY.create_test(
                test_type=TestType.to_enum(test.pop('type')),
                params=test,
            ))
        # tests = [
        #     TEST_REGISTRY.create_test(test_type=TestType.to_enum(t.pop('type')), params=t)
        #     for t in config['tests']
        # ]
        obj = cls(
            uuid=config['uuid'],
            metadata=config['metadata'] if 'metadata' in config else {},
            prompts=prompts,
            tests=tests,
        )
        if results is not None:
            obj.results = EvalResult(**results)
        return obj


    def __call__(self, llm_id: str, llm: Callable[[str], str]) -> dict:
        """Evaluates the model against the prompts and tests."""
        start = time.time()
        responses = [llm(p.prompt) for p in self.prompts]
        end = time.time()
        self._duration = end - start

        # TODO
        results = [test(responses) for test in self.tests]

        self.results = EvalResult(
            llm_id=llm_id,
            eval_id=self.uuid,
            system=self.metadata,
            responses=responses,
            total_time=self._duration,
            response_characters=sum([len(r) for r in responses]),
            characters_per_second=sum([len(r) for r in responses]) / self._duration,
            num_code_blocks=0,
            code_blocks_passed=0,
            test_results=results,
        )

    def __str__(self) -> str:
        """Returns a string representation of the Eval."""
        from textwrap import dedent
        prompts = ',\n                '.join([str(p) for p in self.prompts])
        metadata = '' if not self.metadata else f'\n            metadata={self.metadata},'
        return dedent(f"""
        Eval(
            uuid={self.uuid},{metadata}
            prompts=[
                {prompts}
            ],
            tests=[{', '.join([str(type(t)) for t in self.tests])}]
        )
        """).strip()

In [96]:
# Reload the Fibonacci eval config, build an Eval from it, and run it against an identity
# "LLM" (the lambda returns each prompt unchanged).
with open('eval_fibonacci.yaml') as f:
    eval_fibonacci_config = yaml.safe_load(f)

eval_fib = Eval.from_dict(eval_fibonacci_config)
eval_fib(llm_id='test', llm=lambda x: x)

In [97]:
# Uses Eval.__str__ to show the uuid, metadata, prompts, and test classes.
print(eval_fib)

Eval(
    uuid=F392362B-BB18-425B-84F3-385D7B39A0EB,
    metadata={'name': 'Fibonacci', 'description': 'Create a function that returns the nth fibonacci number. Then, create a set of assertion states to test the function.', 'author': 'Anaconda', 'attribution': 'OpenAI ChatGPT-4 was used to create the initial ideal_response values, and were slightly modified.', 'difficulty': 2, 'tags': ['python']},
    prompts=[
        prompt='Create a python function named `fib` that calculates and returns a list of the first `n` integers in the fibonacci sequence. Use type hints and docstrings.' ideal_response='Here\'s a Python function named `fib` that calculates and returns a list of the first `n` integers in the Fibonacci sequence. The function includes type hints and docstrings for clarity.\n\n```python\ndef fib(n: int) -> list[int]:\n  """\n  Calculate the first n integers in the Fibonacci sequence.\n\n  Args:\n      n (int): The number of elements in the Fibonacci sequence to generate.\n\n  Exa

In [8]:
# Inspect the EvalResult that calling the Eval stored on `eval_fib.results`.
pprint(eval_fib.results)

EvalResult(llm_id='test', eval_id='F392362B-BB18-425B-84F3-385D7B39A0EB', system={'name': 'Fibonacci', 'description': 'Create a function that returns the nth fibonacci number. Then, create a set of assertion states to test the function.', 'author': 'Anaconda', 'attribution': 'OpenAI ChatGPT-4 was used to create the initial ideal_response values, and were slightly modified.', 'difficulty': 2, 'tags': ['python']}, responses=['Create a python function named `fib` that calculates and returns a list of the first `n` integers in the fibonacci sequence. Use type hints and docstrings.', 'Create a set of assertion statements to test the function including all edge-cases.'], total_time=3.0994415283203125e-06, response_characters=238, characters_per_second=76788027.07692307, num_code_blocks=0, code_blocks_passed=0, test_results=[])


In [104]:
# Eval defines __str__ only, so this shows the default object repr.
repr(eval_fib)

'<__main__.Eval object at 0xffff8723e110>'

In [5]:
from pydantic import BaseModel, Field

class Metadata(BaseModel):
    """Optional descriptive fields for an eval; every field defaults to None."""

    # These keys also appear under `metadata` in the eval template printed at the top of the
    # notebook (which additionally contains a `uuid` key that is not declared here).
    name: str | None = None
    description: str | None = None
    difficulty: int | None = None
    tags: list[str] | None = None
    source: str | None = None

In [None]:
from abc import ABC, abstractmethod
from typing import List, Callable, Dict
import functools

# TestResult class to encapsulate the result of each test
class TestResult:
    """Outcome of one test: a pass/fail flag and a short description."""

    def __init__(self, passed: bool, description: str):
        self.passed, self.description = passed, description

# Abstract base class for tests
class Test(ABC):
    """Interface that every test implementation must provide."""

    @abstractmethod
    def run_test(self, response: str) -> List[TestResult]:
        """Evaluate ``response`` and return the resulting TestResult objects."""

# Registry for test types
test_registry = {}


# Decorator to register test classes (or functions)
def register_test(test_type: str):
    """
    Create a decorator that records the decorated object in ``test_registry``.

    Args:
        test_type: Key under which the decorated object is stored. A later registration
            under the same key replaces the earlier one.

    Returns:
        A decorator that registers its argument and returns it unchanged.
    """
    def decorator(test_func):
        test_registry[test_type] = test_func
        # Return the original object instead of a function wrapper: wrapping would turn a
        # decorated class into a plain function, breaking isinstance checks and subclassing.
        return test_func
    return decorator

# Specific test implementations
@register_test("match")
class MatchTest(Test):
    """Passes when the response is exactly equal to the configured value."""

    def __init__(self, value: str):
        self.value = value

    def run_test(self, response: str) -> List[TestResult]:
        """Compare ``response`` with the expected value and return a single TestResult."""
        is_match = response == self.value
        return [TestResult(is_match, "Match test")]

@register_test("code_blocks")
class CodeBlockTest(Test):
    """Placeholder for code block tests, registered under "code_blocks"."""
    # Implementation for code block tests
    # Not implemented yet: `run_test` is not overridden here.
    ...

@register_test("python")
class PythonFunctionTest(Test):
    """Placeholder for python function tests, registered under "python"."""
    # Implementation for python function tests
    # Not implemented yet: `run_test` is not overridden here.
    ...

@register_test("llm-similarity")
class LLMSimilarityTest(Test):
    """Placeholder for LLM similarity tests, registered under "llm-similarity"."""
    # Implementation for LLM similarity tests
    # Not implemented yet: `run_test` is not overridden here.
    ...

# Test runner
def run_tests(tests_config, response: str):
    """
    Run every configured test against a response and collect the results.

    Args:
        tests_config: Iterable of dicts. Each dict needs a ``type`` key naming an entry in
            ``test_registry``; its remaining keys are passed to that test's constructor as
            keyword arguments. The dicts are not modified.
        response: The response handed to each test's ``run_test``.

    Returns:
        A flat list with the results of all tests, in configuration order. Entries whose
        type is not registered are skipped.
    """
    results = []
    for test_config in tests_config:
        test_class = test_registry.get(test_config['type'])
        if not test_class:
            # Unregistered types are skipped rather than treated as an error.
            continue
        # 'type' only selects the class; passing it on would raise TypeError in constructors
        # that do not declare a `type` parameter.
        params = {key: value for key, value in test_config.items() if key != 'type'}
        test = test_class(**params)
        results.extend(test.run_test(response))
    return results

# Example usage
# Each entry selects a registered test via "type"; the entry's keys are handed to that test's
# constructor as keyword arguments by run_tests.
yaml_config = [
    {"type": "match", "value": "Expected response"},
    # Other test configurations
]

# NOTE(review): despite its name, `code_blocks` holds the response string passed to run_tests.
code_blocks = "Some response to test"
test_results = run_tests(yaml_config, code_blocks)


In [None]:
import contextlib
import io
import sys


class CodeBlocksTestResult:
    def __init__(
            self,
            code_blocks: list[list[str]],
            ran_successfully: list[bool],
            results: list[list[bool]],
            metadata: dict | None = None):
        self.passed = passed
        self.description = description


@register_test("code_blocks")
class PythonCodeBlockTest(Test):
    """
    Executes optional setup code, the code to test, and optional check statements in one
    shared namespace, and reports whether everything ran without raising.

    WARNING: the code is run with ``exec`` in the current process. Code that comes from an
    LLM response is untrusted; run this only in an environment where that is acceptable.
    """

    def __init__(self, setup: str | None = None, checks: List[str] = None):
        """
        Args:
            setup: Optional code executed before the code blocks.
            checks: Optional statements (for example assertions) executed afterwards.
        """
        self.setup = setup
        self.checks = checks or []

    def run_test(self, code_blocks: list[list[str]]) -> List[TestResult]:
        """
        Execute the code and return the outcome.

        Args:
            code_blocks: Either a single code string, a list of code strings, or a list of
                lists of code strings (one inner list per response). Blocks are executed in
                order.

        Returns:
            A list with one pass/fail TestResult for the execution, followed by a TestResult
            carrying the captured stdout when anything was printed.
        """
        # One dict serves as both globals and locals. With separate dicts, functions defined
        # by the executed code could not see names (such as imports made by `setup`) that the
        # same code defined at its top level. Copying globals() keeps the names of this
        # module visible without letting the executed code modify them.
        env = dict(globals())
        results = []

        # Redirect stdout to capture print statements
        stdout = io.StringIO()
        with contextlib.redirect_stdout(stdout):
            try:
                # Flatten the accepted input shapes into a list of code strings. Done inside
                # the try so malformed input is reported as a failed result.
                if isinstance(code_blocks, str):
                    snippets = [code_blocks]
                else:
                    snippets = []
                    for item in code_blocks:
                        if isinstance(item, str):
                            snippets.append(item)
                        else:
                            snippets.extend(item)

                # Execute setup code if present
                if self.setup:
                    exec(self.setup, env)

                # Execute the code blocks (responses)
                for snippet in snippets:
                    exec(snippet, env)

                # Execute checks if present
                for check in self.checks:
                    exec(check, env)

                # If no errors, the code block test passes
                results.append(TestResult(True, "Code executed without errors"))
            except Exception as e:
                # If there's an error, the test fails
                results.append(TestResult(False, f"Error executing code: {e}"))

        # Optionally, include captured stdout in the test result
        captured_output = stdout.getvalue()
        if captured_output:
            results.append(TestResult(True, f"Captured stdout: {captured_output}"))

        return results

# Example usage
# A single "code_blocks" test: `setup` runs first, then the response code, then each check.
yaml_config = [
    {"type": "code_blocks", "setup": "import math", "checks": ["assert math.sqrt(4) == 2"]},
]

# The response here is itself a snippet of Python source to be executed by the test.
response = "print('Hello, world!')"
test_results = run_tests(yaml_config, response)


In [None]:
import importlib.util
import types


# TODO this needs to be ran in the same environment as the code block so we need to pass in the local_env


@register_test("python")
class PythonFunctionTest(Test):
    """
    Runs a user-supplied function named ``test_function`` against the response.

    The function comes either from inline source text or from a Python file. It receives the
    response and its return value is returned from ``run_test`` unchanged.

    WARNING: inline source is run with ``exec`` and files are imported, so only use trusted
    test definitions.
    """

    def __init__(self, file: str | None = None, function: str | None = None):
        """
        Args:
            file: Path of a Python file that defines ``test_function``.
            function: Source text that defines ``test_function``; takes precedence over
                ``file`` when both are given.
        """
        self.file = file
        self.function_code = function

    def run_test(self, response: str) -> List[TestResult]:
        """Load ``test_function`` and call it with ``response``; report a failure if it cannot be loaded."""
        test_function = self._load_function()
        if test_function:
            return test_function(response)
        else:
            return [TestResult(False, "Failed to load test function")]

    def _load_function(self) -> 'Callable | None':
        """Return the ``test_function`` callable, or None when it is not available."""
        if self.function_code:
            # Execute inline-defined function in an explicit namespace. Reading the result
            # back through locals() after a bare exec() is unreliable inside a function.
            # Copying globals() keeps names such as TestResult visible to the inline code.
            namespace = dict(globals())
            # Ignore any `test_function` that already exists at module level.
            namespace.pop('test_function', None)
            exec(self.function_code, namespace)
            return namespace.get('test_function')
        elif self.file:
            # Dynamically import function from a file
            spec = importlib.util.spec_from_file_location("module.name", self.file)
            if spec is None or spec.loader is None:
                # The path could not be turned into an importable module.
                return None
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            return getattr(module, 'test_function', None)
        else:
            return None

# Example usage
# A single "python" test whose inline source defines `test_function`, the name that
# PythonFunctionTest._load_function looks up after executing the source.
yaml_config = [
    {"type": "python", "function": "def test_function(response): return [TestResult(response == 'expected response', 'Python function test')]"},
]

# NOTE(review): despite its name, `code_blocks` holds the response string passed to run_tests.
code_blocks = "expected response"
test_results = run_tests(yaml_config, code_blocks)
