One of the main use-cases of the `llm-eval` framework is to allow users to define and run many Evals against many Candidates via YAML files. The Check and Candidate classes can be serialized and deserialized via `to_dict()`/`from_dict()`, and a registration system is used to instantiate the Check/Candidate objects when they are deserialized.

However, the framework also allows users to define Checks/Candidates through code using callable objects such as lambda functions. Users can also define tests where the prompt is any type of object (and not restricted to string/dict/numeric as it is when defining Evals in YAML files) and Candidates can return any type of response. This functionality allows users to easily define and run Evals in code (e.g. in unit tests).

In [5]:
from llm_eval.eval import PromptTest, Eval

def fake_candidate(prompt: dict) -> dict:
    """
    Stand-in candidate that fabricates a response instead of calling a model.

    Args:
        prompt: The prompt object handed to the candidate; it is interpolated into the
            reply text via its string representation.

    Returns:
        A dict with a single key, 'my_response', holding the canned reply text.
    """
    reply = f"This is a fake response for the prompt: '{prompt}'."
    return {'my_response': reply}

# Build a single PromptTest: one dict prompt plus the checks to run for that prompt.
test = PromptTest(
    prompt={'my_prompt': "This is a user's prompt."},
    checks=[
        # a DataRequest object is passed to all checks (Check or callable) from the Eval
        # NOTE(review): fake_candidate returns a dict, so if `data.response` is that dict then
        # `in` tests its keys (only 'my_response') and this check evaluates to False -- which
        # would explain the 1-of-2 successful checks in the output below. Confirm whether the
        # intent was `'fake response' in data.response['my_response']`.
        lambda data: 'fake response' in data.response,
        # presumably True here because the fake reply contains no code blocks -- verify
        lambda data: len(data.code_blocks) == 0,
    ],
)
# Wrap the test in an Eval, call the Eval with the candidate callable, and print the result.
eval_ = Eval(test)
result = eval_(fake_candidate)
print(result)

EvalResult:
    # of Prompts Tested:         1
    Total Response Time:         0.0 seconds
    # of Response Characters:    N/A
    Characters per Second:       N/A
    # of Checks:                 2
    # of Successful Checks:      1
    % of Successful Checks:      50.0%
    # of Code Blocks Generated:  0


In [11]:
from dataclasses import dataclass
from llm_eval.eval import PromptTest, Eval
from llm_eval.eval import EvalHarness, EvalResult
import nest_asyncio

# Applied once, up front, before anything in this cell is run.
# NOTE(review): presumably required because the harness below runs async code while the
# notebook already has a running event loop -- confirm.
nest_asyncio.apply()  # needed for running async in jupyter notebook

# we are not limited to strings for prompts/responses; we can use any object
@dataclass
class CustomRequest:
    """Example of a non-string prompt object: wraps the prompt text given to a candidate."""
    prompt: str  # prompt text; the fake candidates below quote it in their reply

@dataclass
class CustomResponse:
    """Example of a non-string response object: wraps the reply text a candidate produces."""
    llm_reply: str  # reply text; the checks below read it via `data.response.llm_reply`

# Candidates can be any callable that takes a single value and returns a single value. Candidates
# can delegate to other agents, APIs, etc.
def fake_candidate_1(request: CustomRequest) -> CustomResponse:
    """
    Stand-in candidate #1: fabricates a reply instead of calling a model.

    Args:
        request: The prompt object; only its `prompt` text is read.

    Returns:
        A `CustomResponse` whose `llm_reply` quotes the request's prompt text, prefixed with
        this candidate's label.
    """
    # The reply text spells "response" the same way the second fake candidate does, so a
    # check that looks for that word behaves identically for both candidates.
    return CustomResponse(llm_reply=f"Candidate 1 fake response: '{request.prompt}'.")

def fake_candidate_2(request: CustomRequest) -> CustomResponse:
    """
    Stand-in candidate #2: fabricates a reply instead of calling a model.

    Args:
        request: The prompt object; only its `prompt` text is read.

    Returns:
        A `CustomResponse` whose `llm_reply` quotes the request's prompt text, prefixed with
        this candidate's label.
    """
    reply_text = f"Candidate 2 fake response: '{request.prompt}'."
    return CustomResponse(llm_reply=reply_text)

# define Evals
# Each PromptTest pairs a CustomRequest prompt with lambda checks that read the candidate's
# reply through `data.response.llm_reply`. Both fake candidates quote the prompt text in their
# reply, so each test looks for its own prompt marker and for the absence of the other's.
test_1 = PromptTest(
    prompt=CustomRequest(prompt="This is a user's prompt for test 1."),
    checks=[
        # a DataRequest object is passed to all checks (Check or callable) from the Eval
        # the reply must mention this test's prompt ...
        lambda data: 'test 1' in data.response.llm_reply,
        # ... and must not mention the other test's prompt
        lambda data: 'test 2' not in data.response.llm_reply,
        # no code blocks are expected for this reply
        lambda data: len(data.code_blocks) == 0,
    ],
)
test_2 = PromptTest(
    prompt=CustomRequest(prompt="This is a user's prompt for test 2."),
    checks=[
        # a DataRequest object is passed to all checks (Check or callable) from the Eval
        # mirror image of test_1: expect 'test 2' in the reply and no 'test 1'
        lambda data: 'test 2' in data.response.llm_reply,
        lambda data: 'test 1' not in data.response.llm_reply,
        # no code blocks are expected for this reply
        lambda data: len(data.code_blocks) == 0,
    ],
)
# run the Evals via EvalHarness
# The harness is given every Eval and every candidate callable, then invoked with no arguments.
harness = EvalHarness(
    # note: we cannot pickle lambdas (used in the checks) so in this example we are limited to
    # a single CPU
    num_cpus=1,
    evals=[Eval(test_1), Eval(test_2)],
    candidates=[fake_candidate_1, fake_candidate_2],
)
print("# of Evals: ", len(harness.evals))
print("# of Candidates: ", len(harness.candidates))
results = harness()  # run the evals

# `results` is nested: the outer loop yields one collection at a time and the inner loop prints
# each result in it.
# NOTE(review): presumably one outer entry per candidate, each holding one result per Eval
# (2 x 2 here) -- confirm against EvalHarness.
for candidate_result in results:
    for result in candidate_result:
        print(result)

# of Evals:  2
# of Candidates:  2
EvalResult:
    # of Prompts Tested:         1
    Total Response Time:         0.0 seconds
    # of Response Characters:    N/A
    Characters per Second:       N/A
    # of Checks:                 3
    # of Successful Checks:      3
    % of Successful Checks:      100.0%
    # of Code Blocks Generated:  0
EvalResult:
    # of Prompts Tested:         1
    Total Response Time:         0.0 seconds
    # of Response Characters:    N/A
    Characters per Second:       N/A
    # of Checks:                 3
    # of Successful Checks:      3
    % of Successful Checks:      100.0%
    # of Code Blocks Generated:  0
EvalResult:
    # of Prompts Tested:         1
    Total Response Time:         0.0 seconds
    # of Response Characters:    N/A
    Characters per Second:       N/A
    # of Checks:                 3
    # of Successful Checks:      3
    % of Successful Checks:      100.0%
    # of Code Blocks Generated:  0
EvalResult:
    # of Prompts Tes