In [None]:
# EvalHarness runs evals asychronously, so we need to install nest_asyncio to avoid errors
# running the evals in a notebook
!pip install nest_asyncio

# Evaluating OpenAI 3.5 and 4.0 against two evals

This example shows how to use the EvalHarness to evaluate OpenAI 3.5 and 4.0 against two fictitious evals. The candidates and evals in this example are defined in yaml files.

Eval and Candidate objects can be added to the EvalHarness directly through `add_evals()` and `add_candidates()`, which each accept either a single Eval/Candidate/dict object or a list of them. An eval or candidate that is defined in a yaml file can be added to the EvalHarness via `add_eval_from_yaml()` or `add_candidate_from_yaml()`, which take the path to the yaml file as a string. Multiple evals/candidates defined in yaml files can be added to the EvalHarness with `add_evals_from_yamls()` and `add_candidates_from_yamls()`, which take a string containing a directory and load all yaml files in that directory.

In [4]:
import time
from llm_eval.eval import EvalHarness, EvalResult
import nest_asyncio

# EvalHarness runs the evals asynchronously; applying nest_asyncio avoids errors
# when that happens inside a Jupyter notebook.
nest_asyncio.apply()

def print_result(result: EvalResult) -> None:
    """
    Callback for the EvalHarness: print one evaluation result, then a `---` line.

    Printing is only one option for a callback. It could also write each result to a
    file, which is worth doing for a large number of evaluations so that results
    already produced are not lost if an issue/error occurs before the entire
    EvalHarness completes.
    """
    separator = '---'
    # One call emits the result and the separator, each terminated by a newline.
    print(result, separator, sep='\n')

# Build the harness; `print_result` is passed as the callback for evaluation results.
eval_harness = EvalHarness(callback=print_result)
# Register the two example evals and the two OpenAI candidates from their yaml files.
eval_harness.add_eval_from_yaml("../examples/evals/simple_example.yaml")
eval_harness.add_eval_from_yaml("../examples/evals/mask_emails.yaml")
eval_harness.add_candidate_from_yaml("../examples/candidates/openai_3.5_1106.yaml")
eval_harness.add_candidate_from_yaml("../examples/candidates/openai_4.0_1106.yaml")

print("Starting eval_harness")
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# time; time.time() follows the wall clock, which can be adjusted during the run.
start = time.perf_counter()
# Calling the harness runs the evals; `results` is summarized in a later cell.
results = eval_harness()
end = time.perf_counter()
print(f"Total time: {end - start}")

Starting eval_harness


EvalResult:
    Candidate:                  OpenAI GPT-3.5-Turbo (1106)
    Eval:                        Fibonacci Sequence
    # of Prompts Tested:         2
    Cost:                       $0.0008
    Total Response Time:         9.5 seconds
    # of Response Characters:    1,423
    Characters per Second:       150.5
    # of Checks:                 5
    # of Successful Checks:      4
    % of Successful Checks:      80.0%
    # of Code Blocks Generated:  2
    # of Successful Code Blocks: 2
    # of Code Tests Defined:     1
    # of Successful Code Tests:  0
EvalResult:
    Candidate:                  OpenAI GPT-3.5-Turbo (1106)
    Eval:                        Python Function to Mask Emails
    # of Prompts Tested:         2
    Cost:                       $0.0007
    Total Response Time:         8.2 seconds
    # of Response Characters:    1,538
    Characters per Second:       188.5
    # of Checks:                 6
    # of Successful Checks:      5
    % of Successful Check

The following code contains an example of how to summarize the eval results.

The EvalHarness returns a list of lists. The outer list has one item per candidate, and each item is the list of eval results for that candidate. So if 5 candidates were evaluated, the `results` object would be a list of 5 items (each of which is itself a list). If 10 evals were evaluated against those 5 candidates, then each inner list would contain 10 `EvalResult` objects.

In [21]:
import pandas as pd

def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return `numerator / denominator`, or NaN when `denominator` is zero."""
    return numerator / denominator if denominator else float('nan')


# Build one summary row per candidate, aggregating over all of that candidate's evals.
results_summary = []
# each outer list in results corresponds to a candidate
for cand_obj, cand_results in zip(eval_harness.candidates, results):
    candidate_name = cand_obj.metadata['name']
    num_checks = sum(r.num_checks for r in cand_results)
    num_successful_checks = sum(r.num_successful_checks for r in cand_results)
    # A candidate may have no checks or no generated code blocks; the ratio is then
    # NaN (printed as "nan%") instead of raising ZeroDivisionError.
    percent_success = _safe_ratio(num_successful_checks, num_checks)
    num_code_blocks_generated = sum(r.num_code_blocks for r in cand_results)
    num_code_blocks_successful = sum(r.get_num_code_blocks_successful() for r in cand_results)
    percent_code_blocks_successful = _safe_ratio(
        num_code_blocks_successful,
        num_code_blocks_generated,
    )
    results_summary.append({
        'name': candidate_name,
        '# checks': num_checks,
        '# checks passed': num_successful_checks,
        '% checks passed': percent_success,
        '# code blocks generated': num_code_blocks_generated,
        '# blocks successfully executed': num_code_blocks_successful,
        '% blocks successfully executed': percent_code_blocks_successful,
    })
    print(f"Results for {candidate_name}:")
    print(f"  {num_successful_checks}/{num_checks} ({percent_success:.1%}) successful checks")
    print(f"  {num_code_blocks_successful}/{num_code_blocks_generated} ({percent_code_blocks_successful:.1%}) successful code blocks")  # noqa

# Last expression of the cell: rendered as a table with one row per candidate.
pd.DataFrame(results_summary)

Results for OpenAI GPT-3.5-Turbo (1106):
  9/11 (81.8%) successful checks
  3/4 (75.0%) successful code blocks
Results for OpenAI GPT-4.0-Turbo:
  10/11 (90.9%) successful checks
  4/4 (100.0%) successful code blocks


Unnamed: 0,name,# checks,# checks passed,% checks passed,# code blocks generated,# blocks successfully executed,% blocks successfully executed
0,OpenAI GPT-3.5-Turbo (1106),11,9,0.818182,4,3,0.75
1,OpenAI GPT-4.0-Turbo,11,10,0.909091,4,4,1.0


---

# Running a single Eval against a single Candidate

A less common scenario, which might be useful when generating evals or debugging, is running a single Eval against a single Candidate. Eval objects are callable and can be executed by passing a candidate.

In [23]:
from llm_eval.candidates import OpenAICandidate
from llm_eval.eval import Eval

# Candidate: an OpenAI candidate configured through its `parameters` dict.
candidate_config = {'parameters': {'model_name': 'gpt-3.5-turbo-1106'}}
candidate = OpenAICandidate(candidate_config)

# Eval: a single prompt together with the checks attached to it.
mask_emails_prompt = "Create a python function called `mask_emails` that uses regex to mask all emails."
mask_emails_checks = [
    {'check_type': 'CONTAINS', 'value': 'def mask_emails'},
    {'check_type': 'PYTHON_CODE_BLOCKS_PRESENT'},
]
eval_obj = Eval(prompt_sequence={'prompt': mask_emails_prompt, 'checks': mask_emails_checks})

# Eval objects are callable: passing a candidate executes the eval against it.
result = eval_obj(candidate)
print(result)

EvalResult:
    # of Prompts Tested:         1
    Cost:                       $0.0003
    Total Response Time:         5.4 seconds
    # of Response Characters:    759
    Characters per Second:       141.4
    # of Checks:                 2
    # of Successful Checks:      2
    % of Successful Checks:      100.0%
    # of Code Blocks Generated:  1
