In [1]:
# EvalHarness runs evals asychronously, so we need to install nest_asyncio to avoid errors
# running the evals in a notebook
!pip install nest_asyncio

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import time
from llm_eval.eval import EvalHarness, EvalResult
import nest_asyncio
nest_asyncio.apply()

def print_result(result: EvalResult) -> None:
    """Callback handed to the harness: write one evaluation result to stdout."""
    rendered = str(result)
    print(rendered)

# Build the harness with `print_result` as its callback, then register two evals
# and two candidates from their YAML definitions.
eval_harness = EvalHarness(callback=print_result)
eval_harness.add_eval_from_yaml('../examples/evals/simple_example.yaml')
eval_harness.add_eval_from_yaml('../examples/evals/mask_emails.yaml')
eval_harness.add_candidate_from_yaml('../examples/candidates/openai_3.5_1106.yaml')
eval_harness.add_candidate_from_yaml('../examples/candidates/openai_4.0_1106.yaml')

print('Starting eval_harness')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-run.
start = time.perf_counter()
# Calling the harness runs the registered evals; `results` is reused by later cells.
results = eval_harness()
end = time.perf_counter()
print(f"Total time: {end - start}")

  from .autonotebook import tqdm as notebook_tqdm


Starting eval_harness
EvalResult:
    Candidate:                  OpenAI GPT-3.5-Turbo (1106)
    Eval:                       Fibonacci Sequence
    # of Prompts Tested:        2
    Cost:                       $0.0011
    Total Response Time:        5.7 seconds
    # of Response Characters:   1,270
    Characters per Second:      220.9
    # of Code Blocks Generated: 2
    # of Checks:                5
    # of Successful Checks:     4
    % of Successful Checks:     80.0%
EvalResult:
    Candidate:                  OpenAI GPT-3.5-Turbo (1106)
    Eval:                       Python Function to Mask Emails
    # of Prompts Tested:        2
    Cost:                       $0.0010
    Total Response Time:        6.2 seconds
    # of Response Characters:   1,306
    Characters per Second:      211.4
    # of Code Blocks Generated: 2
    # of Checks:                6
    # of Successful Checks:     5
    % of Successful Checks:     83.3%
EvalResult:
    Candidate:                  OpenAI G

In [52]:
import numpy as np

# For each group in `results`, key by the candidate name taken from the group's
# first result and collect the `success` flag of every check across all results
# in that group.
check_results = {
    candidate_results[0].candidate_obj.metadata['name']: [
        check.success
        for eval_result in candidate_results
        for check in eval_result.all_check_results
    ]
    for candidate_results in results
}
# Mean of the success flags, i.e. the fraction of checks that passed per candidate.
{name: np.mean(successes) for name, successes in check_results.items()}

{'OpenAI GPT-3.5-Turbo (1106)': 0.8181818181818182,
 'OpenAI GPT-4.0-Turbo (1106)': 0.7272727272727273}

In [53]:
# Inspect the dictionary form of the first result in the first group of `results`
# (per the output below: GPT-3.5-Turbo on the "Fibonacci Sequence" eval).
# NOTE(review): the PYTHON_CODE_BLOCK_TESTS check shown for the Fibonacci eval
# calls `mask_emails` -- looks copy-pasted from the mask-emails eval; confirm
# against simple_example.yaml.
results[0][0].to_dict()

{'eval_obj': {'metadata': {'name': 'Fibonacci Sequence'},
  'test_sequence': [{'prompt': 'Create a python function called `fib` that takes an integer `n` and returns the `n`th number in the Fibonacci sequence. Use type hints and docstrings.',
    'checks': [{'pattern': 'def fib\\([a-zA-Z_]+\\: int\\) -> int\\:',
      'check_type': 'REGEX'},
     {'check_type': 'PYTHON_CODE_BLOCKS_PRESENT'}]},
   {'prompt': 'Create a set of assertion statements that test the function.',
    'checks': [{'value': 'assert fib(', 'check_type': 'CONTAINS'},
     {'check_type': 'PYTHON_CODE_BLOCKS_PRESENT'},
     {'code_setup': 'import re\n',
      'code_tests': ["def verify_mask_emails_with_no_email_returns_original_string(code_blocks: list[str]) -> bool:\n    value = 'This is a string with no email addresses'\n    return mask_emails(value) == value"],
      'check_type': 'PYTHON_CODE_BLOCK_TESTS'}]}]},
 'candidate_obj': {'metadata': {'name': 'OpenAI GPT-3.5-Turbo (1106)'},
  'parameters': {'model_name': 'g

---

In [4]:
from llm_eval.candidates import OpenAICandidate
from llm_eval.eval import Eval

# Minimal example without the harness: build one candidate and one single-prompt
# eval directly, call the eval with the candidate, and print the result.
candidate = OpenAICandidate({'parameters': {'model_name': 'gpt-3.5-turbo-1106'}})
eval_obj = Eval(
    test_sequence={
        # Adjacent string literals are concatenated into one prompt string.
        'prompt': (
            "Create a python function called `fib` that takes an integer `n` and "
            "returns the `n`th number in the Fibonacci sequence. "
            "Use type hints and docstrings."
        ),
    },
)
result = eval_obj(candidate)
print(result)

EvalResult:
    # of Prompts Tested:        1
    Cost:                       $0.0003
    Total Response Time:        3.5 seconds
    # of Response Characters:   635
    Characters per Second:      179.5
    # of Code Blocks Generated: 1
    # of Checks:                0
    # of Successful Checks:     0
    % of Successful Checks:     0.0%
