In [2]:
# EvalHarness runs evals asychronously, so we need to install nest_asyncio to avoid errors
# running the evals in a notebook
!pip install nest_asyncio

[0m

In [3]:
import time
from llm_eval.eval import EvalHarness, EvalResult
import nest_asyncio
nest_asyncio.apply()

def print_result(result: EvalResult) -> None:
    """
    Write a single evaluation result to stdout.

    Intended for use as a callback (e.g. `EvalHarness(callback=print_result)`).

    Args:
        result: The evaluation result to display; its string form is printed.
    """
    summary = str(result)
    print(summary)

# Build the harness; `print_result` is supplied as the callback for results.
eval_harness = EvalHarness(callback=print_result)

# Register the eval definitions, then the candidate definitions, from YAML files.
for eval_path in (
    '../examples/evals/simple_example.yaml',
    '../examples/evals/mask_emails.yaml',
):
    eval_harness.add_eval_from_yaml(eval_path)
for candidate_path in (
    '../examples/candidates/openai_3.5_1106.yaml',
    '../examples/candidates/openai_4.0_1106.yaml',
):
    eval_harness.add_candidate_from_yaml(candidate_path)

print('Starting eval_harness')
# perf_counter() is monotonic and high-resolution, so the measured interval is
# not affected by system clock adjustments the way time.time() can be.
start = time.perf_counter()
results = eval_harness()
end = time.perf_counter()
print(f"Total time: {end - start}")

  from .autonotebook import tqdm as notebook_tqdm


Starting eval_harness
EvalResult:
    Candidate:                  OpenAI GPT-3.5-Turbo (1106)
    Eval:                       Fibonacci Sequence
    # of Prompts Tested:        2
    Cost:                       $0.0011
    Total Response Time:        11.6 seconds
    # of Response Characters:   1,361
    # of Code Blocks Generated: 2
    Characters per Second:      117.8
    # of Checks:                5
    # of Successful Checks:     4
    % of Successful Checks:     80.0%
EvalResult:
    Candidate:                  OpenAI GPT-3.5-Turbo (1106)
    Eval:                       Python Function to Mask Emails
    # of Prompts Tested:        2
    Cost:                       $0.0010
    Total Response Time:        13.5 seconds
    # of Response Characters:   1,307
    # of Code Blocks Generated: 2
    Characters per Second:      96.8
    # of Checks:                6
    # of Successful Checks:     5
    % of Successful Checks:     83.3%
EvalResult:
    Candidate:                  OpenAI 

---

In [1]:
from llm_eval.candidates import OpenAICandidate
from llm_eval.eval import Eval

# Candidate configured with the `gpt-3.5-turbo-1106` model name.
model_config = {'parameters': {'model_name': 'gpt-3.5-turbo-1106'}}
candidate = OpenAICandidate(model_config)

# Eval built from a single prompt; only `test_sequence` is supplied.
fib_prompt = (
    "Create a python function called `fib` that takes an integer `n` and returns "
    "the `n`th number in the Fibonacci sequence. Use type hints and docstrings."
)
eval_obj = Eval(test_sequence={'prompt': fib_prompt})

# Run the eval against the candidate and print the resulting summary.
result = eval_obj(candidate)
print(result)

  from .autonotebook import tqdm as notebook_tqdm


EvalResult:
    # of Prompts Tested:        1
    Cost:                       $0.0005
    Total Response Time:        4.4 seconds
    # of Response Characters:   734
    # of Code Blocks Generated: 1
    Characters per Second:      168.3
    # of Checks:                0
    # of Successful Checks:     0
    % of Successful Checks:     0.0%
