In [1]:
# Set the working directory to the root directory of the project (the parent of the
# directory the kernel starts in) and load environment variables.
import os
from dotenv import load_dotenv

# `os.chdir('..')` is relative to the *current* directory, so re-running this cell would
# climb one more level each time and break the relative `examples/...` paths used later.
# Remember the directory the kernel started in and always change to its parent, which
# makes the cell safe to run more than once.
_NOTEBOOK_START_DIR = globals().get('_NOTEBOOK_START_DIR', os.getcwd())
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

# Load variables from a `.env` file into the environment; the return value is displayed
# as the cell output.
load_dotenv()

True

# Check Objects

This section shows how to use Check objects directly.

## RegexCheck Example

A RegexCheck checks whether or not the value passed to the check matches the regex pattern provided.

In [2]:
from sik_llm_eval.checks import RegexCheck

# Pattern for a simple lowercase e-mail address such as `john@doe.com`.
email_pattern = r"\b[a-z]+@[a-z]+\.[a-z]+\b"
check = RegexCheck(pattern=email_pattern)

# Calling the check with a value returns a result object describing the outcome.
text_with_email = "This is an email john@doe.com, the check should succeed."
result = check(text_with_email)
assert result.success is True
result.to_dict()

{'value': True,
 'success': True,
 'metadata': {'check_type': 'REGEX',
  'check_pattern': '\\b[a-z]+@[a-z]+\\.[a-z]+\\b',
  'check_negate': False,
  'check_metadata': {}},
 'result_type': 'PASS_FAIL'}

## F1Score Example

The F1Score is a measure of the overlap between the words in the ideal response and the words in the actual response.

In [3]:
from sik_llm_eval.checks import F1Score

# The keys match the keyword arguments accepted when calling an F1Score check.
responses = {
    'actual_response': "This is the correct answer.",
    'ideal_response': "A correct answer was given.",
}
score = F1Score()
result = score(**responses)
print(f"The F1 score is `{result.value}`.")
result.to_dict()

The F1 score is `0.8`.


{'value': 0.8,
 'metadata': {'check_type': 'F1_SCORE', 'check_metadata': {}},
 'result_type': 'SCORE',
 'success': None}

---

# Evals

In the examples above, we called the Check objects directly, passing in the values to check (i.e. the responses from the LLM/agent).

Typically, you'll want to create an Eval that encapsulates a particular test case and a collection of one or more checks.

Below, we will define an "Eval" which runs all of the checks that are defined on the Eval.

In [4]:
from sik_llm_eval.eval import Eval
from sik_llm_eval.checks import RegexCheck, F1Score

# Checks that will be run against every response passed to the Eval.
email_checks = [
    RegexCheck(pattern=r"\b[a-z]+@[a-z]+\.[a-z]+\b"),
    F1Score(success_threshold=0.5),
]
# The ideal response lives on the Eval so that any check that needs it can use it.
evaluation = Eval(
    ideal_response="This is an exmaple of the ideal response which contains the email jane@doe.com.",
    checks=email_checks,
)

# Calling the Eval with a response runs all of its checks against that response.
fake_response = "This is an another email john@doe.com."
result = evaluation(fake_response)
result.to_dict()

{'eval': {'checks': [{'check_type': 'REGEX',
    'pattern': '\\b[a-z]+@[a-z]+\\.[a-z]+\\b'},
   {'check_type': 'F1_SCORE', 'success_threshold': 0.5}],
  'ideal_response': 'This is an exmaple of the ideal response which contains the email jane@doe.com.'},
 'candidate': None,
 'response': 'This is an another email john@doe.com.',
 'metadata': None,
 'timestamp': '2024-12-11T19:45:54.718711+00:00',
 'check_results': [{'value': True,
   'success': True,
   'metadata': {'check_type': 'REGEX',
    'check_pattern': '\\b[a-z]+@[a-z]+\\.[a-z]+\\b',
    'check_negate': False,
    'check_metadata': {}},
   'result_type': 'PASS_FAIL'},
  {'value': 0.2222222222222222,
   'success': False,
   'metadata': {'check_type': 'F1_SCORE', 'check_metadata': {}},
   'success_threshold': 0.5,
   'result_type': 'SCORE'}]}

In the example above, the `ideal_response` is associated with a particular Eval so that multiple checks can use it.

When using Eval objects, the actual `response`, `ideal_response`, and `input` (`input` is not shown above, but can also be attached to an eval) are sent to Check objects by the Eval. Each check knows which values to use (for example, the F1Score knew where to pull the actual response and ideal response from).

We'll see below that we can modify where the Checks look for the data.

## Structured Response

In the example below, we see a hypothetical response from a RAG agent that first extracts the most relevant document, generates a response based on that document, and then returns the response, the document id that the agent used to generate the response, and some additional metadata that we may or may not use in the Eval.

Here, we want the Checks to use the `generated_response` in the dictionary as the response in the RegexCheck and F1Score, and the `document_id` in the MatchCheck to check that the agent used the expected document based on this particular Eval.

In order to tell the Check objects where to extract the data that's passed to them by the Eval object, we'll set the `data_path` parameter in the Check's `__init__` function. This parameter can be set to a string, a list, or a dictionary, and indicates the "path" to the data we want to extract, relative to the `response`, `input`, `ideal_response`, or `metadata`.

So, for example, the `response` variable sent to the Check from the Eval will contain the dictionary defined in the next cell. We want the RegexCheck and F1Score to use the value corresponding to the `generated_response` key in that dictionary. We also want the MatchCheck to match against the value corresponding to the `document_id` key. When passing a string to `data_path`, a single value will be extracted. When passing a dictionary, like we do with F1Score, the keys will correspond to the parameter names in the corresponding `__call__` function; in this case, they will be the `actual_response` and `ideal_response` parameters of the F1Score `__call__` function. The values are the paths to the data to extract.

In [5]:
# Responses can be returned by LLMs and agents in different formats; here the response is a
# dictionary rather than a plain string.

# Extra information returned alongside the response.
example_metadata = {'cost': 0.5, 'num_tokens': 100, 'foo': 'bar'}

fake_agent_response = {
    'generated_response': "This is an example response containing email john@doe.com.",
    # `document_id` could, for example, be returned by an agent using RAG to indicate which
    # document was used to generate the response. The Eval can test this value to check
    # whether the correct document was used.
    'document_id': 'doc_123',
    'example_metadata': example_metadata,
}

In [6]:
from sik_llm_eval.checks import RegexCheck, MatchCheck, F1Score
from sik_llm_eval.eval import Eval

# Paths (relative to the data the Eval passes to each check) of the values to extract.
generated_response_path = "response['generated_response']"
document_id_path = "response['document_id']"

# A string `data_path` extracts a single value for the check.
regex_check = RegexCheck(
    data_path=generated_response_path,
    pattern=r"\b[a-z]+@[a-z]+\.[a-z]+\b",
)
match_check = MatchCheck(
    data_path=document_id_path,
    value="doc_123",
)
# A dictionary `data_path` maps the arguments of the F1Score's __call__ method
# (`actual_response`, `ideal_response`) to the paths of the values to pass in. The paths start
# at `response` / `ideal_response`, which are properties on the ResponseModel object that is
# passed to the Check from the Eval object.
f1_check = F1Score(
    data_path={
        'actual_response': generated_response_path,
        'ideal_response': "ideal_response",
    },
    success_threshold=0.5,
)

evaluation = Eval(
    ideal_response="This is an exmaple of the ideal response which contains the email jane@doe.com.",
    checks=[regex_check, match_check, f1_check],
)
result = evaluation(fake_agent_response)

# `check_results` follows the order in which the checks were given to the Eval.
print(f"Does the response contain an email (RegexCheck)? {result.check_results[0].success}")
print(f"Does the document_id match the expected value (MatchCheck)? {result.check_results[1].success}")
print(f"Is the F1 score greater than the threshold of `{result.check_results[2].success_threshold}`? {result.check_results[2].success}")
print(f"The F1 score is `{result.check_results[2].value:.2f}`.")
print("\n---\n")
result.to_dict()

Does the response contain an email (RegexCheck)? True
Does the document_id match the expected value (MatchCheck)? True
Is the F1 score greater than the threshold of `0.5`? False
The F1 score is `0.36`.

---



{'eval': {'checks': [{'check_type': 'REGEX',
    'data_path': "response['generated_response']",
    'pattern': '\\b[a-z]+@[a-z]+\\.[a-z]+\\b'},
   {'check_type': 'MATCH',
    'data_path': "response['document_id']",
    'value': 'doc_123'},
   {'check_type': 'F1_SCORE',
    'data_path': {'actual_response': "response['generated_response']",
     'ideal_response': 'ideal_response'},
    'success_threshold': 0.5}],
  'ideal_response': 'This is an exmaple of the ideal response which contains the email jane@doe.com.'},
 'candidate': None,
 'response': {'generated_response': 'This is an example response containing email john@doe.com.',
  'document_id': 'doc_123',
  'example_metadata': {'cost': 0.5, 'num_tokens': 100, 'foo': 'bar'}},
 'metadata': None,
 'timestamp': '2024-12-11T19:45:54.738828+00:00',
 'check_results': [{'value': True,
   'success': True,
   'metadata': {'check_type': 'REGEX',
    'check_pattern': '\\b[a-z]+@[a-z]+\\.[a-z]+\\b',
    'check_negate': False,
    'check_metadata':

---

# Evaluating Multiple Evals and "Candidates" w/ EvalHarness

In the examples above, we only evaluated a single Eval against a single LLM or agent.

But typically, we'd like to evaluate many Evals against one or more LLMs/agents and compare the results.

In [7]:
# import nest_asyncio
# nest_asyncio.apply()  # needed for running async in jupyter notebook

In [8]:
from sik_llm_eval.checks import RegexCheck
from sik_llm_eval.eval import Eval

# Each Eval pairs a prompt (`input`) with a regex that the response is expected to match.
shooter_check = RegexCheck(pattern=r"Steph(en)?\sCurry")
eval_a = Eval(
    metadata={'id': 'shooter'},
    input="In a single sentence, who is the greatest basketball shooter of all time?",
    checks=[shooter_check],
)

goat_check = RegexCheck(pattern=r"Michael\s(Jeffrey\s)?Jordan")
eval_b = Eval(
    metadata={'id': 'goat'},
    input="In a single sentence, who is the GOAT of basketball?",
    checks=[goat_check],
)

As a reminder from above, this is what it would look like if we manually passed a response from a single model to a single Eval.

We'll use `OpenAICompletion` which is a simple wrapper around OpenAI which makes the input/output of the model easier to work with.

In [9]:
from openai import OpenAI
from sik_llm_eval.openai import OpenAICompletion, user_message

# Wrap the OpenAI client so the model can be called with a list of messages.
client = OpenAI()
model = OpenAICompletion(client=client, model='gpt-4o-mini', temperature=0.1)

# Send the Eval's input as a single user message and show the text of the reply.
prompt = eval_a.input
messages = [user_message(prompt)]
response = model(messages=messages)
print(response.content)


Many consider Stephen Curry to be the greatest basketball shooter of all time due to his exceptional shooting accuracy, range, and impact on the game.


In [10]:
# Run the checks defined on `eval_a` against the text the model returned.
response_text = response.content
result = eval_a(response_text)
print(f"Correct response? {result.check_results[0].success}")
result.to_dict()

Correct response? True


{'eval': {'metadata': {'id': 'shooter'},
  'input': 'In a single sentence, who is the greatest basketball shooter of all time?',
  'checks': [{'check_type': 'REGEX', 'pattern': 'Steph(en)?\\sCurry'}]},
 'candidate': None,
 'response': 'Many consider Stephen Curry to be the greatest basketball shooter of all time due to his exceptional shooting accuracy, range, and impact on the game.',
 'metadata': None,
 'timestamp': '2024-12-11T19:45:55.631037+00:00',
 'check_results': [{'value': True,
   'success': True,
   'metadata': {'check_type': 'REGEX',
    'check_pattern': 'Steph(en)?\\sCurry',
    'check_negate': False,
    'check_metadata': {}},
   'result_type': 'PASS_FAIL'}]}

However, if we want to run multiple Evals against multiple LLMs (or agents, or variations of LLM settings such as temperature), we can use the `EvalHarness`.

However, we have one slight problem: different models/APIs/agents/etc will expect different data structures for both inputs and outputs. So how can we ensure that the `input` defined on the Eval object matches the inputs expected by the models, and that the outputs generated by the models are in the correct format that our Evals/Checks expect?

We can use a `Candidate` object, which is just a light-weight wrapper around the model that defines a particular interface.

Users can create custom Candidate objects for their own APIs/agents, or they can use built-in candidates. For example, there is an OpenAICandidate that can be used to evaluate OpenAI models against Evals.

However, let's create our own for demonstration purposes. A candidate can be a Candidate object, or it can simply be a callable object that takes the input from the Eval and returns a CandidateResponse object.

Additionally, we can pass metadata to the CandidateResponse so that we can track, for example, token usage and costs associated with the messages.

In [None]:
from sik_llm_eval.candidates import CandidateResponse
from sik_llm_eval.eval import EvalHarness

class MyCandidate:
    """
    Example of a custom candidate: a callable that sends the Eval's input to `gpt-4o-mini`.

    The temperature is configurable so that the same model can be compared under different
    settings.
    """

    def __init__(self, temperature: float):
        """Store the temperature passed to the model on every call."""
        self.temperature = temperature

    def __call__(self, input: str) -> CandidateResponse:
        """
        Send `input` to the model as a single user message and wrap the reply.

        A new OpenAI client and model wrapper are created on each call. The reply's content is
        returned as the response, and its `usage` is attached as metadata.
        """
        completion = OpenAICompletion(
            client=OpenAI(),
            model='gpt-4o-mini',
            temperature=self.temperature,
        )
        messages = [user_message(input)]
        reply = completion(messages=messages)
        return CandidateResponse(response=reply.content, metadata=reply.usage)

# Compare the same model at a low and a high temperature.
temperatures = (0.1, 1.0)
harness = EvalHarness(
    evals=[eval_a, eval_b],
    candidates=[MyCandidate(temperature=temperature) for temperature in temperatures],
)

# Calling the harness runs the evals against the candidates.
results = harness()
results

[<sik_llm_eval.eval.CandidateRunResults at 0x1173dfad0>,
 <sik_llm_eval.eval.CandidateRunResults at 0x111b93a90>]

In [12]:
# `results` is indexed by candidate; each entry's `eval_results` is indexed by eval.
first_candidate_results = results[0]
first_eval_result = first_candidate_results.eval_results[0]
first_eval_result.to_dict()

{'eval': {'metadata': {'id': 'shooter'},
  'input': 'In a single sentence, who is the greatest basketball shooter of all time?',
  'checks': [{'check_type': 'REGEX', 'pattern': 'Steph(en)?\\sCurry'}]},
 'candidate': '<__main__.MyCandidate object at 0x1174d0450>',
 'response': 'Many consider Stephen Curry to be the greatest basketball shooter of all time due to his unparalleled shooting accuracy and ability to make shots from long distances.',
 'metadata': {'response_metadata': {'completion_tokens': 28,
   'prompt_tokens': 22,
   'total_tokens': 50,
   'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0},
   'completion_tokens_details': {'reasoning_tokens': 0,
    'audio_tokens': 0,
    'accepted_prediction_tokens': 0,
    'rejected_prediction_tokens': 0}},
  'response_timestamp': '2024-12-11T19:45:56.468490+00:00'},
 'timestamp': '2024-12-11T19:45:59.306845+00:00',
 'check_results': [{'value': True,
   'success': True,
   'metadata': {'check_type': 'REGEX',
    'check_patte

In [13]:
# Report, for each candidate, the share of evals whose first check succeeded.
for r in results:
    eval_results = r.eval_results  # list of EvalResult objects
    # only the first check of each eval is inspected
    first_check_outcomes = [er.check_results[0].success for er in eval_results]
    num_evals = len(first_check_outcomes)
    num_successes = sum(first_check_outcomes)
    success_rate = num_successes / num_evals
    print(f"Candidate (temp={r.candidate.temperature}) Success rate: {success_rate:.1%} ({num_successes}/{num_evals})")

Candidate (temp=0.1) Success rate: 100.0% (2/2)
Candidate (temp=1.0) Success rate: 100.0% (2/2)


---

# Evaluating OpenAI `4.0` and `4o-mini` against two evals

In this example, we'll show how to use a built-in Candidate object, the OpenAICandidate, to evaluate `ChatGPT 4.0` against `4o-mini`.

We'll also show how to load evals and candidates from yaml files. Loading candidates from yaml (or from json, which is also supported), requires the Candidate object to use the `@Candidate.register(<candidate name>)` decorator, which tells the Candidate class which sub-class to instantiate when loading from yaml/json or an in-memory python dictionary.

Please refer to the `examples/evals` and `examples/candidates` folder to view the underlying yaml files.

The Evals that are loaded are basic Python code-generation prompts (e.g. "Create a python function that does X") along with checks that validate the Python code that is generated.

In the example below, we also tell the EvalHarness to generate the responses (from the `input` of the `Eval` object) asynchronously, and to run the Evals (i.e. the underlying Checks) using multiprocessing. Running the Checks in parallel is useful for checks that are compute-intensive (e.g. extracting and executing Python code blocks via `PythonCodeBlockTests`, or cleaning/tokenizing text via `F1Score`). The asynchronous batch size and number of CPUs can be set in the EvalHarness `__init__` method.

In [14]:
import time
from sik_llm_eval.eval import EvalHarness, Mode

harness = EvalHarness(
    response_mode=Mode.ASYNC,
    eval_mode=Mode.PARALLEL,
)
harness.add_evals_from_files('examples/evals/*.yaml')
harness.add_candidate_from_file('examples/candidates/openai_4.0.yaml')
harness.add_candidate_from_file('examples/candidates/openai_4o-mini.yaml')

print("# of Evals: ", len(harness.evals))
print("# of Candidates: ", len(harness.candidates))

print("Starting eval_harness")
start = time.time()
results = harness()  # run the evals
end = time.time()
print(f"Total time: {end - start}")

# of Evals:  3
# of Candidates:  2
Starting eval_harness
Total time: 3.790855407714844e-05


---

The following is a dictionary representation of the third Eval Result (`[2]`) from the first candidate (`[0]`).

In [15]:
# Third eval result (`[2]`) of the first candidate (`[0]`).
third_eval_result = results[0].eval_results[2]
third_eval_result.to_dict()

TypeError: '_asyncio.Task' object is not subscriptable

In [None]:
# Compare how long one response took to generate with the runtime of the whole harness.
first_candidate_results = results[0]
result = first_candidate_results.eval_results[2]
response_metadata = result.metadata['response_metadata']
duration = response_metadata['duration_seconds']
print(f"Duration of 3rd Eval for 1st Candidate: {duration:.2f} seconds")
print(f"Total Runtime of EvalHarness: {end - start:.2f} seconds")

Above, we can see the benefit of running response generation asynchronously. The time it took ChatGPT 4.0 to generate a response for the 3rd Eval was just under the time it took to run all 6 Evaluations (response generation and check evaluation).

---

The following code contains an example of how to summarize the eval results.

The EvalHarness returns a list of `CandidateRunResults` objects, one per candidate. Each object exposes the `candidate` and its `eval_results` (a list of `EvalResult` objects). So if 5 candidates were evaluated, the `results` object would be a list of 5 items. If there were 10 evals (evaluated against the 5 candidates), then each item's `eval_results` would contain 10 `EvalResult` objects.

In [None]:
import pandas as pd

results_summary = []
# `results` has one entry per candidate; each entry carries the candidate and its eval results
for cand_results in results:
    candidate_name = cand_results.candidate.metadata['name']
    eval_results = cand_results.eval_results
    num_characters = sum(len(r.response) for r in eval_results)

    # metadata recorded for each generated response (duration, cost, ...)
    response_metadata = [r.metadata['response_metadata'] for r in eval_results]
    response_duration = sum(meta['duration_seconds'] for meta in response_metadata)
    avg_chars_per_second = num_characters / response_duration
    avg_cost = sum(meta['total_cost'] for meta in response_metadata) / len(eval_results)

    # share of all checks (across every eval) that passed for this candidate
    num_checks = sum(len(r.check_results) for r in eval_results)
    num_successful_checks = sum(r.num_successful_checks for r in eval_results)
    percent_success = num_successful_checks / num_checks

    summary_row = {
        'name': candidate_name,
        'Avg chars per second': avg_chars_per_second,
        'Avg cost': avg_cost,
        '# checks': num_checks,
        '# checks passed': num_successful_checks,
        '% checks passed': percent_success,
    }
    results_summary.append(summary_row)
    print(f"Results for {candidate_name}:")
    print(f"  {num_successful_checks}/{num_checks} ({percent_success:.1%}) successful checks")

# one row per candidate, with display formats applied to the numeric columns
column_formats = {
    'Avg chars per second': '{:.1f}',
    'Avg cost': '{:.4f}',
    '% checks passed': '{:.1%}',
}
pd.DataFrame(results_summary).style.format(column_formats)

---