# Example: Question Answering
A sample eyeball-plus-plus integration for a simple question-answering task

In [None]:
# NOTE(review): the `ask` cell in this notebook calls `openai.ChatCompletion`, which
# looks like the pre-1.0 openai client API — consider pinning `openai<1` here
# (confirm against eyeball_pp's own requirements before changing).
%pip install eyeball_pp openai pyyaml rich

### 1. Define your LLM task
Define the function for your task and add the `eyeball_pp.record_task` decorator so that its inputs and output are recorded.

In [1]:
import os

import eyeball_pp
import openai

# Read the API key from the OPENAI_API_KEY environment variable so a real
# secret never has to be typed into (and saved with) the notebook. The original
# placeholder is kept as a fallback when the variable is unset or empty.
openai.api_key = os.environ.get("OPENAI_API_KEY") or "your-openai-api-key"

# Setting a sample_rate of 1 means that every call to the ask function will be recorded.
# You might want to change this on production to a lower value like 0.1 if you only want to record 10% of the calls.
eyeball_pp.set_config(sample_rate=1)

@eyeball_pp.record_task(input_names=["context", "question"])
def ask(context: str, question: str) -> str:
    """Answer ``question`` with a chat model that is told to rely only on ``context``.

    Args:
        context: Text the model is instructed to use as its only source.
        question: The question to answer.

    Returns:
        The message content of the first choice in the chat completion response.
    """
    # You can write arbitrary code here, the only thing the eval framework
    # cares about is the input and output of this function.
    # In this case the inputs context and question are recorded and the output which is the return value of this function is recorded.

    system = """
    You are trying to answer a question strictly using the information provided in the context. Reply I don't know if you don't know the answer.
    """

    prompt = f"""
    Context: {context}
    Question: {question}
    """

    # eval params can be set when you are trying to evaluate this agent
    # with different parameters eg. different models, providers or hyperparameters like temperature
    model = eyeball_pp.get_eval_param("model") or "gpt-3.5-turbo"

    # Use an explicit None check instead of `or`: a temperature of 0 is a valid
    # setting but is falsy, so `0 or 0.5` would silently replace it with 0.5.
    # Assumes get_eval_param returns None when the param is not set — TODO confirm.
    temperature = eyeball_pp.get_eval_param("temperature")
    if temperature is None:
        temperature = 0.5

    # Note you can use any arbitrary LLM here, this example uses the openai API but you can
    # use anthropic claude, or any other open source LLM
    output = openai.ChatCompletion.create(  # type: ignore
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ],
    )["choices"][0]["message"][
        "content"
    ]  # type: ignore
    return output

### 2. Initial Run
Run the task with a few different inputs.

In [2]:
# First example: the context states the fox's color directly.
answer1 = ask(
    context="The quick brown fox jumps over the lazy dog",
    question="What color is the fox?",
)
print(answer1)

# Second example: the context only says what color the dog is not.
answer2 = ask(
    context="The lazy dog which is not brown jumps over the quick brown fox",
    question="What color is the dog?",
)
print(answer2)

I don't know.
I don't know.


### 3. Re-run with new Parameters
Rerun the recorded examples with different eval params (here, a different model and temperature).

In [3]:
# Eval params to apply while the recorded examples are rerun; `ask` reads
# them back through eyeball_pp.get_eval_param.
eval_params = {"model": "gpt-4", "temperature": 0}

for input_vars in eyeball_pp.rerun_recorded_examples(eval_params):
    # Each item carries the recorded inputs, keyed by the names given to record_task.
    context = input_vars["context"]
    question = input_vars["question"]
    answer = ask(context, question)
    print(f'Answer: "{answer}"')

1it [00:00,  1.21it/s]

answer="The fox is brown."


2it [00:02,  1.48s/it]

answer="The context does not provide information on the color of the dog."





### 4. Evaluate System
Evaluate the LLM results across different runs. You can use the built-in criteria available in `eyeball_pp.Criteria` and/or define your own, as shown below.

In [6]:
from eyeball_pp import Criteria

# Custom criteria are given as a mapping of criterion name -> question text.
custom_criteria = {
    "relevance": "Is the response correctly using the information in the context?"
}

# Grade with one built-in criterion plus the custom one defined above.
eyeball_pp.evaluate_system(
    grading_criteria=[Criteria.CORRECTNESS],
    grading_criteria_custom=custom_criteria,
)

100%|██████████|2/2








