# Example: Question Answering w/ Langchain
This notebook shows how evaluation can seamlessly be integrated with a standard Langchain question-answering workflow.

In [None]:
# Install the packages this notebook imports. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
# NOTE(review): versions are unpinned — consider pinning them so the notebook
# stays reproducible; the legacy `langchain.*` import paths used in the next
# cell may not exist in newer langchain releases — confirm.
%pip install eyeball_pp openai pyyaml rich langchain chromadb

### 0. Index the Content

In [1]:
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma


def _to_documents(content: list[str]) -> list[Document]:
    """Wrap every raw text in ``content`` in a ``Document``, keeping the order.

    Args:
        content: The page texts to wrap.

    Returns:
        One ``Document`` per input string, with the string as ``page_content``.
    """
    documents = []
    for text in content:
        documents.append(Document(page_content=text))
    return documents

def _from_documents(documents: list[Document]) -> list[str]:
    """Pull the ``page_content`` out of every document, keeping the order.

    Args:
        documents: The documents to unwrap.

    Returns:
        The ``page_content`` of each document, in input order.
    """
    page_texts = []
    for doc in documents:
        page_texts.append(doc.page_content)
    return page_texts


# Toy corpus: two sentences that mention the animals asked about later, plus
# two filler entries.
content = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy dog which is not brown jumps over the slow white rabbit",
    "Irrelevant context #1",
    "Irrelevant context #2",
]

# Wrap the raw strings in Document objects, which is what the vector store
# is given below.
texts = _to_documents(content)

# Build the Chroma vector store from the documents and the OpenAI embeddings;
# `docsearch` is used as the retriever by the QA chain later on.
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(documents=texts, embedding=embeddings)

### 1. Define your LLM task
Define the method and add the `record_task` decorator to record its inputs and outputs. Also use `record_intermediary_state` to record metadata that can be used for better evaluations.

In [2]:
import os

import eyeball_pp
import openai

# Take the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be pasted into (and saved with) the notebook. The placeholder
# is kept only as a fallback, so behaviour is unchanged when the variable is
# not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your-api-key")

# Setting a sample_rate of 1 means that every call to the ask function will be recorded.
# You might want to change this on production to a lower value like 0.1 if you only want to record 10% of the calls.
eyeball_pp.set_config(sample_rate=1)

# Using this decorator, the input and outputs of the function will be recorded
@eyeball_pp.record_task(args_to_record=["question"])
def ask(question: str) -> str:
    """Answer ``question`` with a retrieval-QA chain over ``docsearch``.

    The chat model and its sampling temperature can be overridden through the
    ``"model"`` and ``"temperature"`` eval params, which is how the re-run
    cell further down evaluates a different configuration.

    Args:
        question: The natural-language question to answer.

    Returns:
        The answer text produced by the chain (``result['result']``).
    """
    # eval params can be updated externally when you are trying to re-evaluate
    # this method, as will be shown below
    model = eyeball_pp.get_eval_param("model") or "gpt-3.5-turbo"

    # Compare against None instead of using `or`: a temperature of 0 is a
    # valid override but is falsy, so `or` would silently discard it.
    # NOTE(review): assumes get_eval_param returns None for an unset param
    # (the `or` fallback above already relies on a falsy value) — confirm.
    temperature = eyeball_pp.get_eval_param("temperature")

    llm_kwargs = {"model": model}
    if temperature is not None:
        # Only pass it when overridden so ChatOpenAI keeps its own default
        # otherwise, exactly as before.
        llm_kwargs["temperature"] = temperature

    chain = RetrievalQA.from_llm(
        llm=ChatOpenAI(**llm_kwargs),
        retriever=docsearch.as_retriever(),
        return_source_documents=True,
    )
    result = chain(question)

    # Recording the context as an intermediary state will help the evaluator
    # perform better (eg. by comparing output with the context produced)
    context = _from_documents(result['source_documents'])
    eyeball_pp.record_intermediary_state("context", context)

    return result['result']

### 2. Initial Run
Run the task with a few different inputs.

In [3]:
# Ask one question per animal; with sample_rate=1 both calls are recorded.
fox_question = "What color is the fox?"
answer1 = ask(fox_question)
print(f'Answer #1: "{answer1}"')

dog_question = "What color is the dog?"
answer2 = ask(dog_question)
print(f'Answer #2: "{answer2}"')

Answer #1: "The color of the fox is not specified in the given context."
Answer #2: "The color of the dog is not specified in the given context."


### 3. Re-run with new Parameters
Re-run the recorded examples with a different set of eval params.

In [4]:
# Eval params to apply while the recorded inputs are replayed.
rerun_eval_params = {"model": "gpt-4", "temperature": 0}

# Each iteration yields the recorded input variables of one earlier call.
for input_vars in eyeball_pp.rerun_recorded_examples(rerun_eval_params):
    answer = ask(input_vars["question"])
    print(f'Answer: "{answer}"')

1it [00:01,  1.92s/it]

Answer: "The fox is brown."


2it [00:03,  1.64s/it]

Answer: "The dog is not brown."





### 4. Evaluate System
Evaluate the LLM results across different runs. You can use the built-in criteria available in `eyeball_pp.Criteria` and/or define your own, as shown below.

In [5]:
from eyeball_pp import Criteria

# Built-in criteria to grade against.
builtin_criteria = [Criteria.CORRECTNESS]

# Custom criteria: name -> question the grader should answer.
custom_criteria = {
    "relevance": "Is the response correctly using the information in the context?",
}

eyeball_pp.evaluate_system(
    grading_criteria=builtin_criteria,
    grading_criteria_custom=custom_criteria,
)

  0%|          |0/2

100%|██████████|2/2








