In [None]:
%%bash

# haystack-experimental is expected to be installed already; only the extra
# packages this notebook relies on are installed here, one after the other.
for package in datasets sentence-transformers; do
    pip install "$package"
done

In [2]:
# The OpenAI-backed components read their key from the OPENAI_API_KEY
# environment variable, so prompt for it only when it is not already set.
import os
from getpass import getpass

if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

In [3]:
# All the imports that we'll need to create the following:
#  - An indexing pipeline that stores documents from our chosen dataset in a document store.
#  - A retrieval pipeline that uses a query to retrieve relevant documents from the document store.
import json
from typing import List, Dict
from collections import defaultdict
from pathlib import Path
import random
from datasets import load_dataset, Dataset
from tqdm import tqdm

from haystack import Document, Pipeline
from haystack.components.builders import AnswerBuilder, PromptBuilder
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers import (
    InMemoryEmbeddingRetriever,
    InMemoryBM25Retriever,
)
from haystack.components.writers import DocumentWriter

from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy, DocumentStore
from haystack_experimental.evaluation.harness.rag import (
    RAGEvaluationHarness,
    RAGEvaluationMetric,
    RAGEvaluationInput,
    RAGEvaluationOutput,
    RAGEvaluationOverrides,
)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


## Dataset preparation

The following steps load the SQuAD dataset, preprocess it for the indexing pipeline, and store the result in a local folder (`transformed_squad/`) in the current working directory.

In [4]:
# Helper functions to load the SQUAD dataset.
def aggregate_wiki_title(data: Dataset, agg_wiki_title: Dict[str, Dict[str, List[str]]]):
    """Group the rows of ``data`` by article title, updating ``agg_wiki_title`` in place.

    Rows are read through ``data.iter(batch_size=1)``, so every field is stored
    in its batched form (the title is unwrapped with ``[0]`` to obtain the key).

    Args:
        data: Source of rows exposing ``title``, ``context``, ``question`` and
            ``answers`` fields.
        agg_wiki_title: Mapping from article title to a dict with ``"context"``
            and ``"question_answers"`` lists. A context is appended only if it
            is not already listed for that title; every question/answers pair
            is appended.
    """
    for batch in data.iter(batch_size=1):
        entry = agg_wiki_title[batch["title"][0]]
        known_contexts = entry["context"]
        if batch["context"] not in known_contexts:
            known_contexts.append(batch["context"])
        entry["question_answers"].append(
            {"question": batch["question"], "answers": batch["answers"]}
        )

def load_transformed_squad():
    """Load the questions and article paragraphs stored under ``transformed_squad/``.

    Returns:
        A ``(questions, documents)`` tuple.

        ``questions`` holds one dict per line of
        ``transformed_squad/questions.jsonl``; each dict additionally gets a
        ``"query_id"`` of the form ``query_<line index>``.

        ``documents`` holds one ``Document`` per non-blank line of every file
        found under ``transformed_squad/articles/``. The file name without its
        trailing ``.txt`` extension is stored as ``meta["name"]``.
    """
    with open("transformed_squad/questions.jsonl", "r") as f:
        questions = [json.loads(line) for line in f]
    for idx, question in enumerate(questions):
        question["query_id"] = f"query_{idx}"

    # Walk the article directory and turn every paragraph (line) into a Document.
    documents = []
    for root, _dirs, files in os.walk("transformed_squad/articles/"):
        for article in files:
            # Strip only the trailing extension: str.replace would also remove
            # a ".txt" that happens to occur inside the article title.
            name = article[: -len(".txt")] if article.endswith(".txt") else article
            with open(os.path.join(root, article), "r") as f:
                raw_texts = f.read().split("\n")
            for text in raw_texts:
                # Blank lines would otherwise become empty-content Documents
                # that are embedded and indexed like real paragraphs (and are
                # presumably behind the repeated "ID ... already exists"
                # messages emitted while indexing).
                if not text.strip():
                    continue
                documents.append(Document(content=text, meta={"name": name}))

    return questions, documents

In [5]:
# Load both SQuAD splits and group their rows by Wikipedia article title.
data_train = load_dataset("squad", split="train")
data_validation = load_dataset("squad", split="validation")
agg_wiki_title = defaultdict(
    lambda: {"context": [], "question_answers": [], "text": ""}
)
aggregate_wiki_title(data_train, agg_wiki_title)
aggregate_wiki_title(data_validation, agg_wiki_title)

# Merge the contexts of each article into a single newline-separated text.
# Contexts are stored in their batched form, hence the [0].
for article in tqdm(agg_wiki_title.keys()):
    agg_wiki_title[article]["text"] = "\n".join(
        [x[0] for x in agg_wiki_title[article]["context"]]
    )

# Create the output directories once, instead of once per article.
squad_dir = Path("transformed_squad/")
articles_dir = Path("transformed_squad/articles/")
articles_dir.mkdir(parents=True, exist_ok=True)

# Write one text file per article.
for article, entry in agg_wiki_title.items():
    with open(f"{str(articles_dir)}/{article}.txt", "w") as f:
        f.write(entry["text"])

# Write one JSON line per question/answers pair, tagged with its article.
# A dedicated path variable is used so that `questions` only ever refers to
# the loaded list of question dicts.
with open(f"{str(squad_dir)}/questions.jsonl", "w") as f:
    for article, entry in agg_wiki_title.items():
        for qa in entry["question_answers"]:
            record = {
                "question": qa["question"][0],
                "document": article,
                "answers": qa["answers"][0],
            }
            f.write(json.dumps(record) + "\n")

questions, documents = load_transformed_squad()

100%|██████████| 490/490 [00:00<00:00, 66949.28it/s]


## Indexing pipeline

In [6]:
# Helper function to create a pipeline that indexes the documents in the document store.
def indexing(documents: List[Document]) -> InMemoryDocumentStore:
    """Embed ``documents`` and write them into a new in-memory document store.

    The ingestion pipeline has two components: a sentence-transformers document
    embedder (``all-MiniLM-L6-v2``) feeding a writer that uses
    ``DuplicatePolicy.SKIP``.

    Args:
        documents: Documents to embed and store.

    Returns:
        The ``InMemoryDocumentStore`` the pipeline wrote to.
    """
    store = InMemoryDocumentStore()

    pipeline = Pipeline()
    pipeline.add_component(
        name="doc_embedder",
        instance=SentenceTransformersDocumentEmbedder(
            model="sentence-transformers/all-MiniLM-L6-v2"
        ),
    )
    pipeline.add_component(
        name="doc_writer",
        instance=DocumentWriter(document_store=store, policy=DuplicatePolicy.SKIP),
    )
    # The embedder's output documents are the writer's input.
    pipeline.connect("doc_embedder.documents", "doc_writer.documents")

    pipeline.run({"doc_embedder": {"documents": documents}})
    return store

In [7]:
document_store = indexing(documents)



Batches:   0%|          | 0/662 [00:00<?, ?it/s]

ID '30fa23c3869483ff28c01e6b82ff83b2bba892e2f0805f907b15f7cac4ea6d39' already exists
ID '30fa23c3869483ff28c01e6b82ff83b2bba892e2f0805f907b15f7cac4ea6d39' already exists
ID '2ed2b9d255372bddd91cee3856f43ee0730d8494c960d7ec3c5ab72d1a2b9817' already exists
ID '2ed2b9d255372bddd91cee3856f43ee0730d8494c960d7ec3c5ab72d1a2b9817' already exists
ID '4899256109be91b0cf3e9c9c59020795ad450302840e41882277c0aff619c00e' already exists
ID '09b0f049b862ece73b82870fc4513c3f38cd200f96aba7f437a62dcf1fa62241' already exists
ID 'c87db560490e592c8d0461ebcd8d5fe91c5bec2a5287d5790df812ea02e05362' already exists
ID 'c87db560490e592c8d0461ebcd8d5fe91c5bec2a5287d5790df812ea02e05362' already exists
ID 'c87db560490e592c8d0461ebcd8d5fe91c5bec2a5287d5790df812ea02e05362' already exists
ID 'c87db560490e592c8d0461ebcd8d5fe91c5bec2a5287d5790df812ea02e05362' already exists
ID 'c87db560490e592c8d0461ebcd8d5fe91c5bec2a5287d5790df812ea02e05362' already exists
ID 'c87db560490e592c8d0461ebcd8d5fe91c5bec2a5287d5790df812ea02e05

## Retrieval pipeline


In [8]:
# Helper function to create an embedding-based RAG pipeline.
def build_emb_rag_pipeline(document_store: InMemoryDocumentStore, top_k: int = 2) -> Pipeline:
    """Assemble a RAG pipeline that retrieves documents by embedding similarity.

    Components, in order: ``query_embedder`` -> ``retriever`` ->
    ``prompt_builder`` -> ``generator`` -> ``answer_builder``. The retriever's
    output is also wired into the answer builder.

    Args:
        document_store: Store the retriever reads from.
        top_k: Value passed to the retriever as ``top_k``.

    Returns:
        The connected, not yet run, pipeline.
    """
    template = """
        You have to answer the following question based on the given context information only.

        Context:
        {% for document in documents %}
            {{ document.content }}
        {% endfor %}

        Question: {{question}}
        Answer:
        """

    components = {
        "query_embedder": SentenceTransformersTextEmbedder(
            model="sentence-transformers/all-MiniLM-L6-v2"
        ),
        "retriever": InMemoryEmbeddingRetriever(document_store, top_k=top_k),
        "prompt_builder": PromptBuilder(template=template),
        "generator": OpenAIGenerator(model="gpt-3.5-turbo"),
        "answer_builder": AnswerBuilder(),
    }
    connections = [
        ("query_embedder", "retriever.query_embedding"),
        ("retriever", "prompt_builder.documents"),
        ("prompt_builder", "generator"),
        ("generator.replies", "answer_builder.replies"),
        ("generator.meta", "answer_builder.meta"),
        ("retriever", "answer_builder.documents"),
    ]

    rag_pipeline = Pipeline()
    for component_name, component in components.items():
        rag_pipeline.add_component(component_name, component)
    for sender, receiver in connections:
        rag_pipeline.connect(sender, receiver)

    return rag_pipeline

In [9]:
# Helper function to create a keyword-based RAG pipeline.
def build_keyword_rag_pipeline(document_store: InMemoryDocumentStore, top_k: int = 2) -> Pipeline:
    """Assemble a RAG pipeline that retrieves documents with a BM25 retriever.

    Components, in order: ``retriever`` -> ``prompt_builder`` -> ``generator``
    -> ``answer_builder``. The retriever's output is also wired into the
    answer builder.

    Args:
        document_store: Store the retriever reads from.
        top_k: Value passed to the retriever as ``top_k``.

    Returns:
        The connected, not yet run, pipeline.
    """
    template = """
        You have to answer the following question based on the given context information only.

        Context:
        {% for document in documents %}
            {{ document.content }}
        {% endfor %}

        Question: {{question}}
        Answer:
        """

    components = {
        "retriever": InMemoryBM25Retriever(document_store, top_k=top_k),
        "prompt_builder": PromptBuilder(template=template),
        "generator": OpenAIGenerator(model="gpt-3.5-turbo"),
        "answer_builder": AnswerBuilder(),
    }
    connections = [
        ("retriever", "prompt_builder.documents"),
        ("prompt_builder", "generator"),
        ("generator.replies", "answer_builder.replies"),
        ("generator.meta", "answer_builder.meta"),
        ("retriever", "answer_builder.documents"),
    ]

    rag_pipeline = Pipeline()
    for component_name, component in components.items():
        rag_pipeline.add_component(component_name, component)
    for sender, receiver in connections:
        rag_pipeline.connect(sender, receiver)

    return rag_pipeline

In [10]:
# Build both RAG variants on top of the same document store, with the same
# number of retrieved documents.
RETRIEVER_TOP_K = 2
emb_rag_pipeline = build_emb_rag_pipeline(
    document_store=document_store, top_k=RETRIEVER_TOP_K
)
keyword_rag_pipeline = build_keyword_rag_pipeline(
    document_store=document_store, top_k=RETRIEVER_TOP_K
)

## Evaluation harness

The RAG evaluation harness comes with a predefined set of evaluation metrics, which are enumerated in the `RAGEvaluationMetric` enum. 

The `RAGEvaluationHarness` class comes with default initialization functions that can be used with RAG pipelines that use typical names/identifiers for their components.

In [11]:
# Create one evaluation harness per RAG pipeline; both use the same metrics.
eval_metrics = {
    RAGEvaluationMetric.DOCUMENT_MAP,
    RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
    RAGEvaluationMetric.ANSWER_FAITHFULNESS,
}

# Each harness receives its own copy of the metric set.
emb_eval_harness = RAGEvaluationHarness.default_with_embedding_retriever(
    emb_rag_pipeline, metrics=set(eval_metrics)
)
keyword_eval_harness = RAGEvaluationHarness.default_with_keyword_retriever(
    keyword_rag_pipeline, metrics=set(eval_metrics)
)

In [12]:
# Initialize the inputs to the evaluation harness.
# These inputs are automatically passed to the RAG pipeline
# and to the evaluation pipeline that the harness uses internally.

# Sample with a fixed seed so that re-running the notebook evaluates the same
# questions; a private Random instance leaves the global random state alone.
NUM_EVAL_QUESTIONS = 10
SAMPLING_SEED = 42
input_questions = random.Random(SAMPLING_SEED).sample(
    questions, min(NUM_EVAL_QUESTIONS, len(questions))
)

# The query text feeds the harness, the prompt builder and the answer builder.
query_texts = [q["question"] for q in input_questions]

eval_harness_input = RAGEvaluationInput(
    queries=list(query_texts),
    # Only the first listed answer text of each question is used as ground truth.
    ground_truth_answers=[q["answers"]["text"][0] for q in input_questions],
    # Every stored document of the question's source article counts as ground truth.
    ground_truth_documents=[
        [
            doc
            for doc in document_store.storage.values()
            if doc.meta["name"] == q["document"]
        ]
        for q in input_questions
    ],
    additional_rag_inputs={
        "prompt_builder": {"question": list(query_texts)},
        "answer_builder": {"query": list(query_texts)},
    },
)

In [13]:
# Evaluate the embedding-based RAG pipeline on the inputs prepared above.
emb_eval_run = emb_eval_harness.run(
    run_name="emb_eval_run",
    inputs=eval_harness_input,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:12<00:00,  1.24s/it]


In [14]:
# Print the main attributes of the evaluation run output, one per line.

run_summary = {
    "Results of the evaluation run": emb_eval_run.results.run_name,
    "Serialized RAG pipeline": emb_eval_run.evaluated_pipeline,
    "Serialized evaluation pipeline": emb_eval_run.evaluation_pipeline,
    "Inputs": emb_eval_run.inputs,
}
for label, value in run_summary.items():
    print(f"{label}: {value}")

Results of the evaluation run: emb_eval_run
Serialized RAG pipeline: components:
  answer_builder:
    init_parameters:
      pattern: null
      reference_pattern: null
    type: haystack.components.builders.answer_builder.AnswerBuilder
  generator:
    init_parameters:
      api_base_url: null
      api_key:
        env_vars:
        - OPENAI_API_KEY
        strict: true
        type: env_var
      generation_kwargs: {}
      model: gpt-3.5-turbo
      streaming_callback: null
      system_prompt: null
    type: haystack.components.generators.openai.OpenAIGenerator
  prompt_builder:
    init_parameters:
      required_variables: null
      template: "\n        You have to answer the following question based on the\
        \ given context information only.\n\n        Context:\n        {% for document\
        \ in documents %}\n            {{ document.content }}\n        {% endfor %}\n\
        \n        Question: {{question}}\n        Answer:\n        "
      variables: null
    typ

In [15]:
# One score per metric for the embedding-based run.
print("Evaluation score report:")
emb_score_report = emb_eval_run.results.score_report()
emb_score_report

Evaluation score report:


Unnamed: 0,metrics,score
0,metric_doc_recall_single,0.7
1,metric_answer_faithfulness,0.7
2,metric_doc_map,0.6


In [16]:
# Per-question scores for the embedding-based run.
print("Evaluation score dataframe:")
emb_scores_frame = emb_eval_run.results.to_pandas()
emb_scores_frame

Evaluation score dataframe:


Unnamed: 0,questions,contexts,responses,metric_doc_recall_single,metric_answer_faithfulness,metric_doc_map
0,Upon what are kings of Scots coronated?,"[Normans came into Scotland, building castles ...",The kings of Scots are coronated on the Stone ...,0.0,1.0,0.0
1,Where is the energy stored by a capacitor loca...,[A capacitor (originally known as a condenser)...,The energy stored by a capacitor is located in...,1.0,1.0,0.5
2,What did these gardeners do about unwanted spe...,[Forest gardening was also being used as a foo...,"The gardeners identified, protected, and impro...",1.0,1.0,1.0
3,What is one type of Benedictine order that was...,[The Catholic Church prevailed across Europe a...,One type of Benedictine order that was common ...,1.0,0.0,1.0
4,When did work on the ASCII standard begin?,[ASCII developed from telegraphic codes. Its f...,"Work on the ASCII standard began on October 6,...",1.0,0.0,1.0
5,Where did Africans escape and mate with naitves?,[Numerous communities of dark-skinned peoples ...,Africans escaped and mated with natives in pre...,0.0,1.0,0.0
6,What direction has Europe moved towards?,[Modern historiography on the period has reach...,Europe has moved towards an era characterized ...,0.0,1.0,0.0
7,What was the Office of Special Operations init...,"[US army general Hoyt Vandenberg, the CIG's se...",The initial budget of the Office of Special Op...,1.0,0.0,1.0
8,What is the large art school in Mexico City?,"[During the 19th century, an important produce...",The large art school in Mexico City is the Esc...,1.0,1.0,0.5
9,When did the Hounslow Heath Aerodrome begin to...,"[Following the war, some of these military air...",Hounslow Heath Aerodrome began to operate sche...,1.0,1.0,1.0


In [17]:
# Evaluate again on the same inputs, overriding only the generator's model.
generator_overrides = {"generator": {"model": "gpt-4-turbo"}}
overrides = RAGEvaluationOverrides(rag_pipeline=generator_overrides)
emb_eval_run_gpt4 = emb_eval_harness.run(
    run_name="emb_eval_run_gpt4",
    inputs=eval_harness_input,
    overrides=overrides,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:12<00:00,  1.25s/it]


In [18]:
# Put the per-question scores of both runs side by side.
print("Comparison of the two evaluation runs:")
run_comparison = emb_eval_run.results.comparative_individual_scores_report(
    emb_eval_run_gpt4.results
)
run_comparison

Comparison of the two evaluation runs:


Unnamed: 0,questions,contexts,responses,emb_eval_run_metric_doc_recall_single,emb_eval_run_metric_answer_faithfulness,emb_eval_run_metric_doc_map,emb_eval_run_gpt4_metric_doc_recall_single,emb_eval_run_gpt4_metric_answer_faithfulness,emb_eval_run_gpt4_metric_doc_map
0,Upon what are kings of Scots coronated?,"[Normans came into Scotland, building castles ...",The kings of Scots are coronated on the Stone ...,0.0,1.0,0.0,0.0,1.0,0.0
1,Where is the energy stored by a capacitor loca...,[A capacitor (originally known as a condenser)...,The energy stored by a capacitor is located in...,1.0,1.0,0.5,1.0,1.0,0.5
2,What did these gardeners do about unwanted spe...,[Forest gardening was also being used as a foo...,"The gardeners identified, protected, and impro...",1.0,1.0,1.0,1.0,1.0,1.0
3,What is one type of Benedictine order that was...,[The Catholic Church prevailed across Europe a...,One type of Benedictine order that was common ...,1.0,0.0,1.0,1.0,0.0,1.0
4,When did work on the ASCII standard begin?,[ASCII developed from telegraphic codes. Its f...,"Work on the ASCII standard began on October 6,...",1.0,0.0,1.0,1.0,0.0,1.0
5,Where did Africans escape and mate with naitves?,[Numerous communities of dark-skinned peoples ...,Africans escaped and mated with natives in pre...,0.0,1.0,0.0,0.0,1.0,0.0
6,What direction has Europe moved towards?,[Modern historiography on the period has reach...,Europe has moved towards an era characterized ...,0.0,1.0,0.0,0.0,1.0,0.0
7,What was the Office of Special Operations init...,"[US army general Hoyt Vandenberg, the CIG's se...",The initial budget of the Office of Special Op...,1.0,0.0,1.0,1.0,0.0,1.0
8,What is the large art school in Mexico City?,"[During the 19th century, an important produce...",The large art school in Mexico City is the Esc...,1.0,1.0,0.5,1.0,1.0,0.5
9,When did the Hounslow Heath Aerodrome begin to...,"[Following the war, some of these military air...",Hounslow Heath Aerodrome began to operate sche...,1.0,1.0,1.0,1.0,1.0,1.0


In the above code, we've primarily focused on using the `default_xxx` methods of the `RAGEvaluationHarness` class. They provide a straightforward way of getting started with the evaluation of simple RAG pipelines which use prototypical components. The harness can also be used to evaluate arbitrarily complex RAG pipelines. This is done by providing the harness with some extra metadata about the pipeline to be evaluated.

To use an arbitrary pipeline with the harness, the latter requires information about the following components (cf. `RAGExpectedComponent`):
- Query processor - Component that processes the input query. 
    - Expects one input that contains the query string.
- Document retriever - Component that retrieves documents based on the input query.
    - Expects one output that contains the retrieved documents.
- Response generator - Component that generates responses based on the query and the retrieved documents.
    - Expects one output that contains the LLM's response(s).

For each of the above, the user needs to provide the following metadata (cf. `RAGExpectedComponentMetadata`):
- The name of the component as seen in the pipeline.
- A mapping of the component's expected inputs to their corresponding input names.
- A mapping of the component's expected outputs to their corresponding output names.

For example, let's consider `RAGExpectedComponent.QUERY_PROCESSOR`: Assume we have a RAG pipeline with an [`OpenAITextEmbedder`](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/embedders/openai_text_embedder.py#L18) component called `"txt_embedder"`. Since the harness is responsible for passing the pipeline's input (the query) to the `OpenAITextEmbedder`, it needs to know the name of the component. Furthermore, it also needs to know the [name of `OpenAITextEmbedder`'s input](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/embedders/openai_text_embedder.py#L135) through which the query should be supplied. The metadata for the above looks thus:
```python
query_processor_metadata = RAGExpectedComponentMetadata(
    name="txt_embedder",
    input_mapping={
        "query": "text"
    }
)
```
Similarly, for `RAGExpectedComponent.DOCUMENT_RETRIEVER`: Assume the RAG pipeline has an [`InMemoryEmbeddingRetriever`](https://github.com/deepset-ai/haystack/blob/0ceeb733baabe2b3658ee7065c4441a632ef465d/haystack/components/retrievers/in_memory/embedding_retriever.py#L12) component named `"mem_retriever"` and is connected to `"txt_embedder"`.
```python
document_retriever_metadata = RAGExpectedComponentMetadata(
    name="mem_retriever",
    output_mapping={
        "retrieved_documents": "documents"
    }
)
```
Both `"query"` and `"retrieved_documents"` are "meta" identifiers used by the harness to specify expected inputs and outputs - They are specific to each `RAGExpectedComponent` enum variant and are documented in their docstrings.

In [19]:
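# Create a harness to evaluate a custom RAG pipeline.
# Commented out because `custom_rag_pipeline` is not defined in this notebook.
# `RAGExpectedComponent` and `RAGExpectedComponentMetadata` are also not imported
# above and would need to be imported before uncommenting.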

# custom_eval_harness = RAGEvaluationHarness(
#     rag_pipeline=custom_rag_pipeline,
#     rag_components={
#         RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata(
#             "query_embedder", input_mapping={"query": "text"}
#         ),
#         RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata(
#             "retriever",
#             output_mapping={"retrieved_documents": "documents"},
#         ),
#         RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata(
#             "generator", output_mapping={"replies": "replies"}
#         ),
#     },
#     metrics={
#         RAGEvaluationMetric.DOCUMENT_MAP,
#         RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT,
#         RAGEvaluationMetric.ANSWER_FAITHFULNESS
#     })

There is no strict requirement when it comes to which components can act as a query processor, a document retriever or a response generator. For instance, it's perfectly fine if the query processor and the document retriever are the same component. In fact, this is the case when using a keyword-based retriever which directly accepts the query (as opposed to having a query embedder in front of it).