# RAG Evaluation Baseline - using LlamaIndex
LlamaIndex provides some basic evaluation of query engines! We can set up an evaluator that measures both hallucinations and whether the query was actually answered!

This is provided by two main evaluations:

- `ResponseSourceEvaluator` - uses an LLM to decide if the response is similar enough to the sources -- a good measure for hallucination detection!
- `QueryResponseEvaluator` - uses an LLM to decide if a response is similar enough to the original query -- a good measure for checking if the query was answered!

You may have noticed that we are using an LLM for this task. That means we will want to pick a powerful LLM, like GPT-4 or Claude-2.

Lastly, using these methods, we can also use the LLM to generate synthetic questions to evaluate with!

### Setting up the baseline query engine

In [1]:
import openai
import os
from dotenv import load_dotenv


# Pull variables from a local .env file (if present) before reading the key.
load_dotenv()

# Read the key once and fail fast with a readable message: assigning None to
# os.environ would otherwise raise an unhelpful "str expected" TypeError.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )

# Make the key available both to the openai module and to anything that reads
# it from the process environment.
openai.api_key = api_key
os.environ["OPENAI_API_KEY"] = api_key

In [2]:
from utils.markdown_docs_reader import MarkdownDocsReader
from llama_index.core import SimpleDirectoryReader

def load_markdown_docs(filepath):
    """Read every ``.md`` file under ``filepath`` (recursively) into documents.

    Files with any other extension are skipped, and ``.md`` files are parsed
    with ``MarkdownDocsReader``.

    Args:
        filepath: Directory to scan.

    Returns:
        The documents produced by the reader, each with its
        ``excluded_llm_metadata_keys`` set.
    """
    # Metadata keys that should not be included in the text sent to the LLM.
    # They stay on the document itself (the original note says they are still
    # useful for embeddings/retrieval).
    hidden_from_llm = ["File Name", "Content Type", "Header Path"]

    reader = SimpleDirectoryReader(
        input_dir=filepath,
        recursive=True,
        required_exts=[".md"],
        file_extractor={".md": MarkdownDocsReader()},
    )
    docs = reader.load_data()

    for document in docs:
        # Give every document its own list so they do not share one object.
        document.excluded_llm_metadata_keys = list(hidden_from_llm)

    return docs

In [3]:
## Load our documents from each folder.

# Root of the checkout that contains the `docs/` tree. The original absolute
# path is kept as the default so existing runs behave the same; set the
# LLAMA_DOCS_BASE_PATH environment variable to run this on another machine.
base_path = os.environ.get(
    "LLAMA_DOCS_BASE_PATH",
    '/Users/raoofmac/Documents/coding/learning/genai',
)

# One document list per documentation folder; each list feeds its own index.
getting_started_docs = load_markdown_docs(os.path.join(base_path, "docs/getting_started"))
community_docs = load_markdown_docs(os.path.join(base_path, "docs/community"))
data_docs = load_markdown_docs(os.path.join(base_path, "docs/core_modules/data_modules"))
agent_docs = load_markdown_docs(os.path.join(base_path, "docs/core_modules/agent_modules"))
model_docs = load_markdown_docs(os.path.join(base_path, "docs/core_modules/model_modules"))
query_docs = load_markdown_docs(os.path.join(base_path, "docs/core_modules/query_modules"))
supporting_docs = load_markdown_docs(os.path.join(base_path, "docs/core_modules/supporting_modules"))
tutorial_docs = load_markdown_docs(os.path.join(base_path, "docs/end_to_end_tutorials"))
contributing_docs = load_markdown_docs(os.path.join(base_path, "docs/development"))

In [7]:
# from llama_index.core.indices.service_context import ServiceContext, set_global_service_context
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding


## Setting up the models for llm and embedding model
# Only the LLM is configured here: Settings.llm is set to gpt-3.5-turbo with
# temperature 0.0. Everything below is commented out, so this cell changes no
# other Settings attribute.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.0)
# Optional overrides, kept for reference (uncomment to use).
# Settings.embed_model = OpenAIEmbedding(
#     model="text-embedding-3-small", embed_batch_size=100
# )
# # maximum input size to the LLM
# Settings.context_window = 4096
# # number of tokens to leave room for the LLM to generate
# Settings.num_output = 256
# NOTE(review): SentenceSplitter is not imported in this cell; add the import
# before enabling either of the two SentenceSplitter lines below.
# Settings.text_splitter = SentenceSplitter(chunk_size=1024)
# Settings.chunk_size = 512
# Settings.chunk_overlap = 20
# Settings.transformations = [SentenceSplitter(chunk_size=1024)]
## Tokenizer
# import tiktoken
# Settings.tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo").encode


In [9]:
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage

def _load_or_build_index(docs, persist_dir):
    """Return the index stored at ``persist_dir``, building it if loading fails.

    Args:
        docs: Documents to index when nothing usable is found on disk.
        persist_dir: Directory the index is loaded from and persisted to.

    Returns:
        The loaded index, or a freshly built ``VectorStoreIndex`` that has
        also been persisted to ``persist_dir``.
    """
    try:
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        return load_index_from_storage(storage_context)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit are
        # not mistaken for "index missing"; the cause is printed, not hidden.
        print(f"Index Not Found! Creating Index ({persist_dir}: {exc!r})")

    index = VectorStoreIndex.from_documents(docs)
    index.storage_context.persist(persist_dir=persist_dir)
    return index


# Each index is handled on its own, so one missing directory no longer forces
# all nine indexes to be rebuilt.
getting_started_index = _load_or_build_index(getting_started_docs, "./storage/getting_started_index")
community_index = _load_or_build_index(community_docs, "./storage/community_index")
data_index = _load_or_build_index(data_docs, "./storage/data_index")
agent_index = _load_or_build_index(agent_docs, "./storage/agent_index")
model_index = _load_or_build_index(model_docs, "./storage/model_index")
query_index = _load_or_build_index(query_docs, "./storage/query_index")
supporting_index = _load_or_build_index(supporting_docs, "./storage/supporting_index")
tutorials_index = _load_or_build_index(tutorial_docs, "./storage/tutorials_index")
contributing_index = _load_or_build_index(contributing_docs, "./storage/contributing_index")

#### Create Query Engine Tools

Since we have a lot of indices, we can create a query engine tool for each and then use them in a single query engine!

In [10]:
from llama_index.core.tools import QueryEngineTool

# Create one query engine tool per documentation folder. Each tool wraps the
# default query engine of its index; the description is the text that tells
# the LLM what the tool is good for, so spelling matters here.
getting_started_tool = QueryEngineTool.from_defaults(
    query_engine=getting_started_index.as_query_engine(),
    name="Getting Started",
    description="Useful for answering questions about installing and running llama index, as well as basic explanations of how llama index works."
)

community_tool = QueryEngineTool.from_defaults(
    query_engine=community_index.as_query_engine(),
    name="Community",
    description="Useful for answering questions about integrations and other apps built by the community."
)

data_tool = QueryEngineTool.from_defaults(
    query_engine=data_index.as_query_engine(),
    name="Data Modules",
    description="Useful for answering questions about data loaders, documents, nodes, and index structures."
)

agent_tool = QueryEngineTool.from_defaults(
    query_engine=agent_index.as_query_engine(),
    name="Agent Modules",
    description="Useful for answering questions about data agents, agent configurations, and tools."
)

# Description typo fixed: "modles" -> "models".
model_tool = QueryEngineTool.from_defaults(
    query_engine=model_index.as_query_engine(),
    name="Model Modules",
    description="Useful for answering questions about using and configuring LLMs, embedding models, and prompts."
)

query_tool = QueryEngineTool.from_defaults(
    query_engine=query_index.as_query_engine(),
    name="Query Modules",
    description="Useful for answering questions about query engines, query configurations, and using various parts of the query engine pipeline."
)

# Description typo fixed: "avaluation" -> "evaluation".
supporting_tool = QueryEngineTool.from_defaults(
    query_engine=supporting_index.as_query_engine(),
    name="Supporting Modules",
    description="Useful for answering questions about supporting modules, such as callbacks, service context, and evaluation."
)

tutorials_tool = QueryEngineTool.from_defaults(
    query_engine=tutorials_index.as_query_engine(),
    name="Tutorials",
    description="Useful for answering questions about end-to-end tutorials and giving examples of specific use-cases."
)

contributing_tool = QueryEngineTool.from_defaults(
    query_engine=contributing_index.as_query_engine(),
    name="Contributing",
    description="Useful for answering questions about contributing to llama index, including how to contribute to the codebase and how to build documentation."
)

In [16]:
import nest_asyncio
nest_asyncio.apply()

from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.response_synthesizers import get_response_synthesizer

# Collect every per-folder tool, then hand them to a single query engine.
all_tools = [
    getting_started_tool,
    community_tool,
    data_tool,
    agent_tool,
    model_tool,
    query_tool,
    supporting_tool,
    tutorials_tool,
    contributing_tool,
]

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=all_tools,
    verbose=False,
    # enable this for streaming
    # response_synthesizer=get_response_synthesizer(streaming=True),
)

### Testing the query engine

In [17]:
# Smoke test: run one question through the combined query engine and show the
# answer text (print() applies str() to the response itself).
response = query_engine.query("How do I install llama index?")
print(response)

To install llama index, you can follow these steps:
1. Clone the repository using Git: `git clone https://github.com/jerryjliu/llama_index.git`.
2. If you want to do an editable install, run `pip install -e .`.
3. If you want to install optional dependencies and dependencies used for development, run `pip install -r requirements.txt`.


## Evaluation

#### Generate the dataset

In [19]:
from llama_index.core import Document

# Read every markdown file under ../../docs (recursively); other extensions
# are skipped.
documents = SimpleDirectoryReader("../../docs", recursive=True, required_exts=[".md"]).load_data()

# Concatenate the text of all documents in load order. "".join builds the
# string in one pass instead of re-copying it on every `+=`; the resulting
# text is identical (no separator is inserted between documents).
all_text = "".join(doc.text for doc in documents)

# Wrap the combined text in a single Document for question generation.
giant_document = Document(text=all_text)

In [23]:
import os
import random
random.seed(42)

from llama_index.core import ServiceContext
from llama_index.core.prompts import Prompt
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import DatasetGenerator

# LLM context used for question generation (and reused later by the evaluator).
# Fix: the keyword every other OpenAI(...) call in this file uses is `model`;
# the previous `llm="gpt-4"` is not that parameter, so gpt-4 was presumably
# never actually selected.
# NOTE(review): the captured output shows a warning on this line, which
# suggests ServiceContext is deprecated in this llama_index version - confirm
# and migrate when the evaluator cell is updated too.
gpt4_service_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-4", temperature=0))

# Keep gpt-3.5-turbo as the global default LLM for everything else.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.0)

# Questions are cached in question_dataset.txt (one per line) so the LLM calls
# are only paid for once; delete the file to regenerate.
question_dataset = []
if os.path.exists("question_dataset.txt"):
    with open("question_dataset.txt", "r", encoding="utf-8") as f:
        # Skip blank lines so an empty trailing line is not treated as a question.
        question_dataset.extend(line.strip() for line in f if line.strip())
else:
    # generate questions
    data_generator = DatasetGenerator.from_documents(
        [giant_document],
        text_question_template=Prompt(
            "A sample from the LlamaIndex documentation is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Using the documentation sample, carefully follow the instructions below:\n"
            "{query_str}"
        ),
        question_gen_query=(
            "You are an evaluator for a search pipeline. Your task is to write a single question "
            "using the provided documentation sample above to test the search pipeline. The question should "
            "reference specific names, functions, and terms. Restrict the question to the "
            "context information provided.\n"
            "Question: "
        ),
        # use the GPT-4 context defined above for question generation
        service_context=gpt4_service_context
    )
    generated_questions = data_generator.generate_questions_from_nodes(num=3)

    # Optional: sub-sample the generated questions (random is seeded above), e.g.
    # generated_questions = random.sample(generated_questions, k)
    question_dataset.extend(generated_questions)

    print(f"Generated {len(question_dataset)} questions.")

    # save the questions!
    with open("question_dataset.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{question.strip()}\n" for question in question_dataset)

  gpt4_service_context = ServiceContext.from_defaults(llm=OpenAI(llm="gpt-4", temperature=0))
  return cls(


Generated 3 questions.


  return QueryResponseDataset(queries=queries, responses=responses_dict)


#### Evaluating Response for Hallucination

In [25]:
import time
import asyncio
import nest_asyncio
nest_asyncio.apply()

from llama_index.core import Response

def evaluate_query_engine(evaluator, query_engine, questions, batch_size=5, pause_seconds=1):
    """Query the engine in concurrent batches and score each response.

    Questions are split into consecutive, non-overlapping batches. The queries
    in a batch run concurrently via ``query_engine.aquery``; each response is
    then passed to ``evaluator.evaluate`` and counted as correct when the
    verdict contains ``"YES"``.

    Args:
        evaluator: Object with an ``evaluate(response)`` method whose result
            supports ``"YES" in result``.
        query_engine: Object with an async ``aquery(question)`` method.
        questions: Sequence of question strings.
        batch_size: Number of questions queried concurrently per batch.
        pause_seconds: Sleep after each batch to help avoid rate limits;
            pass 0 to disable.

    Returns:
        ``(total_correct, all_results)`` where ``all_results`` holds one
        0/1 score per question, in the same order as ``questions``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    async def run_query(q):
        try:
            return await query_engine.aquery(q)
        except Exception:
            # Best effort: a failed query is still scored, as a placeholder
            # response. `Exception` (not a bare except) lets cancellation and
            # Ctrl-C propagate.
            return Response(response="Error, query failed.")

    async def run_batch(batch_qs):
        # asyncio.run needs a coroutine, so the gather is awaited inside one
        # instead of passing the gather future to asyncio.run directly.
        return await asyncio.gather(*(run_query(q) for q in batch_qs))

    # Ceiling division so a final partial batch is counted in the progress line.
    total_batches = (len(questions) + batch_size - 1) // batch_size

    all_results = []
    batch_starts = range(0, len(questions), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        # Step and slice width are the same value, so batches never overlap
        # and every question is evaluated exactly once.
        batch_qs = questions[start:start + batch_size]

        responses = asyncio.run(run_batch(batch_qs))
        print(f"finished batch {batch_number} out of {total_batches}")

        # NOTE(review): this assumes evaluate() returns text containing
        # "YES"/"NO"; confirm against the installed evaluator's return type.
        all_results.extend(
            1 if "YES" in evaluator.evaluate(response) else 0
            for response in responses
        )

        # helps avoid rate limits
        if pause_seconds:
            time.sleep(pause_seconds)

    return sum(all_results), all_results

In [None]:
from llama_index.core.evaluation import ResponseEvaluator

# Build the evaluator with the GPT-4 service context, then score every
# generated question against the combined query engine.
evaluator = ResponseEvaluator(service_context=gpt4_service_context)
total_correct, all_results = evaluate_query_engine(
    evaluator=evaluator,
    query_engine=query_engine,
    questions=question_dataset,
)

print(f"Hallucination? Scored {total_correct} out of {len(question_dataset)} questions correctly.")

In [27]:
import numpy as np

# Boolean mask of the questions whose response scored 0, used to pull those
# questions out of the dataset. Requires all_results from the evaluation cell.
scored_zero = np.asarray(all_results) == 0
hallucinated_questions = np.asarray(question_dataset)[scored_zero]
print(hallucinated_questions)

NameError: name 'all_results' is not defined

Work on the rest!