https://www.llamaindex.ai/blog/evaluating-the-ideal-chunk-size-for-a-rag-system-using-llamaindex-6207e5d3fec5

In [1]:
from dotenv import load_dotenv
load_dotenv()
import os

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [7]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.core.evaluation import DatasetGenerator,FaithfulnessEvaluator,RelevancyEvaluator

In [3]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [4]:
import openai
import time
llm = OpenAI(model="gpt-4o-mini")

# Load Data
Let’s load our document.

In [8]:
documents = SimpleDirectoryReader("small-budget-pdf").load_data()

In [6]:
from llama_index.core.storage.docstore import SimpleDocumentStore
docstore = SimpleDocumentStore()

In [9]:
documents

[Document(id_='58b18d52-913b-4622-ba00-5c1a7dc0f165', embedding=None, metadata={'page_label': '1', 'file_name': 'budget_speech.pdf', 'file_path': 'c:\\Code\\Github\\LlamaIndex\\06.Advanced_Topics\\2.Basic Strategies\\ChunkSize\\small-budget-pdf\\budget_speech.pdf', 'file_type': 'application/pdf', 'file_size': 779225, 'creation_date': '2024-11-02', 'last_modified_date': '2024-11-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='GOVERNMENT OF INDIA\nBUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nJuly 23,  2024', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='5f2c5e0e-357f-41ef-bcf1-1ef8497bb29

# Question Generation
To select the right ```chunk_size```, we'll compute metrics like Average Response time, Faithfulness, and Relevancy for various ```chunk_sizes```. The ```DatasetGenerator``` will help us generate questions from the documents.

In [None]:
data_generator = DatasetGenerator.from_documents(
    documents=documents,
    llm=llm)

In [None]:
eval_questions = data_generator.generate_questions_from_nodes()

In [13]:
eval_questions

['Who is the Minister of Finance presenting the Budget Speech for 2024-2025?',
 'On what date was the Budget Speech delivered?',
 'What is the file size of the document containing the Budget Speech?',
 'What is the file type of the document titled "budget_speech.pdf"?',
 'When was the document "budget_speech.pdf" created?',
 'What is the last modified date of the Budget Speech document?',
 'What is the page label assigned to the document?',
 'What is the name of the government that issued the Budget Speech for 2024-2025?',
 'What is the significance of the date July 23, 2024, in relation to the Budget Speech?',
 'Where is the file "budget_speech.pdf" located on the computer?',
 'What is the file name of the document related to the budget speech?',
 'What is the total file size of the budget speech PDF?',
 'On what date was the budget speech document created?',
 'What is the file type of the document mentioned in the context?',
 'How many pages does the budget speech document contain?',

# Save the generate questions for future

In [None]:
import json

# Save as JSON
with open("./small-budget-pdf/eval_questions-small.json", "w") as file:
    json.dump(eval_questions, file, indent=4)

# Loading a small data set of question for quick testing

In [None]:
import json
# Load questions from the JSON file
with open('./small-budget-pdf/eval_questions-small.json', 'r') as file:
    eval_questions = json.load(file)

eval_questions=eval_questions[:3]
eval_questions

# Setting Up Evaluators
We are setting up the GPT-4 model to serve as the backbone for evaluating the responses generated during the experiment. Two evaluators, FaithfulnessEvaluator and RelevancyEvaluator, are initialised with the service_context .

1. **Faithfulness Evaluator** — It is useful for measuring if the response was hallucinated and measures if the response from a query engine matches any source nodes.
2. **Relevancy Evaluator** — It is useful for measuring if the query was actually answered by the response and measures if the response + source nodes match the query.

In [None]:
# We will use GPT-4 for evaluating the responses
gpt4 = OpenAI(temperature=0, model="gpt-4o-mini")

In [None]:
# Define Faithfulness and Relevancy Evaluators which are based on GPT-4
faithfulness_gpt4 = FaithfulnessEvaluator(llm=gpt4)

In [None]:
relevancy_gpt4 = RelevancyEvaluator(llm=gpt4)

# Testing single sample

In [25]:
question="What are the main themes outlined in the budget speech, and how do they relate to the overall budget priorities?"

In [30]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(chunk_size=1024)
vector_index = VectorStoreIndex.from_documents(
    documents, transformations=[splitter]
)

In [31]:
query_engine = vector_index.as_query_engine()

In [32]:
response_vector=query_engine.query(question)

In [33]:
response_vector

Response(response="The main themes outlined in the budget speech typically include economic growth, employment generation, infrastructure development, innovation, energy security, and inclusivity in economic progress. These themes relate to the overall budget priorities by emphasizing the government's commitment to fostering a robust economy, creating job opportunities, enhancing public infrastructure, promoting research and development, ensuring energy sustainability, and addressing the needs of diverse population segments. Each theme reflects strategic areas where the government aims to allocate resources and implement policies to achieve long-term economic stability and growth.", source_nodes=[NodeWithScore(node=TextNode(id_='e2007cd4-3cef-4aa3-9df6-fef25b1b8bf1', embedding=None, metadata={'file_path': 'c:\\Code\\Github\\LlamaIndex\\06.Advanced_Topics\\2.Basic Strategies\\ChunkSize\\small-budget-pdf\\eval_questions-small.txt', 'file_name': 'eval_questions-small.txt', 'file_type': 't

In [34]:
eval_result = faithfulness_gpt4.evaluate_response(response=response_vector)


In [40]:
from llama_index.core.evaluation import EvaluationResult
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Response,
)

import pandas as pd

# define jupyter display function
def display_eval_df(response: Response, eval_result: EvaluationResult) -> None:
    if response.source_nodes == []:
        print("no response!")
        return
    eval_df = pd.DataFrame(
        {
            "Response": str(response),
            "Source": response.source_nodes[0].node.text[:1000] + "...",
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)

In [41]:
display_eval_df(response_vector, eval_result)

Unnamed: 0,Response,Source,Evaluation Result,Reasoning
0,"The main themes outlined in the budget speech typically include economic growth, employment generation, infrastructure development, innovation, energy security, and inclusivity in economic progress. These themes relate to the overall budget priorities by emphasizing the government's commitment to fostering a robust economy, creating job opportunities, enhancing public infrastructure, promoting research and development, ensuring energy sustainability, and addressing the needs of diverse population segments. Each theme reflects strategic areas where the government aims to allocate resources and implement policies to achieve long-term economic stability and growth.","Who is the Minister of Finance presenting the Budget Speech for 2024-2025? On what date was the Budget Speech delivered? What is the file size of the document containing the Budget Speech? What is the file type of the document titled ""budget_speech.pdf""? When was the document ""budget_speech.pdf"" created? What is the last modified date of the Budget Speech document? What is the page label assigned to the document? What is the name of the government that issued the Budget Speech for 2024-2025? What is the significance of the date July 23, 2024, in relation to the Budget Speech? Where is the file ""budget_speech.pdf"" located on the computer? What is the file name of the document related to the budget speech? What is the total file size of the budget speech PDF? On what date was the budget speech document created? What is the file type of the document mentioned in the context? How many pages does the budget speech document contain? What is the file path where the budget speec...",Fail,NO


# Testing over whole data set ausing various chunk size

In [42]:
from llama_index.core.node_parser import SentenceSplitter

# Define function to calculate average response time, average faithfulness and average relevancy metrics for given chunk size
def evaluate_response_time_and_accuracy(chunk_size):
    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    # create vector index
    llm = OpenAI(model="gpt-4o-mini")

    splitter = SentenceSplitter(chunk_size=chunk_size)
    vector_index = VectorStoreIndex.from_documents(
    documents, transformations=[splitter]
)

    query_engine = vector_index.as_query_engine()
    num_questions = len(eval_questions)

    for question in eval_questions:
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time
        
        faithfulness_result = faithfulness_gpt4.evaluate_response(
            response=response_vector
        ).passing
        
        relevancy_result = relevancy_gpt4.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

In [43]:
for chunk_size in [256, 512, 1024, 2048]:
  avg_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size)
  print(f"Chunk size {chunk_size} - Average Response time: {avg_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

Chunk size 256 - Average Response time: 1.29s, Average Faithfulness: 0.67, Average Relevancy: 0.67
Chunk size 512 - Average Response time: 1.47s, Average Faithfulness: 0.33, Average Relevancy: 0.67
Chunk size 1024 - Average Response time: 1.23s, Average Faithfulness: 0.67, Average Relevancy: 0.67
Chunk size 2048 - Average Response time: 1.05s, Average Faithfulness: 1.00, Average Relevancy: 1.00
