In [1]:
import warnings
warnings.filterwarnings("ignore")

## 1. Install required python libraries

In [2]:
!pip install -q llama-index openai python-dotenv

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m720.4/720.4 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.9/265.9 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.4/303.4 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.3/129.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Import Packages and set the environment vars

In [3]:
import nest_asyncio
import random
import openai
import time
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.prompts import PromptTemplate
from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from google.colab import userdata

nest_asyncio.apply()
openai.api_key = userdata.get('OPENAI_API_KEY') # You can set your keys directly

## 3. Open up an interface for uploading your internal documents

In [4]:
import os
from google.colab import files
uploaded = files.upload()
os.makedirs("/content/data", exist_ok=True)

# Move uploaded file to data folder
for fname in uploaded.keys():
    os.rename(fname, f"/content/data/{fname}")

Saving axis-credit-cards.pdf to axis-credit-cards.pdf


## 4. Read and parse all documents

In [5]:
from llama_index.core import SimpleDirectoryReader
data_dir = "/content/data"
documents = SimpleDirectoryReader(data_dir).load_data()

# Inspect the loaded document
print(f"Loaded {len(documents)} document(s)")
print(documents[0].text[:500])  # Print first 500 chars


Loaded 60 document(s)
Axis Bank|Public 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MOST IMPORTANT 
TERMS AND CONDITIONS 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 


## 5. Create a dataset with factual questions for evaluation

#### Prepares 25 QA-style evaluation samples from the top 20 documents using LLM-generated questions

In [6]:
num_eval_questions = 25

eval_documents = documents[0:20]
data_generator = DatasetGenerator.from_documents(eval_documents)
eval_questions = data_generator.generate_questions_from_nodes()
k_eval_questions = random.sample(eval_questions, num_eval_questions)

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


## 6. Initialize GPT-4 Model and Faithfulness Evaluator

#### 6.1 Setup GPT4

In [7]:
# We will use GPT-4 for evaluating the responses
gpt4 = OpenAI(temperature=0, model="gpt-4o")

# Set appropriate settings for the LLM
Settings.llm = gpt4

# Define Faithfulness Evaluators which are based on GPT-4
faithfulness_gpt4 = FaithfulnessEvaluator()

#### 6.2 Setup Faithfulness Evaluator

In [8]:
faithfulness_new_prompt_template = PromptTemplate(
    """Please tell if a given piece of information is directly supported by the context.
        You need to answer with either YES or NO.
        Answer YES if any part of the context explicitly supports the information,
        even if most of the context is unrelated.
        If the context does not explicitly support the information, answer NO.
        Some examples are provided below.

        Information: The annual fee for the Flipkart Axis Bank Credit Card is INR 500.
        Context: FLIPKART AXIS BANK Credit Card: 500 Joining Fee, 500 Annual Fee
        (Waived on spends of INR 3,50,000 in the preceding card anniversary year).
        Answer: YES
        Information: Axis Bank charges a joining fee of INR 1,000 for all credit cards.
        Context: JOINING, ANNUAL and ADD-ON CARD FEE — Credit cards have varying fees.
        For example, Flipkart Axis Bank Credit Card charges INR 500 as joining fee,
        Axis Bank Infinite Credit Card charges Nil, and Magnus Credit Card charges INR 12,500.
        Answer: NO

        Information: Fuel surcharge refund is available for transactions
        between INR 400 and INR 4,000 on certain cards.
        Context: Fuel Transaction Surcharge: Refunded for fuel transactions
        Between INR 400 to INR 4,000. Maximum benefits up to INR 400 per Statement Cycle;
        applicable on certain cards like ACE, My Zone, and Airtel Axis Bank Credit Cards.
        Answer: YES

        Information: Magnus Credit Card offers unlimited domestic airport lounge access.
        Context: The document includes detailed charges and fee structures,
        but does not mention lounge access benefits of Magnus Credit Card.
        Answer: NO

        Information: The Axis Bank Reserve Credit Card annual fee is waived on spends
        of INR 35,00,000 in the previous year.
        Context: RESERVE Credit Card: 50,000 Annual Fee (Waived on eligible
         spends of INR 35,00,000
        in the preceding card anniversary year).
        Answer: YES

        Information: Axis Bank does not charge any foreign currency markup for the Magnus Credit Card.
        Context: Foreign Currency Transaction Fee: 2% of the transaction value
        for MAGNUS Credit Card and MAGNUS FOR BURGUNDY Credit Card.
        Answer: NO

        Information: {query_str}
        Context: {context_str}
Answer:
""")
# Update the prompts dictionary with the new prompt template
faithfulness_gpt4.update_prompts({"your_prompt_key": faithfulness_new_prompt_template})

# Define Relevancy Evaluators which are based on GPT-4
relevancy_gpt4 = RelevancyEvaluator()

## 7. Build Query Engine for a given chink size

In [9]:
def build_query_engine(chunk_size, documents):
    """
    Builds a vector-based query engine using a specified chunk size.

    Parameters:
    chunk_size (int): Size of text chunks to index.
    documents (List[Document]): The documents to index.

    Returns:
    query_engine: A configured query engine ready to handle questions.
    """
    llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
    Settings.llm = llm
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_size // 5

    vector_index = VectorStoreIndex.from_documents(documents)
    query_engine = vector_index.as_query_engine(similarity_top_k=5)
    return query_engine

## 8. Evaluate Response Time and Accuracy Using the Query Engine

In [10]:
def evaluate_response_time_and_accuracy(query_engine, eval_questions):
    """
    Evaluate the average response time, faithfulness, and relevancy of responses
    from a given query engine.

    Parameters:
    query_engine: A configured query engine.
    eval_questions (List[str]): A list of questions to query the engine with.

    Returns:
    tuple: Average response time, average faithfulness score, average relevancy score.
    """
    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0
    num_questions = len(eval_questions)

    for question in eval_questions:
        start_time = time.time()
        response = query_engine.query(question)
        elapsed_time = time.time() - start_time

        faithfulness_result = faithfulness_gpt4.evaluate_response(response=response).passing
        relevancy_result = relevancy_gpt4.evaluate_response(query=question, response=response).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    return (
        total_response_time / num_questions,
        total_faithfulness / num_questions,
        total_relevancy / num_questions,
    )

## 9. Iterates over different chunk_sizes (128, 256, 512 tokens) and evaluates how each size affects the performance of a RAG system. For each chunk size, it:

1. Builds a vector index using that chunk size
2. Generates answers to a set of evaluation questions
3. Measures:

*   Average response time (in seconds)
*   Average faithfulness score (how factually grounded the answers are)
*   Average relevancy score (how relevant the answers are to the queries)







In [11]:
chunk_sizes = [128, 256, 512]

for chunk_size in chunk_sizes:
    query_engine = build_query_engine(chunk_size, eval_documents)
    avg_response_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(query_engine, k_eval_questions)
    print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, \
    Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

Chunk size 128 - Average Response time: 1.40s,     Average Faithfulness: 0.88, Average Relevancy: 0.80
Chunk size 256 - Average Response time: 1.51s,     Average Faithfulness: 0.76, Average Relevancy: 0.72
Chunk size 512 - Average Response time: 1.53s,     Average Faithfulness: 0.92, Average Relevancy: 0.88


## Observations on Chunk Size Impact
#### Larger chunks improve faithfulness
→ Chunk size 512 produced the most factually grounded answers (0.88 faithfulness), likely because more contextual information was available per chunk.

#### Relevancy peaked at a medium chunk size
→ Chunk size 256 achieved the highest relevancy score (0.80), suggesting a balanced trade-off between context richness and noise.

#### Response time increases slightly with chunk size
→ From 1.32s (128) to 1.39s (512), the average response time shows a modest rise, which may scale further with larger inputs or more complex queries.