# Package Installation and Imports

The cell below installs the core packages required to run this notebook; a few additional packages are installed in later cells, right before they are first needed.


In [None]:
# Install required packages
!pip install llama-index openai python-dotenv

In [None]:
import sys
from pathlib import Path

# 1. Define the directory *containing* the all_rag_techniques package.
# __file__ is not reliably available inside a notebook, so the kernel's working
# directory is used instead; this assumes the notebook is run from inside
# all_rag_techniques/.
current_dir = Path.cwd()

# The directory containing 'all_rag_techniques' is the parent directory
project_root = current_dir.parent

# 2. Add this root to the system path (only once, so re-running the cell is harmless)
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
    print(f"Added project root to path: {project_root}")
else:
    print("Project root already in path.")

# 3. Import the helpers, then run them.
# The import and the setup calls are guarded separately so that a failure inside
# setup_environment()/check_keys() is not misreported as an import failure.
try:
    from all_rag_techniques import setup_environment, check_keys
except Exception as e:
    print(f"❌ Final import failed: {e}")
else:
    print("✅ Package imported successfully!")
    try:
        setup_environment()
        check_keys()
    except Exception as e:
        print(f"❌ Environment setup failed: {e}")

Added project root to path: /Users/ruhwang/Desktop/AI/my_projects/context-engineering/advanced-rag
✅ Package imported successfully!
LANGCHAIN_API_KEY not set (empty in .env file)
Environment setup complete!
=== API Keys from config.py ===
  GROQ_API_KEY: Loaded
  COHERE_API_KEY: Loaded
  OPENAI_API_KEY: Loaded
  LANGCHAIN_API_KEY: Missing

=== Environment Variables ===
  os.environ['GROQ_API_KEY']: Set
  os.environ['COHERE_API_KEY']: Set

All essential keys loaded!


In [None]:
!pip install llama-index

In [8]:
# Standard library
import random
import time
import os

# Apply the nest_asyncio patch before the LlamaIndex imports below, as in the
# original cell ordering.
import nest_asyncio

nest_asyncio.apply()

# Third-party
from dotenv import load_dotenv
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
)
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.openai import OpenAI
import openai

# Load variables from a .env file into the process environment, then hand the
# OpenAI key (None when the variable is unset) to the openai client module.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")




### Read Docs

In [9]:
# Directory holding the source documents, relative to the notebook's working directory
data_dir = "../data"

# Read the files under data_dir into a list of documents
reader = SimpleDirectoryReader(input_dir=data_dir)
documents = reader.load_data()

2025-10-23 17:33:29,867 - INFO - NumExpr defaulting to 12 threads.


### Create evaluation questions and pick k out of them

In [None]:
!pip install spacy

In [12]:
# Upper bound on how many questions are evaluated, and a fixed seed so the same
# subset is drawn from a given question list on every run.
num_eval_questions = 25
eval_sample_seed = 42

# Generate candidate questions from the first 20 documents
eval_documents = documents[0:20]
data_generator = DatasetGenerator.from_documents(eval_documents)
eval_questions = data_generator.generate_questions_from_nodes()

# random.sample raises ValueError when asked for more items than the population
# holds, so cap the sample size at the number of questions actually generated.
sample_size = min(num_eval_questions, len(eval_questions))
k_eval_questions = random.Random(eval_sample_seed).sample(eval_questions, sample_size)

  return cls(
2025-10-23 17:34:05,995 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:34:06,092 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:34:06,141 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:34:06,152 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:34:06,264 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:34:06,385 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:34:06,516 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:34:06,560 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:34:06,602 - INFO - HTTP Request: POST https://api.openai.com/v1/chat

### Define metric evaluators and modify the LlamaIndex faithfulness evaluator prompt to rely on the context

In [13]:
from langchain_groq import ChatGroq

In [None]:
!pip install llama-index-llms-langchain

In [22]:
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
from llama_index.core import Settings
from llama_index.core.prompts import PromptTemplate
from groq import Groq # NOTE(review): not referenced anywhere in this cell — confirm whether it is still needed

# 1. Initialize your desired LLM (ChatGroq in this case)
# Note: Ensure you have the Groq client library installed and the GROQ_API_KEY environment variable set.
llm = ChatGroq(model="openai/gpt-oss-20b", temperature=0)

# 2. Set the global LLM setting to your new LLM
# The evaluators will use this model by default unless explicitly passed an LLM instance.
Settings.llm = llm

# --- Faithfulness Evaluator Setup ---

# 3. Define the Faithfulness Evaluator
# It will use Settings.llm (which is now ChatGroq) by default
faithfulness_groq = FaithfulnessEvaluator()

# 4. Define your custom prompt template
# {query_str} receives the statement being checked and {context_str} the retrieved context.
faithfulness_new_prompt_template = PromptTemplate(""" Please tell if a given piece of information is directly supported by the context.
    You need to answer with either YES or NO.
    Answer YES if any part of the context explicitly supports the information, even if most of the context is unrelated. If the context does not explicitly support the information, answer NO. Some examples are provided below.

    Information: Apple pie is generally double-crusted.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: YES

    Information: Apple pies taste bad.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: NO

    Information: Paris is the capital of France.
    Context: This document describes a day trip in Paris. You will visit famous landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.
    Answer: NO

    Information: {query_str}
    Context: {context_str}
    Answer:

    """)

# 5. Update the prompt template
# FaithfulnessEvaluator exposes its prompts under the keys "eval_template" and
# "refine_template"; any other key (such as "default") is silently ignored, which
# would leave the stock prompt in place.
faithfulness_groq.update_prompts({"eval_template": faithfulness_new_prompt_template})

# --- Relevancy Evaluator Setup ---

# 6. Define Relevancy Evaluator
# It will also use Settings.llm (ChatGroq) by default
relevancy_groq = RelevancyEvaluator()

# Optional alternative: You can also pass the LLM instance directly to the evaluator
# relevancy_groq_direct = RelevancyEvaluator(llm=llm)

### Function to evaluate metrics for each chunk size

In [23]:
# Define function to calculate average response time, average faithfulness and average relevancy metrics for given chunk size
# Responses are generated with the Groq-hosted "openai/gpt-oss-20b" model configured inside the function
# and graded by the notebook-level faithfulness_groq / relevancy_groq evaluators.
def evaluate_response_time_and_accuracy(chunk_size, eval_questions):
    """
    Evaluate the average response time, faithfulness, and relevancy of query-engine responses for a given chunk size.

    An index is built over the notebook-level ``eval_documents`` with the requested chunk size,
    every question is sent through a query engine on that index, and each response is scored
    with the notebook-level ``faithfulness_groq`` and ``relevancy_groq`` evaluators.

    Note: this overwrites the global ``Settings.llm``, ``Settings.chunk_size`` and
    ``Settings.chunk_overlap``.

    Parameters:
    chunk_size (int): The size of data chunks being processed. The chunk overlap is set to chunk_size // 5.
    eval_questions (list): The questions to run through the query engine; must not be empty.

    Returns:
    tuple: A tuple containing the average response time (in seconds), the fraction of responses
    passing the faithfulness check, and the fraction passing the relevancy check.

    Raises:
    ValueError: If eval_questions is empty (the averages would be undefined).
    """

    # Fail fast, before any index is built or any API call is made.
    num_questions = len(eval_questions)
    if num_questions == 0:
        raise ValueError("eval_questions must contain at least one question")

    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    # configure the generation LLM and chunking, then create vector index
    llm = ChatGroq(model="openai/gpt-oss-20b", temperature=0)

    Settings.llm = llm
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_size // 5

    vector_index = VectorStoreIndex.from_documents(eval_documents)

    # build query engine
    query_engine = vector_index.as_query_engine(similarity_top_k=5)

    # Iterate over each question in eval_questions to compute metrics.
    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),
    # we're using a loop here to specifically measure response time for different chunk sizes.
    for question in eval_questions:
        # Only the query itself is timed; the evaluator calls below are excluded.
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time

        faithfulness_result = faithfulness_groq.evaluate_response(
            response=response_vector
        ).passing

        relevancy_result = relevancy_groq.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        # .passing values are summed as 0/1, so the averages below are pass rates.
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

### Test different chunk sizes 

In [24]:
# Chunk sizes to compare
chunk_sizes = [128, 256]

for chunk_size in chunk_sizes:
    # Run the full evaluation for this chunk size over the sampled questions
    metrics = evaluate_response_time_and_accuracy(chunk_size, k_eval_questions)
    avg_response_time, avg_faithfulness, avg_relevancy = metrics
    print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

2025-10-23 17:47:24,412 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-10-23 17:47:24,807 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-10-23 17:47:26,098 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-10-23 17:47:26,513 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:47:27,124 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:47:27,238 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2025-10-23 17:47:27,241 - INFO - Retrying request to /openai/v1/chat/completions in 3.000000 seconds
2025-10-23 17:47:30,913 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-23 17:47:31,321 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "H

KeyboardInterrupt: 

![](https://europe-west1-rag-techniques-views-tracker.cloudfunctions.net/rag-techniques-tracker?notebook=all-rag-techniques--choose-chunk-size)