In [1]:
!pip install llama-index

Collecting llama-index
  Downloading llama_index-0.10.43-py3-none-any.whl (6.8 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama-index)
  Downloading llama_index_agent_openai-0.2.7-py3-none-any.whl (12 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_cli-0.1.12-py3-none-any.whl (26 kB)
Collecting llama-index-core==0.10.43 (from llama-index)
  Downloading llama_index_core-0.10.43-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama-index)
  Downloading llama_index_embeddings_openai-0.1.10-py3-none-any.whl (6.2 kB)
Collecting llama-index-indices-managed-llama-cloud<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.1.6-py3-none-any.whl (6.7 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_i

In [2]:
from llama_index.core import SimpleDirectoryReader

In [3]:
reader = SimpleDirectoryReader(
    input_files=["/content/drive/MyDrive/policy-booklet-0923.pdf"]
)

In [4]:
docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

Loaded 44 docs


In [5]:
import os
import nest_asyncio
import openai
import time
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext,PromptTemplate
from llama_index.core.evaluation import DatasetGenerator, FaithfulnessEvaluator, RelevancyEvaluator
from llama_index.llms.openai import OpenAI
from secret_key import my_openapi_key

nest_asyncio.apply()


os.environ["OPENAI_API_KEY"] = "my_openapi_key"


In [6]:
# Generate a set of 20 questions from the first 20 pages
eval_documents = docs[:20]
data_generator = DatasetGenerator.from_documents(eval_documents)
eval_questions = data_generator.generate_questions_from_nodes(num=20)

# Use GPT-4 for evaluating the responses
gpt4 = OpenAI(temperature=0, model="gpt-4")
service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)

# Define Faithfulness and Relevancy Evaluators based on GPT-4
faithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)
relevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4)

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)
  service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)


In [11]:
def evaluate_response_time_and_accuracy(chunk_size):
    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    # Create vector index
    llm = OpenAI(model="gpt-3.5-turbo")
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=chunk_size, chunk_overlap = 50)
    vector_index = VectorStoreIndex.from_documents(
        eval_documents, service_context=service_context
    )

    query_engine = vector_index.as_query_engine()
    num_questions = len(eval_questions)

    for question in eval_questions:
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time

        faithfulness_result = faithfulness_gpt4.evaluate_response(
            response=response_vector
        ).passing

        relevancy_result = relevancy_gpt4.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

In [9]:
# Iterate over different chunk sizes to evaluate the metrics, used chunk overlap of 200 here
chunk_sizes = [256, 512, 1024, 2048]
for chunk_size in chunk_sizes:
    avg_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size)
    print(f"Chunk size {chunk_size} - Average Response time: {avg_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

  service_context = ServiceContext.from_defaults(llm=llm, chunk_size=chunk_size)


Chunk size 256 - Average Response time: 1.02s, Average Faithfulness: 0.65, Average Relevancy: 0.70
Chunk size 512 - Average Response time: 1.16s, Average Faithfulness: 0.65, Average Relevancy: 0.65
Chunk size 1024 - Average Response time: 0.98s, Average Faithfulness: 0.65, Average Relevancy: 0.65
Chunk size 2048 - Average Response time: 1.17s, Average Faithfulness: 0.70, Average Relevancy: 0.65


In [12]:
# Iterate over different chunk sizes to evaluate the metrics, chunk overlap 50
chunk_sizes = [100, 150,200]
for chunk_size in chunk_sizes:
    avg_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size)
    print(f"Chunk size {chunk_size} - Average Response time: {avg_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

  service_context = ServiceContext.from_defaults(llm=llm, chunk_size=chunk_size, chunk_overlap = 50)


Chunk size 100 - Average Response time: 0.79s, Average Faithfulness: 0.60, Average Relevancy: 0.60
Chunk size 150 - Average Response time: 0.86s, Average Faithfulness: 0.70, Average Relevancy: 0.65
Chunk size 200 - Average Response time: 0.98s, Average Faithfulness: 0.60, Average Relevancy: 0.65


Chunk size 256, with overlap of 200 provides the best score in terms of faithfulness and relevancy.