<a href="https://colab.research.google.com/github/rajdeepd/bpb-vector-databases/blob/main/chapter10/evaluate_llama_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
!pip install -q llama-index

In [32]:
import nest_asyncio

nest_asyncio.apply()

from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.llms.openai import OpenAI

import os
import pandas as pd

In [67]:
from google.colab import userdata
OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')
len(OPENAI_API_KEY)

164

# **Set OpenAI api key**

In [34]:
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# **use Paul Graham Essay text or the JIG and Fixture data from the below link or any document for building RAG pipeline.**

# **Download Data**

In [35]:
!mkdir -p 'data/paul_graham/'
!curl 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -o 'data/paul_graham/paul_graham_essay.txt'

# jig book data link https://drive.google.com/file/d/1e6fezR9seps-X4IOTdxRgA86QGVoW6_d/view?usp=sharing

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    14  100    14    0     0     83      0 --:--:-- --:--:-- --:--:--    83


# **Load Data and Build Index.**

In [36]:
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
#documents = SimpleDirectoryReader("/content").load_data()


# Define an LLM
llm = OpenAI(model="gpt-4")

# Build index with a chunk_size of 512
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)
vector_index = VectorStoreIndex(nodes)

# **Build a QueryEngine and start querying.**

In [37]:
query_engine = vector_index.as_query_engine()

# By default it retrieves two similar nodes/ chunks. You can modify that in
# vector_index.as_query_engine(similarity_top_k=k).


response_vector = query_engine.query("What is the  percentage of the tight part tolerance must be applied to the tool")
response_vector_1 = query_engine.query("What is duplicate locating jigs and fixtures design")

In [38]:
response_vector

Response(response='The percentage of the tight part tolerance that must be applied to the tool is typically around 10% to ensure proper fit and functionality.', source_nodes=[NodeWithScore(node=TextNode(id_='ea181f65-20df-465e-804f-1e3e0c83aa63', embedding=None, metadata={'file_path': '/content/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 14, 'creation_date': '2025-02-01', 'last_modified_date': '2025-02-01'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b32c8d96-0fb2-4f67-adc7-7c7fb5adb6c7', node_type='4', metadata={'file_path': '/content/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 't

**print and check the response**

In [39]:
response_vector.response

'The percentage of the tight part tolerance that must be applied to the tool is typically around 10% to ensure proper fit and functionality.'

In [40]:
response_vector_1.response

'Duplicate locating jigs and fixtures design refers to the process of creating tools and equipment that aid in accurately positioning and holding workpieces during manufacturing or assembly. These jigs and fixtures are designed to ensure consistency and precision in the production of multiple identical parts by providing a reliable means of locating and securing the workpiece in the correct position.'

In [41]:
print(response_vector_1.response)

Duplicate locating jigs and fixtures design refers to the process of creating tools and equipment that aid in accurately positioning and holding workpieces during manufacturing or assembly. These jigs and fixtures are designed to ensure consistency and precision in the production of multiple identical parts by providing a reliable means of locating and securing the workpiece in the correct position.


In [42]:
# Let's check the text in each of these retrieved nodes.
# First retrieved node
#response_vector.source_nodes[0].get_text()
response_vector

Response(response='The percentage of the tight part tolerance that must be applied to the tool is typically around 10% to ensure proper fit and functionality.', source_nodes=[NodeWithScore(node=TextNode(id_='ea181f65-20df-465e-804f-1e3e0c83aa63', embedding=None, metadata={'file_path': '/content/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 14, 'creation_date': '2025-02-01', 'last_modified_date': '2025-02-01'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b32c8d96-0fb2-4f67-adc7-7c7fb5adb6c7', node_type='4', metadata={'file_path': '/content/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 't

In [43]:
# Second retrieved node
len(response_vector.source_nodes)

1

# **Question-Context Pair Generation:**

In [44]:
qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm,
    num_questions_per_chunk=2
)

100%|██████████| 1/1 [00:03<00:00,  3.20s/it]


# **Retrieval Evaluation:**

In [45]:
retriever = vector_index.as_retriever(similarity_top_k=2)

# **Mean Reciprocal Rank (MRR):**

In [46]:
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

In [47]:
# Evaluate
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [48]:
def display_results(name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    hit_rate = full_df["hit_rate"].mean()
    mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame(
        {"Retriever Name": [name], "Hit Rate": [hit_rate], "MRR": [mrr]}
    )

    return metric_df

In [49]:
display_results("OpenAI Embedding Retriever", eval_results)

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,OpenAI Embedding Retriever,1.0,1.0


# **Response Evaluation**

In [50]:
# Get the list of queries from the above created dataset

queries = list(qa_dataset.queries.values())

In [51]:
queries

["What does the error code '404' typically represent in computer networking?",
 "Can you explain some potential reasons why a 'Not Found' error might occur?"]

# **Faithfulness Evaluator**

In [52]:
# gpt-3.5-turbo
#gpt35 = OpenAI(temperature=0, model="gpt-3.5-turbo")
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
#Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
#Settings.num_output = 512
#Settings.context_window = 3900
##service_context_gpt35 = ServiceContext.from_defaults(llm=gpt35)

# gpt-3.5-turbo-16k-0613
gpt4 = OpenAI(temperature=0, model="gpt-4")
#service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)

#gpt-4
#gpt35T = OpenAI(temperature=0, model="gpt-4")
#service_context_gpt4 = ServiceContext.from_defaults(llm=gpt35T)

In [53]:
#Create a QueryEngine with gpt-3.5-turbo service_context to generate response for the query.
#vector_index = VectorStoreIndex(nodes, service_context = service_context_gpt35)
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
#query_engine = VectorStoreIndex.as_query_engine()

In [54]:
#Create a FaithfulnessEvaluator
from llama_index.core.evaluation import FaithfulnessEvaluator
#faithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)
faithfulness_gpt4  = FaithfulnessEvaluator(llm=gpt4)


In [55]:
eval_query = queries[1]

eval_query

"Can you explain some potential reasons why a 'Not Found' error might occur?"

In [56]:
#Generate response first and use faithfull evaluator.
response_vector = query_engine.query(eval_query)

In [57]:
# Compute faithfulness evaluation

eval_result = faithfulness_gpt4.evaluate_response(response=response_vector)

In [None]:
# You can check passing parameter in eval_result if it passed the evaluation.
eval_result.passing

True

# **Relevancy Evaluator**

In [59]:
from llama_index.core.evaluation import RelevancyEvaluator

#relevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4)
relevancy_gpt4 = RelevancyEvaluator(llm=gpt4)

In [60]:
# Pick a query
query = queries[1]

query

"Can you explain some potential reasons why a 'Not Found' error might occur?"

In [61]:
# Generate response.
# response_vector has response and source nodes (retrieved context)
response_vector = query_engine.query(query)

# Relevancy evaluation
eval_result = relevancy_gpt4.evaluate_response(
    query=query, response=response_vector
)

In [62]:
# You can check passing parameter in eval_result if it passed the evaluation.
eval_result.passing

True

In [63]:
# You can get the feedback for the evaluation.
eval_result.feedback

'YES'

# **Batch Evaluator**

In [64]:
from llama_index.core.evaluation import BatchEvalRunner

# Let's pick top 10 queries to do evaluation
batch_eval_queries = queries[:10]

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness_gpt4, "relevancy": relevancy_gpt4},
    workers=8,
)

# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)

In [65]:
# Let's get faithfulness score

faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])

faithfulness_score

0.5

In [66]:
# Let's get relevancy score

relevancy_score = sum(result.passing for result in eval_results['relevancy']) / len(eval_results['relevancy'])

relevancy_score


1.0