In [1]:
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index.vector_stores import DeepLakeVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index import VectorStoreIndex
from llama_index.evaluation import FaithfulnessEvaluator

In [2]:
# Completion model with temperature 0.0; passed to the service context below.
llm = OpenAI(
    temperature=0.0,
    model="gpt-3.5-turbo-instruct",
)

In [3]:
# Build the service context around the LLM defined above; it is later handed
# to the faithfulness evaluator.
service_context = ServiceContext.from_defaults(llm=llm)

In [4]:
# Open the Deep Lake dataset at this hub path; overwrite=False keeps whatever
# is already stored there.
vector_store = DeepLakeVectorStore(
    dataset_path="hub://srishtysuman2919/optimization_srishty",
    overwrite=False,
)

Deep Lake Dataset in hub://srishtysuman2919/optimization_srishty already exists, loading from the storage


In [5]:
# Storage context that points at the Deep Lake vector store.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [6]:
# Build the index on top of the existing vector store instead of re-ingesting documents.
index = VectorStoreIndex.from_vector_store(
    vector_store,
    storage_context=storage_context,
)

In [7]:
# Faithfulness evaluator configured with the service context built earlier.
evaluator = FaithfulnessEvaluator(
    service_context=service_context,
)

In [8]:
# Create a query engine over the Deep Lake-backed index (default arguments).
query_engine = index.as_query_engine()

In [10]:
# Ask a single question; the response object is evaluated in the next cell.
response = query_engine.query("What does Paul Graham do?")

In [11]:
# Run the faithfulness evaluator on the response; the result's `passing`
# attribute is printed below.
eval_result = evaluator.evaluate_response(response=response)

In [12]:
# Show the generated answer next to the evaluator's pass/fail verdict.
print("> response:", response)

print("> evaluator result:", eval_result.passing)

> response: Paul Graham is involved in startup funding and mentoring. He created a program called the Summer Founders Program, where he invited undergraduates to apply for funding and mentorship for their startup ideas. He selected a group of founders to fund and provided them with financial support and resources. Additionally, he organized talks by experts on startups and used his essays as a way to attract potential founders for his program.
> evaluator result: True


    MRR (Mean Reciprocal Rank): Measures the retrieval system's ability to return the best result as high up in the ranking as possible.

    Hit Rate: evaluates the presence of relevant items within the top results returned.

    MAP (Mean Average Precision): A measure of ranking quality across multiple queries. MAP is the mean of the per-query average precision, where the average precision is the mean of the precision scores computed each time a relevant document is retrieved.

    NDCG (Normalized Discounted Cumulative Gain): Evaluates the ranking of documents based on their relevance, giving more importance to relevant documents that appear higher in the ranking. It is normalized so that the perfect ranking's score is 1, allowing for comparison across different sets of queries.

Golden Context Dataset

    Consists of carefully selected queries paired with an ideally matched set of sources that contain the answers.

    To create a Golden Dataset, gather a set of realistic customer questions and pair them with expert answers, then use this dataset to compare against responses from a language model for quality assurance, ensuring the LLM's answers align closely with the expert ones for accuracy and relevance.

    Once the golden dataset is ready, the next step is to use it to measure the quality of LLM responses.  After each evaluation, metrics like the following will be available to quantify the user experience.

Community-Based Evaluation Tools

    Ragas: A key tool that provides an evaluation framework, integrates with LlamaIndex, and offers detailed metrics.
    
    DeepEval: A tool designed for in-depth evaluation, facilitating comprehensive assessments of various aspects of the system.

In [13]:
! pip install html2text ragas

Collecting ragas
  Downloading ragas-0.1.0-py3-none-any.whl.metadata (4.7 kB)
Collecting langchain-openai (from ragas)
  Downloading langchain_openai-0.0.5-py3-none-any.whl.metadata (2.5 kB)
Collecting pysbd>=0.3.4 (from ragas)
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m812.4 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Downloading ragas-0.1.0-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m690.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading langchain_openai-0.0.5-py3-none-any.whl (29 kB)
Installing collected packages: appdirs, pysbd, langchain-openai, ragas
Successfully installed appdirs-1.4.4 langchain-openai-0.0.5 pysbd-0.3.4 ragas-0.1.0


In [14]:
from llama_index.readers.web import SimpleWebPageReader
from llama_index import VectorStoreIndex, ServiceContext

# Download the Wikipedia article and convert its HTML to text.
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://en.wikipedia.org/wiki/New_York_City"]
)

# Index the article with chunk_size=512 and expose it through a query engine.
vector_index = VectorStoreIndex.from_documents(
    documents,
    service_context=ServiceContext.from_defaults(chunk_size=512),
)
query_engine = vector_index.as_query_engine()

# Try the pipeline on one question before running the evaluation.
response_vector = query_engine.query("How did New York City get its name?")
print(response_vector)

New York City got its name when King Charles II of England granted the lands to his brother, the Duke of York. The city was renamed New York in honor of the Duke of York.


In [15]:
# Questions put to the New York City query engine during evaluation.
eval_questions = [
    "What is the population of New York City as of 2020?",
    "Which borough of New York City has the highest population?",
    "What is the economic significance of New York City?",
    "How did New York City get its name?",
    "What is the significance of the Statue of Liberty in New York City?",
]

# Reference answers, in the same order as the questions. Every answer is
# wrapped in its own single-element list.
eval_answers = [
    [answer]
    for answer in (
        "8,804,000",  # incorrect answer
        "Queens",  # incorrect answer
        "New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.",
        "New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.",
        "The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.",
    )
]

In [16]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics.critique import harmfulness

# ragas metrics passed to the evaluation call in the next cell.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall, harmfulness]

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from ragas.evaluation import evaluate

# Score the query engine with the ragas metrics, using the evaluation questions
# and reference answers defined above.
# NOTE(review): this cell has no recorded execution. Confirm that
# `ragas.evaluation.evaluate` in the installed ragas version accepts a
# LlamaIndex query engine followed by positional (metrics, questions, answers).
result = evaluate(query_engine, metrics, eval_questions, eval_answers)

# print the final scores
print(result)

    The Custom RAG Pipeline Evaluation

In [19]:
# Download the sample text file that the following cells load from the
# working directory as `venus_transmission.txt`.
! wget 'https://raw.githubusercontent.com/idontcalculate/data-repo/main/venus_transmission.txt'

--2024-02-10 10:50:49--  https://raw.githubusercontent.com/idontcalculate/data-repo/main/venus_transmission.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19241 (19K) [text/plain]
Saving to: ‘venus_transmission.txt’


2024-02-10 10:50:50 (29.6 MB/s) - ‘venus_transmission.txt’ saved [19241/19241]



In [22]:
from llama_index import SimpleDirectoryReader

# Read the downloaded text file from the working directory.
reader = SimpleDirectoryReader(
    input_files=["venus_transmission.txt"],
)
docs = reader.load_data()

# Report how many documents were loaded.
print(f"Loaded {len(docs)} docs")

Loaded 1 docs


In [23]:
from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex

# Build index with a chunk_size of 512:
# 1) create a node parser with chunk_size=512,
# 2) split the loaded documents into nodes,
# 3) index the nodes. `nodes` is reused later for question generation.
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(docs)
vector_index = VectorStoreIndex(nodes)

In [24]:
# Query engine over the freshly built index (this rebinds `query_engine`).
query_engine = vector_index.as_query_engine()

# Ask one question and print only the answer text.
response_vector = query_engine.query(
    "What was The first beings to inhabit the planet?"
)
print(response_vector.response)

The first beings to inhabit the planet were a dinoid and reptoid race from two different systems outside our solar system.


In [25]:
# First retrieved node: show the text of source_nodes[0] attached to the
# response above.
response_vector.source_nodes[0].get_text()

"They had heard of this beautiful new planet. At this time, Earth had two moons to harmonize the weather conditions and control the tides of the large bodies of water.\nThe first beings to inhabit the planet were a dinoid and reptoid race from two different systems outside our solar system. They were intelligent and walked on two legs like humans and were war-like considering themselves to be superior to all other life forms. In the past, the four races of humans had conflicts with them before they outgrew such behavior. They arrived on Earth to rob it of its minerals and valuable gems. Soon they had created a terrible war. They were joined by re-\n1\nenforcements from their home planets. One set up its base on one of the Earth's moons, the other on Earth. It was a terrible war with advanced nuclear and laser weapons like you see in your science fiction movies. It lasted very long. Most of the life forms lay in singed waste and the one moon was destroyed. No longer interested in Earth,

In [26]:
# Second retrieved node: show the text of source_nodes[1] attached to the
# response above.
response_vector.source_nodes[1].get_text()

"They had mastered the concepts of creativity with the energy of their thoughts. This comes with the responsibility to use it for positive and constructive reasons that are beneficial and harmonic or for selfish reasons, such as gaining power and wealth.\nThey had complete understanding of creation and the creator. They knew themselves to be Soul and eternal, not restricted to the physical body. Death to them did not exist. For them it was a transition from one existence to another. They chose their own life spans according to the experiences they wished to have or missions they wished to complete.\nThey came to this solar system to bring human life here and to be the protectors of all life on all planets. The yellow race colonized the planet you know as Mars, the red race the planet Saturn, the black race Jupiter and the white race Venus.\nEarth was then only a comet flying around as it had not taken the form of a planet or settled into an orbit around the Sun. When the Earth finally 

In [27]:
from llama_index.llms import OpenAI
from llama_index.evaluation import generate_question_context_pairs

# LLM used for question generation (this rebinds `llm`).
llm = OpenAI(model="gpt-3.5-turbo")

# Generate question/context pairs from the nodes, two questions per chunk.
qa_dataset = generate_question_context_pairs(
    nodes, llm=llm, num_questions_per_chunk=2
)

# Collect the generated question strings and preview the first ten.
queries = [*qa_dataset.queries.values()]
print(queries[:10])

100%|██████████| 13/13 [03:30<00:00, 16.20s/it]

['How did the beings described in the context communicate with different life forms and dimensions? How did their telepathic abilities and technology contribute to their understanding of creation and the creator?', 'Describe the role of different races in the colonization of planets within the solar system according to the information provided. How did Earth differ from other planets during that time period?', 'Explain the concept of creativity as understood by the advanced races mentioned in the context. How did they use their creative abilities and what were the responsibilities associated with it?', 'Describe the initial state of Earth before it became a planet and settled into an orbit around the Sun. How did the four races contribute to the development of life on Earth and what role did they play as protectors of all life on all planets?', 'How did the arrival of the dinoid and reptoid races on Earth lead to a devastating war? Discuss the reasons behind their conflict with the fou




In [None]:
from llama_index.evaluation import RetrieverEvaluator
import pandas as pd

retriever = vector_index.as_retriever(similarity_top_k=2)

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

# Evaluate
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

def display_results(name, eval_results):
    """Summarise retrieval evaluation results as a one-row DataFrame.

    Args:
        name: Label placed in the "Retriever Name" column.
        eval_results: Iterable of result objects, each exposing a
            ``metric_vals_dict`` mapping with "hit_rate" and "mrr" entries.

    Returns:
        A pandas DataFrame with the columns "Retriever Name", "Hit Rate" and
        "MRR", holding the mean of each metric over all results.
    """
    # One row per evaluated query, one column per metric.
    per_query = pd.DataFrame([res.metric_vals_dict for res in eval_results])

    summary = {
        "Retriever Name": [name],
        "Hit Rate": [per_query["hit_rate"].mean()],
        "MRR": [per_query["mrr"].mean()],
    }
    return pd.DataFrame(summary)


# Show the mean hit rate and MRR of the retriever as the cell's output.
display_results("OpenAI Embedding Retriever", eval_results)

In [None]:
# LLM used by the query engine to generate answers.
gpt35 = OpenAI(model="gpt-3.5-turbo", temperature=0)
service_context_gpt35 = ServiceContext.from_defaults(llm=gpt35)

# LLM handed to the evaluators in the next cell.
# NOTE(review): the variables are named `gpt4`, but the model configured here
# is "gpt-3.5-turbo-1106" — confirm whether "gpt-4" was intended.
gpt4 = OpenAI(model="gpt-3.5-turbo-1106", temperature=0)
service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)

# Rebuild the index over the same nodes with the gpt-3.5 service context.
vector_index = VectorStoreIndex(nodes, service_context=service_context_gpt35)
query_engine = vector_index.as_query_engine()

# Answer one of the generated questions (index 10) for the evaluation below.
eval_query = queries[10]
response_vector = query_engine.query(eval_query)

print("> eval_query: ", eval_query)
print("> response_vector:", response_vector)

In [None]:
from llama_index.evaluation import RelevancyEvaluator
from llama_index.evaluation import FaithfulnessEvaluator

# Both evaluators share the `service_context_gpt4` service context.
relevancy_gpt4 = RelevancyEvaluator(
    service_context=service_context_gpt4,
)
faithfulness_gpt4 = FaithfulnessEvaluator(
    service_context=service_context_gpt4,
)

# Faithfulness: evaluated from the response alone; `passing` holds the verdict.
eval_result = faithfulness_gpt4.evaluate_response(response=response_vector)
print(eval_result.passing)

# Relevancy: evaluated from the query together with the response;
# `eval_result` is rebound to this second result.
eval_result = relevancy_gpt4.evaluate_response(
    query=eval_query,
    response=response_vector,
)
print(eval_result.passing)

    BatchEvalRunner: Runs the evaluation process in batches and concurrently.

In [None]:
#Batch Evaluator:
#BatchEvalRunner to compute multiple evaluations in batch wise manner.

from llama_index.evaluation import BatchEvalRunner

# Let's pick top 10 queries to do evaluation
batch_eval_queries = queries[:10]

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness_gpt4, "relevancy": relevancy_gpt4},
    workers=8,
)

# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)

# get faithfulness score
faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
# get relevancy score
relevancy_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['relevancy'])

print( "> faithfulness_score", faithfulness_score )
print( "> relevancy_score", relevancy_score )