In [1]:
# This notebook uses the Google Gemini API (free tier, local API key) 
# and the RAGAS evaluation library to evaluate several metrics for a RAG pipeline
# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# RAGAS: https://docs.ragas.io/en/stable/
# Note: This notebook is with RAGAS 2-12 (last updated: 01/12/25) and Python 3.11.8

# Note for earlier RAGAS versions; did not need this for RAGAS 2-12: 
# I had to edit underlying RAGAS library (cloned locally, edited files, then pip -e installed locally) for this issue re: temperature with Gemini:
# https://github.com/explodinggradients/ragas/pull/657/files
# https://github.com/explodinggradients/ragas/issues/678
# Edits simply remove the temperature variable, see notes.txt for more specific info

In [2]:
# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# Answer_relevancy - Measures how relevant the answer is to the question.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.

# Faithfulness with HHEM - Similar to Faithfulness but uses a HuggingFace model (Vectara's HHEM 2.1 classifier) to detect hallucinations
# https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html#faithfullness-with-hhem-2-1-model
# https://huggingface.co/vectara/hallucination_evaluation_model

# RAGAS has other metrics as well : https://docs.ragas.io/en/latest/concepts/metrics/index.html

In [1]:
# set do not track variable to RAGAS
# more info: https://github.com/explodinggradients/ragas/issues/49
import os
os.environ["RAGAS_DO_NOT_TRACK"] = "True"

In [2]:
import logging
import sys
import google.generativeai as genai
import textwrap
import ast
import time
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import pandas as pd
import faiss
import ragas
from ragas.testset import TestsetGenerator
#from ragas.testset.evolution import simple, reasoning, multi_context
from ragas.run_config import RunConfig
#from ragas.metrics import (
#    answer_relevancy,
#    faithfulness,
#    context_recall,
#    context_precision,
#)
from ragas.metrics import (
    Faithfulness,
    LLMContextPrecisionWithoutReference, # most similar to previous RAGAS version of Context Precision metric
    LLMContextPrecisionWithReference,
    NonLLMContextPrecisionWithReference
)
from ragas import SingleTurnSample, EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from datasets import Dataset

from IPython.display import display
from IPython.display import Markdown

In [3]:
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [4]:
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [5]:
ragas._analytics.do_not_track()

True

In [13]:
# Establish RAG pipeline with Gemini

In [7]:
# # Create a Faiss vector store for RAG
# # If you already have an index created, skip a few coding cells to the LLM / embeddings setup

# # Example of creating a small vector store
# # Using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online, edited to include a title with the date of the speech
# # Example from 2024:
# # https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/

# # load and parse files
# sotu = []
# newfiles = ["./Speeches/titleedits/state_of_the_union_042921.txt", "./Speeches/titleedits/state_of_the_union_030122.txt", "./Speeches/titleedits/state_of_the_union_020723.txt", "./Speeches/titleedits/state_of_the_union_030724.txt"]
# for i in newfiles:
#     with open(i) as file:
#         for line in file:
#             nl = line.rstrip()
#             if nl != '':
#                 sotu.append(nl)

# # Convert into Document format for Faiss
# documents = [Document(text=line) for line in sotu]

In [9]:
# # Example of a loaded Document line
# documents[-1]

Document(id_='235d1f3b-a216-412c-8459-51d27c73c8d0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [8]:
# # Set up the Faiss index
# d = 768 # dimensions of the input vector of the embedding model that we're going to use; in this case, the google embedding model
# faiss_index = faiss.IndexFlatL2(d)
# print(faiss_index.is_trained) # double check that the training worked

True


In [6]:
# Set up the llm, embeddings, and Settings for Faiss 
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") # Replace with your LLM
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # Replace with your embeddings model
Settings.embed_model = doc_embeddings # used for LlamaIndex FaissVectorStore
Settings.llm = llm # used for LlamaIndex FaissVectorStore

In [7]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [15]:
# # Uncomment for when you need to re-embed and vectorize documents

# vector_store = FaissVectorStore(faiss_index=faiss_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context, show_progress=True
# )

# # Save index to disk
# index.storage_context.persist()

# # Save/remember index id for loading next time
# index.index_id

In [8]:
# After you have a saved index, load that index for RAG answer generation:

# load index from disk
vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
# My local index id '3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5' uses the 4 speeches including a title that includes the date it was given
# Update: As of 2/4, the speeches index has somehow been deleted
# My local index id '95634851-570e-454e-983f-6634eeb72aee' contains 3200 documents from the rag_mini_wikipedia dataset
index = load_index_from_storage(storage_context=storage_context, index_id='95634851-570e-454e-983f-6634eeb72aee')

INFO:root:Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
INFO:llama_index.core.indices.loading:Loading indices with ids: ['95634851-570e-454e-983f-6634eeb72aee']
Loading indices with ids: ['95634851-570e-454e-983f-6634eeb72aee']


In [11]:
# # Optional- if you'd like to query your index
# # Set up query and chat engines with the index
# query_engine = index.as_query_engine(similarity_top_k=10)
# chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7faf185372d0>

In [None]:
# # Example query and response with Gemini and query_engine
# query = "What has the President done related to healthcare?"
# response = query_engine.query(query) 
# print(response.response)

In [None]:
# # Get ranked scores for top k RAG source nodes
# for node in response.source_nodes:
#     print(f"{node.get_score()} -> {node.text}")

In [22]:
# Example of how to embed a sentence with Gemini
#result = genai.embed_content(
#    model="models/text-embedding-004",
#    content="What is the meaning of life?",
#    task_type="retrieval_document",
#    title="Embedding of single string")

In [13]:
# # Example of using the chat engine with our index
# query = "You are an expert speech analyst and specialize in analyzing Presidential State of the Union speeches. Could you please analyze the speeches and generate 2 questions and answers from each speech, providing the document filename of each speech that relates to each question?"
# response = chat_engine.chat(query) 
# print(response.response)

In [14]:
# # View chat history
# chat_engine.chat_history

In [17]:
# Code for RAGAS evaluation library to work with Gemini and our local RAG setup

In [13]:
# testing code to have Gemini work with RAGAS
from langchain_core.outputs import LLMResult

def custom_is_finished_parser(response: LLMResult):
    is_finished_list = []
    for g in response.flatten():
        resp = g.generations[0][0]
        if resp.generation_info is not None:
            # generation_info is provided - so we parse that

            # Gemini uses "STOP" to indicate that the generation is finished
            # and is stored in 'finish_reason' key in generation_info
            if resp.generation_info.get("finish_reason") is not None:
                is_finished_list.append(
                    resp.generation_info.get("finish_reason") == "STOP"
                )

        # if generation_info is empty, we parse the response_metadata
        # this is less reliable
        elif (
            isinstance(resp, ChatGeneration)
            and t.cast(ChatGeneration, resp).message is not None
        ):
            resp_message: BaseMessage = t.cast(ChatGeneration, resp).message
            if resp_message.response_metadata.get("finish_reason") is not None:
                is_finished_list.append(
                    resp_message.response_metadata.get("finish_reason") == "STOP"
                )
        # default to True
        else:
            is_finished_list.append(True)
    return all(is_finished_list)

In [18]:
# Example of generating synthetic dataset with RAGAS

# In a synthetic dataset, columns generated are 'question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', and 'episode_done'
# Ground truth is supposed to be the 'human' level answer vs the RAG answer

# Notes: 
# - We have to generate the answer separately with our RAG, which then generates new context used.
# - I use the context that was used to generate the answer for the metrics calculation, while still saving the old contexts column.
# - The best thing to do would be to generate the answer when creating the synthetic test dataset, but this is not available in RAGAS.
# - From a Github issue: Since you use the same LLM to generate your synthetic dataset ground_truth and your answer, 
# - it is possible the results of the RAG evaluation might be biased. This has not been studied.

In [41]:
# Load documents for use in generating synthetic dataset with RAGAS

#corpus = pd.read_csv("datasets/rag_mini_wikipedia_corpus.csv", index_col=['id'])
#documents = [Document(text=passage[0], doc_id=str(i)) for i, passage in corpus.iterrows()] 
loader = DirectoryLoader("./Speeches/titleedits") # Loads all documents in the directory; there are parameters for ignoring or matching certain files
documents = loader.load()



In [None]:
documents[1]

In [43]:
# Need to add 'filename' metadata for RAGAS to process documents
for document in documents:
    #document.metadata['filename'] = document.id_
    document.metadata['filename'] = document.metadata['source']

In [44]:
# Synthetic testset generator with Gemini models
generator_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", timeout=240) # Other notable parameters: temperature=0.7, transport="rest"
ragas_llm = LangchainLLMWrapper(
    generator_llm,
    is_finished_parser=custom_is_finished_parser,
)
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", request_options={"timeout": 240}) 

generator = TestsetGenerator(llm=ragas_llm, embedding_model=embeddings)

In [45]:
# Increase the timeout settings with RAGAS's RunConfig class

# Note: For Gemini, the RAGAS internal RunConfig settings do a decent job at limiting the 429 resource exhausted warnings
# (max_workers=1 still can send more requests to Gemini than the 15 requests per minute it allows)
# Still very difficult to have the testset generation run successfully with Gemini free tier
# I also tried the ratelimit and backoff libraries in Python, but I still got so many 429 warnings that the generation failed
# Sometimes even the 1 max worker will not finish, but it will finish occasionally

run_config = RunConfig(timeout=300, max_retries=20, max_wait=300, max_workers=1)

In [46]:
# # Optional- edit the distribution of query types desired for the testset generation
# from ragas.testset.synthesizers import default_query_distribution
# query_distribution = default_query_distribution(ragas_llm)

In [None]:
# The RAGAS internal RunConfig settings do a decent job at limiting the 429 resource exhausted warnings when max_workers=1
# Still receive many, many 429 errors
gendataset = generator.generate_with_langchain_docs(documents, testset_size=10, run_config=run_config, ) # stopped here

In [49]:
# # Save generated testset to csv
gendataset_pd = gendataset.to_pandas()
gendataset_pd.to_csv("results/test_ragas_generation_0_2_12_sotu.csv", index=False)
gendataset_pd

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What did President Biden say about President Z...,[State of the Union Address given by President...,"President Biden said, ""From President Zelensky...",single_hop_specifc_query_synthesizer
1,"How is America standing with Ukraine, and what...",[60 Million barrels of oil from reserves aroun...,"America will lead an effort, releasing 30 Mill...",single_hop_specifc_query_synthesizer
2,How is the term 'Amerrican' used in the contex...,[buy American to make sure everything from the...,The text discusses buying 'American' to ensure...,single_hop_specifc_query_synthesizer
3,What specific actions are being taken to suppo...,[shouldn’t have to pay more than 7% of their i...,The plan includes increasing Pell Grants and i...,single_hop_specifc_query_synthesizer
4,Given President Biden's State of the Union Add...,[<1-hop>\n\nState of the Union Address given b...,"In his State of the Union Address on April 29,...",multi_hop_abstract_query_synthesizer
5,How iz the US addressing gas prices in light o...,[<1-hop>\n\nState of the Union Address given b...,"In response to Russia's invasion of Ukraine, t...",multi_hop_abstract_query_synthesizer
6,Given President Biden's State of the Union Add...,[<1-hop>\n\nState of the Union Address given b...,"In his State of the Union Address on April 29,...",multi_hop_abstract_query_synthesizer
7,Given President Biden's focus on climate chang...,[<1-hop>\n\nState of the Union Address given b...,"During his State of the Union Address, Preside...",multi_hop_abstract_query_synthesizer
8,Considering President Biden's State of the Uni...,[<1-hop>\n\nState of the Union Address given b...,"In his State of the Union address on March 1, ...",multi_hop_specific_query_synthesizer
9,Considering the historical significance of hav...,[<1-hop>\n\nState of the Union Address given b...,"In his 2021 State of the Union address, Presid...",multi_hop_specific_query_synthesizer


In [23]:
# # Now generate answers for the testset, as answers are not automatically generated at creation
# # Below code uses a previously generated testset

# testset_pd = pd.read_csv("datasets/testset_flash_pro15.csv", index_col = None)

# Note: When saving, the 'contexts' column is saved as a string but needs to be a list
# If you are importing testset_pd from a csv file, use the below code to change the column to a list

# testset_pd['contexts'] = testset_pd['contexts'].apply(ast.literal_eval)

In [None]:
# Generate answers using our query engine & Faiss vector database
# Alternatively can use the chat_engine if memory between queries is needed (i.e., queries reference previous queries)
query_engine = index.as_query_engine(similarity_top_k=10)
answers = [query_engine.query(q) for q in testset_pd['question']]

In [25]:
# Parse out new 'answer' and 'contexts' columns
answers_new = []
context_new = []
for i in answers:
    answers_new.append(i.response)
    context_new.append([c.node.get_content() for c in i.source_nodes])

testset_pd = testset_pd.rename(columns={"contexts":"contexts_gt"}) # Keeping old contexts that were used for testset/query generation (gt = ground truth)
testset_pd['contexts'] = context_new
testset_pd['answer'] = answers_new

# Save complete synthetically created dataset/testset
# testset_pd.to_csv('datasets/ragas_full_testset_flash_pro15.csv', index=False)

In [34]:
# Evaluate a dataset with RAGAS

In [14]:
# Read in dataset for evaluation
testset_pd = pd.read_csv("datasets/rag_mini_wikipedia_complete_chat_ares.csv", index_col = None) 

# RAGAS expects the following columns for evaluation (rename in dataset as needed) : "user_input", "retrieved_contexts", "response", "reference"
testset_pd = testset_pd.rename(columns={"question": "user_input", "answer": "response", "ground_truth": "reference", "contexts": "retrieved_contexts"})

# Note: When saving a synthetic dataset, the 'contexts' column is saved as a string but needs to be a list for evaluation
# If you are importing testset_pd from a csv file that is a synthetic dataset, use the below code to change the column to a list
# This may apply to other datasets as well
testset_pd['retrieved_contexts'] = testset_pd['retrieved_contexts'].apply(ast.literal_eval)

In [15]:
testset_pd

Unnamed: 0,id,user_input,reference,retrieved_contexts,response
0,0,Was Abraham Lincoln the sixteenth President of...,yes,"[Abraham Lincoln (February 12, 1809 â April ...","Yes, Abraham Lincoln was the sixteenth Preside..."
1,2,Did Lincoln sign the National Banking Act of 1...,yes,[Lincoln believed in the Whig theory of the pr...,"Yes, the provided text states that Lincoln sig..."
2,4,Did his mother die of pneumonia?,no,"[Soon thereafter, Tesla hastened from Paris to...",The provided text doesn't contain information ...
3,6,How many long was Lincoln's formal education?,18 months,[Lincoln's formal education consisted of about...,The text states that Lincoln's formal educatio...
4,8,When did Lincoln begin his political career?,1832,"[Lincoln began his political career in 1832, a...","Lincoln began his political career in 1832, at..."
...,...,...,...,...,...
898,1686,Was Wilson a member of the Phi Kappa Psi frate...,yes,[* Wilson was a member of the Phi Kappa Psi fr...,"Yes, Woodrow Wilson was a member of the Phi Ka..."
899,1688,Was Wilson an automobile enthusiast?,yes,"[Wilson's Pierce Arrow, which resides in his h...",While not an *avid* enthusiast in the sense of...
900,1690,Did Wilson's father own slaves?,yes,"[Thomas Woodrow Wilson was born in Staunton, V...","Yes, Woodrow Wilson's father, Joseph Ruggles W..."
901,1692,Where is Wilson buried?,He was buried in Washington National Cathedral,[* Wilson is the only U.S. President buried in...,Woodrow Wilson is buried in Washington Nationa...


In [16]:
# only dropping this column for an equal comparison run to the previous RAGAS version, whose context precision metric did not use the "reference" column
# testset_pd_noref = testset_pd.drop("reference", axis=1)
# testset_pd_noref

In [17]:
# The EvaluationDataset format takes input in the HuggingFace Datasets format
# Note: I am also dropping the id column here
testset_ds = Dataset.from_pandas(testset_pd.drop("id", axis=1))
testset_ds

Dataset({
    features: ['user_input', 'reference', 'retrieved_contexts', 'response'],
    num_rows: 903
})

In [18]:
eval_dataset = EvaluationDataset.from_hf_dataset(testset_ds)

In [19]:
eval_dataset

EvaluationDataset(features=['user_input', 'retrieved_contexts', 'response', 'reference'], len=903)

In [20]:
# Note: I'm using the normal LLM, not the RAG context-loaded query engine
# There is code at the bottom of the notebook for using the query engine, but it appears to just use the query engine to develop
# new answers and contexts and then to use the non-RAG LLM for the metrics evaluation
# That code also appears to be broken from RAGAS right now, so I was forced to use the regular Gemini LLM anyway

# Note: The RAGAS evaluate function (below) may re-run the query and give new answers and contexts
# See this issue: https://github.com/explodinggradients/ragas/issues/1211
# In testing, the results still output the same answers and contexts as I started with, so I'm not concerned by this

generator_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=300) # Other notable parameters: temperature=0.7, transport="rest"
ragas_llm = LangchainLLMWrapper(
    generator_llm,
    is_finished_parser=custom_is_finished_parser,
)
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", request_options={"timeout": 300}) 

#embeddings_wrapper = LangchainEmbeddingsWrapper(embeddings)

In [21]:
# Increase the timeout settings
run_config = RunConfig(timeout=3000, max_wait=3000, max_workers=1, max_retries=10)

In [26]:
# Two coding options for running evaluate:
# 1) Bulk run with the evaluate function, as intended. 
# Unfortunately rate-limiting does not work well with this, 
# and 2/3 of my 800 example dataset received NaN results because of rate limiting issues.
# Ex: Evaluating 1 example for 1 metric resulted in 10 API calls.

# 2) Run the evaluation in small batches
# This allowed me to finish the evaluation of the entire dataset without rate limiting errors.

#cp new below
#from ragas import evaluate

#results = evaluate(eval_dataset, metrics=[metric])

# Bulk evaluation of the dataset
evalresult = evaluate(
    dataset = eval_dataset,
    metrics = [
        context_precision
        faithfulness,
        answer_relevancy,
        context_recall
    ],
    llm = ragas_llm,
    embeddings=embeddings,
    run_config=run_config
)

# Optional parameter: in_ci: bool, Whether the evaluation is running in CI or not. 
# If set to True then some metrics will be run to increase the reproducability of the evaluations. 
# This will increase the runtime and cost of evaluations. Default is False.
# In practice, setting in_ci = True resulted in a lot of timeouts / no score calculated / NaN

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2456070787.py, line 19)

In [22]:
# Setup for smaller batches
testset_results = pd.DataFrame()

In [26]:
# Iterate through dataset for smaller batches to be evaluated
context_precision_ref = LLMContextPrecisionWithReference(llm=ragas_llm) # most similar to DeepEval's Context Precision
context_precision_noref = LLMContextPrecisionWithoutReference(llm=ragas_llm)
context_precision_ref_nollm = NonLLMContextPrecisionWithReference()
faithfulness_score = Faithfulness(llm=ragas_llm)
batchsize = 1
for i in range(389,392,batchsize):   # stopped at 389, 388 was last fully executed
    tempdataset = EvaluationDataset(samples=eval_dataset.samples[i:i+batchsize])
    print(i)
    evalresult = evaluate(
        metrics = [
            context_precision_ref
        ],
        dataset = tempdataset,
        llm = ragas_llm,
        embeddings=embeddings,
        run_config=run_config
    )
    testset_results = pd.concat([testset_results, evalresult.to_pandas()])
    time.sleep(65) # RAGAS generates ~2 API calls per example for faithfulness.
# RAGAS generates ~10 API calls per example for context precision. Gemini free-tier limit is 15RPM, so we need to wait a minute between calls for the limit to reset.

342


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

343


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

344


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

345


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

346


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

347


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

348


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

349


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

350


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

351


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

352


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

353


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

354


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

355


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

356


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

357


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

358


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

359


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

360


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

361


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

362


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

363


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

364


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

365


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

366


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

367


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

368


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

369


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

370


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

371


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

372


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

373


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

374


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

375


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

376


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

377


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

378


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

379


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

380


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

381


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

382


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

383


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

384


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

385


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

386


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

387


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

388


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


389


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with

390


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with

391


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with

In [31]:
testset_results

Unnamed: 0,user_input,retrieved_contexts,response,reference,llm_context_precision_with_reference
0,Does a large proportion of the population spea...,"[Finland has a population of 5,300,362 people,...","No, while Swedish is an official language, onl...",No,0.833333
0,Do women live longer than men?,[The life expectancy is 82 years for women and...,"Yes, the provided text indicates that women in...",Yes,1.000000
0,What is a country with which Finland is involv...,"[Finland, officially the Republic of Finland ""...",The text states that Finland is not involved i...,Finland is not involved in international confl...,0.908532
0,Was Ford a member of the House of Representati...,[Ford was a member of the House of Representat...,"Yes, Gerald Ford was a member of the House of ...",Yes,0.966667
0,For how long was Ford a member of the House of...,[Ford was a member of the House of Representat...,Gerald Ford was a member of the House of Repre...,Over eight years.,0.500000
...,...,...,...,...,...
0,In which years were John Monroe elected as Pre...,"[Following the War of 1812, Monroe was elected...","James Monroe, not John Monroe, was elected Pre...",1817-1825,0.500000
0,Who was John Monroe standing behind in the pai...,"[The Presidentâs parents, father Spence Monr...","There's no widely known painting of ""Washingto...",George Washington,0.000000
0,When was James Monroe appointed to Secretary o...,"[James Monroe (April 28, 1758 â July 4, 1831...",James Monroe was appointed Secretary of War in...,1814,
0,When did James Monroe die?,"[Upon Elizabeth's death in 1830, Monroe moved ...","James Monroe died on July 4, 1831.\n","July 4, 1831",


In [30]:
testset_results[:147]['llm_context_precision_with_reference'].isna().sum()

0

In [None]:
testset_results

In [48]:
# to do later, 1
testset_results.iloc[21]['llm_context_precision_with_reference'] = testset_results.iloc[80]['llm_context_precision_with_reference']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testset_results.iloc[21]['llm_context_precision_with_reference'] = testset_results.iloc[80]['llm_context_precision_with_reference']


In [49]:
testset_results.iloc[21]

user_input                                                       What is named after him?
retrieved_contexts                      [The Celsius crater on the Moon is named after...
response                                The Celsius temperature scale is named after A...
reference                                                  The Celsius crater on the Moon
llm_context_precision_with_reference                                                  NaN
Name: 0, dtype: object

In [30]:
testset_results['llm_context_precision_with_reference'].isnull().sum()

2

In [23]:
testset_pd.iloc[20]

id                                                                   41
user_input            Was Lincoln chosen as a presidential candidate...
reference                                                           Yes
retrieved_contexts    [On November 6, 1860, Lincoln was elected as t...
response              Yes, Abraham Lincoln was chosen as the Republi...
Name: 20, dtype: object

In [32]:
# Recommend checking contexts column to be sure context nodes are separated from each other after saving
testset_results[:147].to_csv("results/results_ragas_2_12_gemini_1_5_rag_mini_wiki_complete_chat_cpwithref_242_388.csv", index=False)

In [51]:
# testing new method
from ragas import SingleTurnSample
from ragas.metrics import LLMContextPrecisionWithoutReference
from ragas.metrics import Faithfulness

context_precision_ref = LLMContextPrecisionWithReference(llm=ragas_llm)

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    reference="Paris",
    retrieved_contexts=["The Eiffel Tower is located in Paris."], 
    response="The Eiffel Tower is in Paris."
)

await context_precision_ref.single_turn_ascore(sample, timeout=70)

0.9999999999

In [17]:
from ragas.metrics import Faithfulness
faithfulness_score = Faithfulness(llm=ragas_llm)

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    reference="Paris",
    retrieved_contexts=["The Eiffel Tower is located in Paris."], 
    response="The Eiffel Tower is in Paris."
)

await faithfulness_score.single_turn_ascore(sample, timeout=70)
# stopped here

1.0

In [None]:
from ragas.metrics import FaithfulnesswithHHEM
faithfulnesshhem_score = FaithfulnesswithHHEM(llm=ragas_llm)

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    reference="Paris",
    retrieved_contexts=["The Eiffel Tower is located in Paris."], 
    response="The Eiffel Tower is in Paris."
)
await faithfulnesshhem_score.single_turn_ascore(sample)

In [43]:
evalresult

{'context_precision': 0.6944, 'faithfulness': 1.0000, 'answer_relevancy': 0.6770, 'context_recall': 0.6455}

In [61]:
# Example result:
# {'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}
# Reran
# {'context_precision': 0.5979, 'faithfulness': 1.0000, 'answer_relevancy': 0.6533, 'context_recall': 0.8000}
evalresult

{'context_precision': 0.5979, 'faithfulness': 1.0000, 'answer_relevancy': 0.6533, 'context_recall': 0.8000}

In [None]:
# Note: received warning for the answer where there was no response from the llm, definitely reduced faithfulness score

# Results:
# Using contexts generated when produced answers from LLM:
# new testset_answer_newcontext_flash_pro15.csv result, with contexts_gt (aka contexts generated with ground truth generation) column removed
# {'context_precision': 0.4171, 'faithfulness': 0.9167, 'answer_relevancy': 0.6509, 'context_recall': 0.8000}
# reran
# {'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}
# reran
# {'context_precision': 0.5979, 'faithfulness': 1.0000, 'answer_relevancy': 0.6533, 'context_recall': 0.8000}

# Compared to using contexts generated for ground truth (probably not correct):
# new testset_answer_newcontext_flash_pro15.csv result, using old contexts
# {'context_precision': 0.7500, 'faithfulness': 0.7392, 'answer_relevancy': 0.6041, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.8500, 'faithfulness': 0.7123, 'answer_relevancy': 0.5934, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.7500, 'faithfulness': 0.6556, 'answer_relevancy': 0.5638, 'context_recall': 1.0000}

In [None]:
# Evaluation results on metrics:

# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics
# I don't have example ranges to compare anything to, so below is my best guess.

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# 0.9167 - 1.0000 indicates that the LLM is staying true to the facts provided in the context for answering the question.
# There is another Faithfulness metric: from ragas.metrics import FaithulnesswithHHEM
# This uses a huggingface model to help detect hallucination : https://huggingface.co/vectara/hallucination_evaluation_model
# See below for code : {'faithfulness_with_hhem': 0.6319} 
# This doesn't really agree with the RAGAS faithfulness score... may need to dive in further another time.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# At 0.4171 - 0.5979, suggests that the context isn't particularly relevant to the question.
# Answer_relevancy - Measures how relevant the answer is to the question.
# 0.6509 - 0.6533 seems moderately low, just going off of the number.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.
# 0.8 indicates that the llm context is decently good and can typically answer the question or most of it. 

In [105]:
# Test run, just compare to using contexts_gt column instead of the newer context generated with the answer
testset_ds_oldcontext = Dataset.from_pandas(testset_pd.drop("contexts", axis=1).rename(columns={'contexts_old':'contexts'}))

In [None]:
evalresult_old2 = evaluate(
    testset_ds_oldcontext,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm = ragas_llm,
    embeddings=embeddings, 
    run_config=run_config
)

In [None]:
# new testset_answer_newcontext_flash_pro15.csv result, using old contexts
# {'context_precision': 0.7500, 'faithfulness': 0.7392, 'answer_relevancy': 0.6041, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.8500, 'faithfulness': 0.7123, 'answer_relevancy': 0.5934, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.7500, 'faithfulness': 0.6556, 'answer_relevancy': 0.5638, 'context_recall': 1.0000}
evalresult_old2

In [84]:
# Extra non-working code:

In [85]:
# Code to use the query_engine in the evaluation 
# Modeled after this tutorial: https://docs.ragas.io/en/latest/howtos/applications/compare_llms.html

# Does not currently work: for some metrics, it is not finding the 'ground_truth' column in the dataset
# For other metrics, appears to run but returns the below errors and returns 'nan' for results

In [64]:
# start of testing to try and get rag query engine for evaluate
def generate_responses(query_engine, test_questions, test_answers):
  responses = [query_engine.query(q) for q in test_questions]

  answers = []
  contexts = []
  for r in responses:
    answers.append(r.response)
    contexts.append([c.node.get_content() for c in r.source_nodes])
  dataset_dict = {
        "question": test_questions,
        "answer": answers,
        "contexts": contexts,
  }
  if test_answers is not None:
    dataset_dict["ground_truth"] = test_answers
  ds = Dataset.from_dict(dataset_dict)
  return ds

test_questions = testset_pd['question'].values.tolist()
test_answers = [[item] for item in testset_pd['answer'].values.tolist()]

result_ds = generate_responses(query_engine, test_questions, test_answers)

In [None]:
# Note: This evaluate function that uses the query_engine does not return results (nan for all metrics)
# Errors (below are repeated many times):
# WARNING:ragas.llms.base:n values greater than 1 not support for LlamaIndex LLMs
# n values greater than 1 not support for LlamaIndex LLMs
# INFO:ragas.llms.base:callbacks not supported for LlamaIndex LLMs, ignoring callbacks
# callbacks not supported for LlamaIndex LLMs, ignoring callbacks
# ERROR:ragas.executor:Exception raised in Job[5]: TimeoutError()
# Exception raised in Job[5]: TimeoutError()
# ERROR:ragas.executor:Exception raised in Job[19]: AttributeError('ChatGoogleGenerativeAI' object has no attribute 'acomplete')
# Exception raised in Job[19]: AttributeError('ChatGoogleGenerativeAI' object has no attribute 'acomplete')

from ragas.integrations.llama_index import evaluate

eval_qe2 = evaluate(
    query_engine=query_engine,
    dataset=result_ds,
    metrics=[faithfulness,
    answer_relevancy,
    context_utilization],
    llm=ragas_llm,
    embeddings=embeddings, 
    run_config=run_config
)

In [63]:
eval_qe2

{'faithfulness': nan, 'answer_relevancy': nan, 'context_utilization': nan}