In [1]:
# This notebook uses the Google Gemini API (free tier, local API key) 
# and the RAGAS evaluation library to evaluate several metrics for a RAG pipeline
# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# RAGAS: https://docs.ragas.io/en/stable/
# Note: This notebook was developed with RAGAS 2.6 and Python 3.11.8

# Note: I had to edit the underlying RAGAS library (cloned it locally, edited the files, then installed it with `pip install -e`) to work around this temperature issue with Gemini:
# https://github.com/explodinggradients/ragas/pull/657/files
# https://github.com/explodinggradients/ragas/issues/678
# Edits simply remove the temperature variable, see notes.txt for more specific info

In [2]:
# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# Answer_relevancy - Measures how relevant the answer is to the question.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.

# Faithfulness with HHEM - Similar to Faithfulness but uses a HuggingFace model (Vectara's HHEM 2.1 classifier) to detect hallucinations
# https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html#faithfullness-with-hhem-2-1-model
# https://huggingface.co/vectara/hallucination_evaluation_model

# RAGAS has other metrics as well : https://docs.ragas.io/en/latest/concepts/metrics/index.html

In [1]:
# Opt out of RAGAS tracking by setting its do-not-track environment variable.
# This cell comes before the cell that imports ragas.
# more info: https://github.com/explodinggradients/ragas/issues/49
import os
os.environ["RAGAS_DO_NOT_TRACK"] = "True"

In [2]:
import logging
import sys
import google.generativeai as genai
import textwrap
import ast
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import pandas as pd
import faiss
import ragas
from ragas.testset import TestsetGenerator
#from ragas.testset.evolution import simple, reasoning, multi_context
from ragas.run_config import RunConfig
#from ragas.metrics import (
#    answer_relevancy,
#    faithfulness,
#    context_recall,
#    context_precision,
#)
from ragas.metrics import (
    Faithfulness,
    LLMContextPrecisionWithoutReference,
    LLMContextPrecisionWithReference, 
    NonLLMContextPrecisionWithReference
)
from ragas import SingleTurnSample, EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from datasets import Dataset

from IPython.display import display
from IPython.display import Markdown

In [3]:
def to_markdown(text):
    """Render ``text`` as a Markdown block quote.

    Bullet characters ('•') are rewritten as indented Markdown list markers,
    then every line (blank lines included) is prefixed with '> '.

    Returns the result wrapped in an IPython ``Markdown`` display object.
    """
    bulleted = text.replace('•', '  *')
    # The always-true predicate makes textwrap.indent prefix blank lines as well.
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

In [4]:
# Configure the google.generativeai client with the key from the environment;
# os.environ[...] raises KeyError if GOOGLE_API_KEY is not set.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [5]:
# Display RAGAS's do-not-track status.
# NOTE(review): `_analytics` is a private ragas module, so this call may break between versions.
ragas._analytics.do_not_track()

True

In [6]:
# Gemini chat model; a later cell assigns it to llama_index's Settings.llm.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [7]:
# create document database
# using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online
# Example from 2024:
# https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/
#files = ["./Speeches/state_of_the_union_042921.txt", "./Speeches/state_of_the_union_030122.txt", "./Speeches/state_of_the_union_020723.txt", "./Speeches/state_of_the_union_030724.txt"]
newfiles = ["./Speeches/titleedits/state_of_the_union_042921.txt", "./Speeches/titleedits/state_of_the_union_030122.txt", "./Speeches/titleedits/state_of_the_union_020723.txt", "./Speeches/titleedits/state_of_the_union_030724.txt"]

# Collect every non-blank line (trailing whitespace stripped) from all speeches into `sotu`.
# The encoding is passed explicitly because the transcripts contain non-ASCII punctuation
# (curly apostrophes, em dashes); without it, open() falls back to the platform's locale
# encoding. Assumes the files are saved as UTF-8.
sotu = []
for speech_path in newfiles:
    with open(speech_path, encoding="utf-8") as speech_file:
        for raw_line in speech_file:
            stripped = raw_line.rstrip()
            if stripped:
                sotu.append(stripped)

In [8]:
# Build one llama_index Document per non-blank speech line collected in `sotu`.
documents = [Document(text=line) for line in sotu]

In [9]:
# Example of a loaded Document line
documents[-1]

Document(id_='235d1f3b-a216-412c-8459-51d27c73c8d0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [7]:
# Set up the faiss index
d = 768 # dimensionality of the vectors stored in the index, i.e. of the embeddings from the model that we're going to use; in this case, the google embedding model
faiss_index = faiss.IndexFlatL2(d)
print(faiss_index.is_trained)

True


In [8]:
# Register the Google embedding model and the Gemini chat model on llama_index's Settings object.
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # optional: task_type="RETRIEVAL_DOCUMENT"
Settings.embed_model = doc_embeddings
Settings.llm = llm

In [9]:
# Send INFO-level (and above) log records to stdout.
# basicConfig() already installs a stdout StreamHandler on the root logger, so no second
# handler is attached here; adding another one prints every log message twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [15]:
## uncomment for when you need to re-embed and vectorize documents
## otherwise, doing local loading below
#vector_store = FaissVectorStore(faiss_index=faiss_index)
#storage_context = StorageContext.from_defaults(vector_store=vector_store)
#index = VectorStoreIndex.from_documents(
#    documents, storage_context=storage_context, show_progress=True
#)
## save index to disk
#index.storage_context.persist()
#index

In [10]:
# load index from disk
# Both the FAISS vector store and the storage context are read from the ./storage directory.
vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
#index = load_index_from_storage(storage_context=storage_context)
# index id '3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5' uses the speeches with a title that includes the date it was given
# The id is hard-coded, so this cell only works against a ./storage directory that contains that index.
index = load_index_from_storage(storage_context=storage_context, index_id='3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5')

INFO:root:Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
INFO:llama_index.core.indices.loading:Loading indices with ids: ['3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5']
Loading indices with ids: ['3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5']


In [11]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7faf185372d0>

In [16]:
query_engine = index.as_query_engine(similarity_top_k=10)

In [None]:
# Example query and response
query = "What has the President done related to healthcare?"
response = query_engine.query(query)

In [18]:
response.response

'The President has taken several actions related to healthcare, including establishing a special sign-up period for the Affordable Care Act, enacting tax credits to reduce health care premiums, and re-igniting the Cancer Moonshot. \n'

In [19]:
# Resulting scores 
for node in response.source_nodes:
    print(f"{node.get_score()} -> {node.text}")

0.7506474256515503 -> I’m pleased to say that more Americans have health insurance now than ever in history.
0.7599508762359619 -> During these 100 days, an additional 800,000 Americans enrolled in the Affordable Care Act when I established the special sign-up period to do that — 800,000 in that period.
0.7667317390441895 -> The Affordable Care Act has been a lifeline for millions of Americans, protecting people with preexisting conditions, protecting women’s health.  And the pandemic has demonstrated how badly — how badly it’s needed.  Let’s lower deductibles for working families on the Affordable Care — in the Affordable Care Act.  (Applause.)  And let’s lower prescription drug costs.  (Applause.)
0.796398401260376 -> A president, my predecessor, who failed the most basic duty. Any President owes the American people the duty to care.
0.801885724067688 -> Over one hundred million of you can no longer be denied health insurance because of pre-existing conditions.
0.8065693974494934 -> 

In [22]:
# Example of how to embed a sentence with Gemini
#result = genai.embed_content(
#    model="models/text-embedding-004",
#    content="What is the meaning of life?",
#    task_type="retrieval_document",
#    title="Embedding of single string")

In [13]:
# Example of setting up a chat engine with our index
chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [20]:
query = "What does the President say about Congress during these 4 years?"
response = chat_engine.chat(query)

In [21]:
print(response.response)

The provided excerpt doesn't give us a clear picture of the President's overall view of Congress during his 4-year term.  However, it does highlight a few key points:

* **He expresses a desire for bipartisanship:**  He states that if they could work together in the last Congress, there's no reason they can't work together in the new one. This suggests a willingness to collaborate with both Republicans and Democrats.
* **He acknowledges their role in the government:** He addresses them directly and respectfully, using titles like "Madam Speaker" and "Mr. Speaker," highlighting their importance in the political process. 
* **He doesn't explicitly criticize Congress:** While he doesn't explicitly praise them, he also doesn't criticize their performance. 

To understand the President's full perspective on Congress during his term, we'd need to look at more of his speeches, actions, and interactions with Congress throughout those four years. 



In [26]:
# Resulting scores
for node in response.source_nodes:
    print(f"{node.get_score()} -> {node.text}")

0.7417372465133667 -> Throughout our history, Presidents have come to this chamber to speak to Congress, to the nation, and to the world to declare war, to celebrate peace, to announce new plans and possibilities.
0.7774401307106018 -> To my Republican friends, if we could work together in the last Congress, there is no reason we can’t work together in this new Congress.
0.7898776531219482 -> So I have come here to fulfil my constitutional duty to report on the state of the union. And here is my report.
0.7941027879714966 -> I promised to be the president for all Americans.
0.8024358153343201 -> Madam Speaker, Madam Vice President — (applause) — no President has ever said those words from this podium.  No President has ever said those words, and it’s about time.  (Applause.)
0.8032354712486267 -> Mr. Speaker. Madam Vice President. Members of Congress. My Fellow Americans.
0.8043006658554077 -> And my report is this: the State of the Union is strong—because you, the American people, are

In [27]:
chat_engine.chat_history

[ChatMessage(role=<MessageRole.USER: 'user'>, content='What does the President say about Congress during these 4 years?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='The excerpt you provided focuses on the President\'s opening remarks to Congress, not on a broader assessment of their performance over four years. \n\nHowever, we can glean some insights:\n\n* **He emphasizes the need for bipartisanship:** He explicitly states, "To my Republican friends, if we could work together in the last Congress, there is no reason we can’t work together in this new Congress." This suggests a desire for collaboration across party lines.\n* **He acknowledges the presence of both Republican and Democratic leaders:** He addresses "Mitch and Chuck," likely referring to Mitch McConnell and Chuck Schumer, the leaders of the Republican and Democratic parties in the Senate, respectively. This implies a recognition of the different political forces at play.\n\nHowev

In [17]:
# Start of RAGAS implementation

In [18]:
# Generate synthetic test data
# Note: When generating a synthetic test dataset, the columns generated are 
# 'question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', 'episode_done'
# Ground truth is supposed to be the 'human' level answer vs the RAG answer
# We have to generate the answer separately with our RAG, which then (obviously) generates new context used.
# My best guess is to use the context that was used to generate the answer for the metrics calculation.
# Thus, in the below example, I keep the old contexts column, but for evaluating RAG, I use the new context to calculate the metrics.
# I can't find good documentation to confirm this; the following issues are close:
# https://github.com/explodinggradients/ragas/issues/1145
# https://github.com/explodinggradients/ragas/issues/1084

# Possible future edit:
# The best thing to do would be to generate the answer when creating the synthetic test dataset, but this is no longer done (perhaps done in RAGAS 1.0?)

# Also, a relevant warning from the same issue 1084:
# "Since you use the same LLM to generate your synthetic dataset ground_truth and your answer, 
# I think the results of this evaluation might be biased. I haven't realized a comparative study 
# but it might be an issue which could have an impact on your interpretation."

In [11]:
# Load documents for use in generating testset with RAGAS
# NOTE: this rebinds `documents`, which an earlier cell set to the list of llama_index Document
# objects built from `sotu`; from here on `documents` holds the DirectoryLoader results instead.
loader = DirectoryLoader("./Speeches/titleedits")
documents = loader.load()

In [12]:
documents[3].metadata

{'source': 'Speeches/titleedits/state_of_the_union_042921.txt'}

In [13]:
# Need to add 'filename' metadata for RAGAS
for document in documents:
    document.metadata['filename'] = document.metadata['source']

In [14]:
# generator with Gemini models
# NOTE(review): critic_llm is created here but is not referenced in the visible part of this notebook.
# NOTE: `embeddings` is assigned again in a later cell (without the timeout argument).
generator_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=240, max_output_tokens=8192) #, temperature=0.7, timeout=180, transport="rest"
critic_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", timeout=240) #timeout=180
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", timeout=240) #transport="rest" #, request_options={"timeout": 10} #, request_options={"maxConcurrency": 5}

In [15]:
generator_wrapper = LangchainLLMWrapper(generator_llm)
embeddings_wrapper = LangchainEmbeddingsWrapper(embeddings)

In [20]:
# Not working with RAGAS 2.6
generator = TestsetGenerator(llm=generator_wrapper, embedding_model=embeddings_wrapper)
# Increase the timeout settings
my_run_config = RunConfig(timeout=120, max_wait=70, max_workers=1)  # Increase timeout to 120 seconds
dataset = generator.generate_with_langchain_docs(documents, testset_size=10, run_config=my_run_config) # stopped here

Applying HeadlinesExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
ERROR:ragas.testset.transforms.engine:unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
ERROR:ragas.testset.transforms.engine:unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
ERROR:ragas.testset.transforms.engine:unable to apply transformation: The LLM generation was not com

Applying HeadlineSplitter:   0%|          | 0/4 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
ERROR:ragas.testset.transforms.engine:unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
ERROR:ragas.testset.transforms.engine:unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
unable to apply transformation: The LLM generation was not completed. Please increase try increasing the max_tokens and try again.
ERROR:ragas.testset.transforms.engine:unable to apply transformation: The LLM generation was not com

Applying CustomNodeFilter: 0it [00:00, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/4 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: Node 06e5b239-3c97-4ab3-84a4-6d819df54e27 has no summary_embedding
unable to apply transformation: Node 06e5b239-3c97-4ab3-84a4-6d819df54e27 has no summary_embedding
INFO:ragas.testset.synthesizers.multi_hop.abstract:found 0 clusters
found 0 clusters
INFO:ragas.testset.synthesizers.multi_hop.specific:found 0 clusters
found 0 clusters


ValueError: No nodes that satisfied the given filer. Try changing the filter.

In [21]:
# generate testset
def generate_testset_rate(docs, testset_size=50, run_config=None):
    """
    Generate a synthetic testset from LangChain documents with the rate-limited run config.

    Relies on the notebook-level `generator` (the TestsetGenerator) and, when no
    `run_config` is given, on the notebook-level `my_run_config`.

    Args:
        docs: LangChain documents to generate the testset from.
        testset_size: number of samples to request (defaults to 50).
        run_config: optional RunConfig; falls back to `my_run_config` when None.

    Returns:
        The result of `generator.generate_with_langchain_docs`.

    Note:
        The earlier version referenced `simple`, `reasoning`, `multi_context` (their import is
        commented out) and `run_config` (never assigned), so calling it raised NameError.
        It also passed `test_size=`, whereas the other call in this notebook uses
        `testset_size=`. The explicit question-type distribution is therefore no longer passed.
    """
    effective_config = my_run_config if run_config is None else run_config
    return generator.generate_with_langchain_docs(
        docs,
        testset_size=testset_size,
        raise_exceptions=True,
        run_config=effective_config,
    )

In [None]:
# The RAGAS internal RunConfig settings do a decent job at limiting the 429 resource exhausted warnings when max_workers=1
# Still very difficult to have this run successfully
# Tried ratelimit and backoff libraries in Python... didn't affect it enough, got so many warnings that it wouldn't finish
# Occasionally even the 1 max worker will not finish, but it will finish sometimes
testset = generate_testset_rate(documents) 

In [43]:
testset

NameError: name 'testset' is not defined

In [34]:
testset_pd = testset.to_pandas()

In [35]:
testset_pd

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What does the author argue is under attack reg...,[ workers they need and families don’t wait de...,The author argues that the constitutional righ...,simple,[{'source': 'Speeches/state_of_the_union_03012...,True
1,"In the speech from President Biden, why has ""t...",[ more than a million dollars a year and pay a...,"President Biden states that ""trickle-down econ...",simple,[{'source': 'Speeches/state_of_the_union_04292...,True
2,"What is the speaker's stance on the ""Buy Ameri...",[ right here in America where they belong!\n\n...,"The speaker believes in the ""Buy American"" pol...",simple,[{'source': 'Speeches/state_of_the_union_03072...,True
3,What measures does the speaker propose to lowe...,"[- a parent, a spouse, or child.\n\nAnd fourth...",The speaker proposes that Medicare should be g...,simple,[{'source': 'Speeches/state_of_the_union_04292...,True
4,What are President Biden's proposals for addre...,"[omes destroyed, neighborhoods in rubble, citi...",President Biden proposes a six-week ceasefire ...,simple,[{'source': 'Speeches/state_of_the_union_03072...,True
5,How does Biden compare a threat to American de...,"[’s argue over it, let’s debate it, but let’s ...",The context discusses the threat to American d...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True
6,What gun violence prevention actions did Presi...,[ job at another burger place to make a couple...,President Biden mentioned the need to ban assa...,reasoning,[{'source': 'Speeches/state_of_the_union_02072...,True
7,What economic policy did President Biden rejec...,"[- a parent, a spouse, or child.\n\nAnd fourth...",President Biden rejected trickle-down economic...,multi_context,[{'source': 'Speeches/state_of_the_union_04292...,True
8,How does the President's plan address societal...,[ workers they need and families don’t wait de...,The President's plan addresses societal issues...,multi_context,[{'source': 'Speeches/state_of_the_union_03012...,True
9,How will Biden address China's trade practices...,[ more than a million dollars a year and pay a...,Biden stated that he will defend America's int...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True


In [36]:
# testset_pd.to_csv('datasets/testset_flash_pro15.csv', index=False)

In [23]:
# Below code uses the resulting testset without generated answers to generate new answers

#  use if imported testset_pd from csv without answers
# this is a fix for 'contexts' column being saved as a string; needs to be a list
testset_pd = pd.read_csv("datasets/testset_flash_pro15.csv", index_col = None)
testset_pd['contexts'] = testset_pd['contexts'].apply(ast.literal_eval)

In [None]:
# generate answer column, per these two issues
# https://github.com/explodinggradients/ragas/issues/1145
# https://github.com/explodinggradients/ragas/issues/1084#issuecomment-2248219601

query_engine = index.as_query_engine(similarity_top_k=10)
answers = [query_engine.query(q) for q in testset_pd['question']]

In [25]:
# parse out new 'answer' and 'contexts' columns
# One answer string per query result, plus the text of every source node that result used.
answers_r = [result.response for result in answers]
context_n = [
    [scored.node.get_content() for scored in result.source_nodes]
    for result in answers
]

# Keep the testset's original contexts under 'contexts_gt', then attach the
# retrieved contexts and the generated answers as 'contexts' and 'answer'.
testset_pd = testset_pd.rename(columns={"contexts":"contexts_gt"}).assign(
    contexts=context_n,
    answer=answers_r,
)

In [86]:
#testset_pd.to_csv('datasets/testset_answer_newcontext_flash_pro15.csv', index=False)

In [16]:
testset_pd = pd.read_csv("datasets/unlabeled_dataset/unlabeled_dataset.csv", index_col = None) # datasets/manual_dataset_complete.csv # testset_answer_newcontext_flash_pro15.csv

In [17]:
testset_pd = testset_pd.rename(columns={"Query": "user_input", "Answer": "response", "Expected_Output": "reference", "Contexts": "orig_contexts", "Source_File":"source_file", "Document": "retrieved_contexts"})

In [18]:
testset_pd

Unnamed: 0,user_input,response,reference,orig_contexts,source_file,retrieved_contexts
0,Identify specific examples of government inves...,The transcontinental railroad and the intersta...,The speech highlights several examples: the tr...,; discovering vaccines; gave us the Internet a...,Speeches/titleedits/state_of_the_union_042921.txt,"['Throughout our history, if you think about i..."
1,"Does the American Jobs Plan, a large-scale inv...",The plan seeks to create jobs by modernizing i...,The American Jobs Plan aims to create jobs by ...,; discovering vaccines; gave us the Internet a...,Speeches/titleedits/state_of_the_union_042921.txt,['The American Jobs Plan creates jobs replacin...
2,Considering the significant impact of cancer o...,Investing in cancer research is a priority bec...,Investing in cancer research is a priority bec...,"But so many of us have deceased sons, daughter...",Speeches/titleedits/state_of_the_union_042921.txt,"['But so many of us have deceased sons, daught..."
3,How does the President's viewpoint on infrastr...,The President believes that infrastructure inv...,The President emphasizes that infrastructure i...,"But so many of us have deceased sons, daughter...",Speeches/titleedits/state_of_the_union_042921.txt,"['Investments in jobs and infrastructure, like..."
4,Analyze the potential economic consequences of...,"A progressive tax structure, where higher earn...",The President advocates for raising taxes on c...,you should be able to become a billionaire an...,Speeches/titleedits/state_of_the_union_042921.txt,['When you hear someone say that they don’t wa...
...,...,...,...,...,...,...
795,Explain how the president's call for aid to Ga...,The president emphasizes the importance of inc...,The president's call for aid to Gaza is direct...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"['Tonight, I’m directing the U.S. military to ..."
796,"If a ceasefire were to fail, how would the Pre...",The President's proposed humanitarian efforts ...,The President's proposal for a temporary pier ...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"['Tonight, I’m directing the U.S. military to ..."
797,What steps is the US President taking to achie...,The US President is directing the military to ...,The US President is working to achieve a cease...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"['Tonight, I’m directing the U.S. military to ..."
798,Identify the central theme of the President's ...,The President strongly advocates for reproduct...,The President's opening remarks regarding repr...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt,"['Like most Americans, I believe Roe v. Wade g..."


In [19]:
# use if imported testset_pd from csv
# this is a fix for the 'retrieved_contexts' column being saved as a string; needs to be a list
# Only string cells are parsed, so re-running this cell after the column has already been
# converted leaves the lists untouched instead of raising inside ast.literal_eval.

testset_pd['retrieved_contexts'] = testset_pd['retrieved_contexts'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
#testset_pd['contexts_gt'] = testset_pd['contexts_gt'].apply(ast.literal_eval)

In [20]:
testset_pd

Unnamed: 0,user_input,response,reference,orig_contexts,source_file,retrieved_contexts
0,Identify specific examples of government inves...,The transcontinental railroad and the intersta...,The speech highlights several examples: the tr...,; discovering vaccines; gave us the Internet a...,Speeches/titleedits/state_of_the_union_042921.txt,"[Throughout our history, if you think about it..."
1,"Does the American Jobs Plan, a large-scale inv...",The plan seeks to create jobs by modernizing i...,The American Jobs Plan aims to create jobs by ...,; discovering vaccines; gave us the Internet a...,Speeches/titleedits/state_of_the_union_042921.txt,[The American Jobs Plan creates jobs replacing...
2,Considering the significant impact of cancer o...,Investing in cancer research is a priority bec...,Investing in cancer research is a priority bec...,"But so many of us have deceased sons, daughter...",Speeches/titleedits/state_of_the_union_042921.txt,"[But so many of us have deceased sons, daughte..."
3,How does the President's viewpoint on infrastr...,The President believes that infrastructure inv...,The President emphasizes that infrastructure i...,"But so many of us have deceased sons, daughter...",Speeches/titleedits/state_of_the_union_042921.txt,"[Investments in jobs and infrastructure, like ..."
4,Analyze the potential economic consequences of...,"A progressive tax structure, where higher earn...",The President advocates for raising taxes on c...,you should be able to become a billionaire an...,Speeches/titleedits/state_of_the_union_042921.txt,[When you hear someone say that they don’t wan...
...,...,...,...,...,...,...
795,Explain how the president's call for aid to Ga...,The president emphasizes the importance of inc...,The president's call for aid to Gaza is direct...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
796,"If a ceasefire were to fail, how would the Pre...",The President's proposed humanitarian efforts ...,The President's proposal for a temporary pier ...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
797,What steps is the US President taking to achie...,The US President is directing the military to ...,The US President is working to achieve a cease...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
798,Identify the central theme of the President's ...,The President strongly advocates for reproduct...,The President's opening remarks regarding repr...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt,"[Like most Americans, I believe Roe v. Wade go..."


In [21]:
# per this ragas thread, need to convert pandas testset to Dataset format for evaluate to work
# https://github.com/explodinggradients/ragas/issues/803
#testset_ds = Dataset.from_pandas(testset_pd.drop("contexts_gt", axis=1))
testset_ds = Dataset.from_pandas(testset_pd.drop("orig_contexts", axis=1))

In [22]:
testset_ds

Dataset({
    features: ['user_input', 'response', 'reference', 'source_file', 'retrieved_contexts'],
    num_rows: 800
})

In [23]:
eval_dataset = EvaluationDataset.from_hf_dataset(testset_ds)

In [24]:
eval_dataset

EvaluationDataset(features=['user_input', 'retrieved_contexts', 'response', 'reference'], len=800)

In [39]:
# Note: I'm using the normal LLM, not the RAG context-loaded query engine
# There is code at the bottom of the notebook for using the query engine, which should be the way to go
# However, that code appears to be broken from RAGAS right now, so I was forced to use the regular Gemini LLM
# It appears that evaluate (below) may re-run the query and gives new answers and contexts anyway...
# Watch this issue: https://github.com/explodinggradients/ragas/issues/1211

ragas_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=120) # try request_timeout=120
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [25]:
# Increase the timeout settings
# A single worker (max_workers=1) is used for the RAGAS calls that receive this config.
my_run_config = RunConfig(timeout=120, max_wait=70, max_workers=1)  # Increase timeout to 120 seconds
#run_config = RunConfig(timeout=600, max_wait=600, max_workers=1, max_retries=10)  # alternative: increase timeout to 600 seconds, with up to 10 retries

In [26]:
# Context-precision metric in its "without reference" variant, judged by generator_wrapper.
# NOTE(review): generator_wrapper is not defined in this part of the notebook - presumably
# it is created in an earlier cell; confirm it exists on a fresh kernel.
context_precision = LLMContextPrecisionWithoutReference(llm=generator_wrapper)

In [27]:
# Run the RAGAS evaluation. Only context precision is enabled; the other metrics are
# parked as comments inside the list so they can be switched back on.
# Optional parameter:
# in_ci: bool, Whether the evaluation is running in CI or not.
# If set to True then some metrics will be run to increase the reproducibility of the evaluations.
# This will increase the runtime and cost of evaluations. Default is False.
# NOTE(review): the recorded run of this cell logged LLMDidNotFinishException for every
# job shown and was interrupted by hand; the message asks for a larger max_tokens on the
# evaluator LLM - check how generator_wrapper is configured before re-running all 800 rows.
evalresult = evaluate(
    # Use the EvaluationDataset built from testset_ds rather than the raw HF dataset,
    # so the object that was constructed for this call is the one that is scored.
    dataset=eval_dataset,
    metrics=[
        context_precision,
        # faithfulness,
        # answer_relevancy,
        # context_recall,
    ],
    llm=generator_wrapper,
    embeddings=embeddings_wrapper,
    run_config=my_run_config,
)

Evaluating:   0%|          | 0/800 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[0]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
Exception raised in Job[0]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
ERROR:ragas.executor:Exception raised in Job[1]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
Exception raised in Job[1]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
ERROR:ragas.executor:Exception raised in Job[2]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
Exception raised in Job[2]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
ERROR:ragas.executo

KeyboardInterrupt: 

ERROR:ragas.executor:Exception raised in Job[8]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
Exception raised in Job[8]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
ERROR:ragas.executor:Exception raised in Job[9]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
Exception raised in Job[9]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
ERROR:ragas.executor:Exception raised in Job[10]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
Exception raised in Job[10]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
ERROR:ragas.execu

In [43]:
evalresult

{'context_precision': 0.6944, 'faithfulness': 1.0000, 'answer_relevancy': 0.6770, 'context_recall': 0.6455}

In [61]:
# Example result:
# {'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}
# Reran
# {'context_precision': 0.5979, 'faithfulness': 1.0000, 'answer_relevancy': 0.6533, 'context_recall': 0.8000}
evalresult

{'context_precision': 0.5979, 'faithfulness': 1.0000, 'answer_relevancy': 0.6533, 'context_recall': 0.8000}

In [None]:
# Note: received a warning for the answer where there was no response from the LLM; this definitely reduced the faithfulness score

# Results:
# Using contexts generated when produced answers from LLM:
# new testset_answer_newcontext_flash_pro15.csv result, with contexts_gt (aka contexts generated with ground truth generation) column removed
# {'context_precision': 0.4171, 'faithfulness': 0.9167, 'answer_relevancy': 0.6509, 'context_recall': 0.8000}
# reran
# {'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}
# reran
# {'context_precision': 0.5979, 'faithfulness': 1.0000, 'answer_relevancy': 0.6533, 'context_recall': 0.8000}

# Compared to using contexts generated for ground truth (probably not correct):
# new testset_answer_newcontext_flash_pro15.csv result, using old contexts
# {'context_precision': 0.7500, 'faithfulness': 0.7392, 'answer_relevancy': 0.6041, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.8500, 'faithfulness': 0.7123, 'answer_relevancy': 0.5934, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.7500, 'faithfulness': 0.6556, 'answer_relevancy': 0.5638, 'context_recall': 1.0000}

In [None]:
# Evaluation results on metrics:

# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics
# I don't have example ranges to compare anything to, so below is my best guess.

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# 0.9167 - 1.0000 indicates that the LLM is staying true to the facts provided in the context for answering the question.
# There is another Faithfulness metric: from ragas.metrics import FaithulnesswithHHEM
# This uses a huggingface model to help detect hallucination : https://huggingface.co/vectara/hallucination_evaluation_model
# See below for code : {'faithfulness_with_hhem': 0.6319} 
# This doesn't really agree with the RAGAS faithfulness score... may need to dive in further another time.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# At 0.4171 - 0.5979, this suggests that the context isn't particularly relevant to the question.
# Answer_relevancy - Measures how relevant the answer is to the question.
# 0.6509 - 0.6533 seems moderately low, just going off of the number.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.
# 0.8 indicates that the llm context is decently good and can typically answer the question or most of it. 

In [105]:
# Test run: compare against the original contexts ("orig_contexts", presumably the former
# contexts_gt / contexts_old column) instead of the newer contexts generated with the answer.
# The frame no longer has "contexts" / "contexts_old" columns - the conversion cell and the
# dataset features show "orig_contexts" and "retrieved_contexts" - so the old names would
# raise a KeyError on a fresh run.
# Each original context is wrapped in a list (unless it already is one) because the
# evaluation expects a list of context strings per row under "retrieved_contexts".
testset_ds_oldcontext = Dataset.from_pandas(
    testset_pd
    .assign(
        retrieved_contexts=lambda frame: frame["orig_contexts"].map(
            lambda ctx: ctx if isinstance(ctx, list) else [ctx]
        )
    )
    .drop(columns=["orig_contexts"])
)

In [None]:
# Score the old-context dataset on all four metrics with the plain Gemini LLM/embeddings.
# Uses my_run_config: it is the run configuration actually defined in this notebook
# (the only `run_config = ...` assignment in view is commented out).
evalresult_old2 = evaluate(
    testset_ds_oldcontext,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm=ragas_llm,
    embeddings=embeddings,
    run_config=my_run_config,
)

In [None]:
# new testset_answer_newcontext_flash_pro15.csv result, using old contexts
# {'context_precision': 0.7500, 'faithfulness': 0.7392, 'answer_relevancy': 0.6041, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.8500, 'faithfulness': 0.7123, 'answer_relevancy': 0.5934, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.7500, 'faithfulness': 0.6556, 'answer_relevancy': 0.5638, 'context_recall': 1.0000}
evalresult_old2

In [None]:
# RAGAS also has an additional Faithfulness with HHEM metric (yes - the class name is
# misspelled in their documentation) that uses a HuggingFace model to detect hallucinations.
# Note: There's a message on HuggingFace about the token indices sequence length error being
# normal and an artifact; thus, ignoring that error when it shows up in the output.
# https://huggingface.co/vectara/hallucination_evaluation_model
try:
    from ragas.metrics import FaithulnesswithHHEM
except ImportError:
    # NOTE(review): newer RAGAS releases appear to export the corrected spelling only;
    # fall back to it while keeping the name this notebook was written against.
    from ragas.metrics import FaithfulnesswithHHEM as FaithulnesswithHHEM

faithfulness_with_hhem = FaithulnesswithHHEM()
# Uses my_run_config: it is the run configuration actually defined in this notebook
# (the only `run_config = ...` assignment in view is commented out).
result_faithfulness_hhem = evaluate(
    testset_ds,
    metrics=[faithfulness_with_hhem],
    llm=ragas_llm,
    embeddings=embeddings,
    run_config=my_run_config,
)

In [158]:
# with context from answer generation:
# {'faithfulness_with_hhem': 0.6319}
# testing: with context from ground truth/synthetic testset generation
# {'faithfulness_with_hhem': 0.5241}
# this seems to agree with the RAGAS faithfulness score in that answers seem to be partially made up.
result_faithfulness_hhem

{'faithfulness_with_hhem': 0.6319}

In [84]:
# Extra non-working code:

In [85]:
# Code to use the query_engine in the evaluation 
# Modeled after this tutorial: https://docs.ragas.io/en/latest/howtos/applications/compare_llms.html

# Does not currently work: for some metrics, it is not finding the 'ground_truth' column in the dataset
# For other metrics, appears to run but returns the below errors and returns 'nan' for results

In [64]:
# start of testing to try and get rag query engine for evaluate
# start of testing to try and get rag query engine for evaluate
def generate_responses(query_engine, test_questions, test_answers):
    """Query the engine with every question and collect the results as a Dataset.

    Args:
        query_engine: Object whose ``query(question)`` returns a result exposing
            ``response`` and ``source_nodes`` (each node wrapper offering
            ``node.get_content()``).
        test_questions: Questions to send to the engine, in order.
        test_answers: Ground-truth entries aligned with ``test_questions``, or
            ``None`` to leave the "ground_truth" column out.

    Returns:
        ``Dataset.from_dict`` of the columns "question", "answer" and "contexts",
        plus "ground_truth" when ``test_answers`` is given.
    """
    # Run every query first, then pull the answer and context texts out of the results.
    results = [query_engine.query(question) for question in test_questions]

    columns = {
        "question": test_questions,
        "answer": [result.response for result in results],
        "contexts": [
            [hit.node.get_content() for hit in result.source_nodes]
            for result in results
        ],
    }
    if test_answers is not None:
        columns["ground_truth"] = test_answers
    return Dataset.from_dict(columns)

# Inputs for the query-engine run, taken from the test-set frame.
# NOTE(review): this reads "question" / "answer" columns, while the dataset built earlier
# in the notebook exposes "user_input" / "response" / "reference" - confirm the column
# names before running this cell.
test_questions = testset_pd["question"].values.tolist()
# Every answer is wrapped in its own single-item list.
test_answers = [[answer] for answer in testset_pd["answer"].values.tolist()]

result_ds = generate_responses(
    query_engine=query_engine,
    test_questions=test_questions,
    test_answers=test_answers,
)

In [None]:
# Note: This evaluate function that uses the query_engine does not return results (nan for all metrics)
# Errors (below are repeated many times):
# WARNING:ragas.llms.base:n values greater than 1 not support for LlamaIndex LLMs
# n values greater than 1 not support for LlamaIndex LLMs
# INFO:ragas.llms.base:callbacks not supported for LlamaIndex LLMs, ignoring callbacks
# callbacks not supported for LlamaIndex LLMs, ignoring callbacks
# ERROR:ragas.executor:Exception raised in Job[5]: TimeoutError()
# Exception raised in Job[5]: TimeoutError()
# ERROR:ragas.executor:Exception raised in Job[19]: AttributeError('ChatGoogleGenerativeAI' object has no attribute 'acomplete')
# Exception raised in Job[19]: AttributeError('ChatGoogleGenerativeAI' object has no attribute 'acomplete')
#
# NOTE(review): the AttributeError suggests this evaluate variant expects a LlamaIndex LLM,
# whereas ragas_llm is a LangChain ChatGoogleGenerativeAI - that mismatch is a likely cause
# of the nan results; confirm and pass a LlamaIndex LLM/embeddings pair instead.

# Imported under an alias so the global `evaluate` used by the earlier cells is not
# rebound to the LlamaIndex variant when this cell runs.
from ragas.integrations.llama_index import evaluate as llama_index_evaluate

# Uses my_run_config: it is the run configuration actually defined in this notebook
# (the only `run_config = ...` assignment in view is commented out).
eval_qe2 = llama_index_evaluate(
    query_engine=query_engine,
    dataset=result_ds,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_utilization,
    ],
    llm=ragas_llm,
    embeddings=embeddings,
    run_config=my_run_config,
)

In [63]:
eval_qe2

{'faithfulness': nan, 'answer_relevancy': nan, 'context_utilization': nan}