In [1]:
# This notebook uses the RAGAS evaluation library to evaluate several metrics for a RAG pipeline
# I use the Google Gemini API (free tier, local API key), but RAGAS is compatible with several LLMs

# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# RAGAS: https://docs.ragas.io/en/stable/
# I used RAGAS v0.1.15 (mine was dev version from Github, but 0.1.15 from PyPI should work fine). 

# Note: I had to edit the underlying RAGAS library (cloned locally, edited files, then installed with `pip install -e .`) to work around this temperature issue with Gemini:
# https://github.com/explodinggradients/ragas/pull/657/files
# https://github.com/explodinggradients/ragas/issues/678
# Edits simply remove the temperature variable from the relevant source files; see notes.txt for more specific info

# RAGAS had a large redesign from version 0.1 to 0.2 : https://docs.ragas.io/en/stable/howtos/migrations/migrate_from_v01_to_v02/
# - Note that on RAGAS version 0.2.6, the temperature edits did not make RAGAS compatible with Gemini.
# - There are open Github issues about Gemini's compatibility with RAGAS version 0.2+; more time and edits would be needed.

In [2]:
# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# Answer_relevancy - Measures how relevant the answer is to the question.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.

# Faithfulness with HHEM - Similar to Faithfulness but uses a HuggingFace model (Vectara's HHEM 2.1 classifier) to detect hallucinations
# https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html#faithfullness-with-hhem-2-1-model
# https://huggingface.co/vectara/hallucination_evaluation_model

# RAGAS has other metrics as well : https://docs.ragas.io/en/latest/concepts/metrics/index.html

In [1]:
# Set do not track variable for RAGAS
# More info: https://github.com/explodinggradients/ragas/issues/49
import os
os.environ["RAGAS_DO_NOT_TRACK"] = "True"

In [2]:
import logging
import sys
import textwrap
import ast
import time
import pandas as pd
from IPython.display import display
from IPython.display import Markdown

# Replace these two Google Gemini imports with imports for your LLM
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss

import ragas
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.run_config import RunConfig
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate
from datasets import Dataset

In [3]:
def to_markdown(text):
    """Render `text` as a Markdown block quote.

    Bullet characters ('•') are converted to indented Markdown list markers,
    and every line (including blank ones) is prefixed with '> ' so the whole
    text displays as a quote in the notebook.
    """
    bulleted = text.replace('•', '  *')
    # keepends=True preserves the original line terminators while prefixing each line
    quoted = "".join("> " + line for line in bulleted.splitlines(True))
    return Markdown(quoted)

In [4]:
# Double check the RAGAS do not track setting
ragas._analytics.do_not_track()

True

In [5]:
# Set up local API key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [None]:
# Establish RAG pipeline with Gemini

In [7]:
# # Create a Faiss vector store for RAG
# # If you already have an index created, skip a few coding cells to the LLM / embeddings setup

# # Example of creating a small vector store
# # Using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online, edited to include a title with the date of the speech
# # Example from 2024:
# # https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/

# # load and parse files
# sotu = []
# newfiles = ["./Speeches/titleedits/state_of_the_union_042921.txt", "./Speeches/titleedits/state_of_the_union_030122.txt", "./Speeches/titleedits/state_of_the_union_020723.txt", "./Speeches/titleedits/state_of_the_union_030724.txt"]
# for i in newfiles:
#     with open(i) as file:
#         for line in file:
#             nl = line.rstrip()
#             if nl != '':
#                 sotu.append(nl)

# # convert into Document format
# documents = [Document(text=line) for line in sotu]

In [9]:
# # Example of a loaded Document line
# documents[-1]

Document(id_='235d1f3b-a216-412c-8459-51d27c73c8d0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [None]:
# # Set up the faiss index
# d = 768 # dimensionality of the vectors produced by the embedding model we're going to use; in this case, the Google embedding model
# faiss_index = faiss.IndexFlatL2(d)
# print(faiss_index.is_trained) # sanity check; a flat L2 index needs no training, so this should print True

In [6]:
# Set up the llm, embeddings, and Settings for Faiss 
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") # Replace with your LLM
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # Replace with your embeddings model
Settings.embed_model = doc_embeddings # used for LlamaIndex FaissVectorStore
Settings.llm = llm # used for LlamaIndex FaissVectorStore

In [7]:
# Send INFO-level log records to stdout so they show up in the cell output.
# basicConfig already attaches a stdout handler to the root logger; adding a second
# StreamHandler on top of it made every record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [8]:
# # Uncomment for when you need to re-embed and vectorize documents

# vector_store = FaissVectorStore(faiss_index=faiss_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context, show_progress=True
# )

# # Save index to disk
# index.storage_context.persist()

# # Save/remember index id for loading next time
# index.index_id

In [8]:
# After you have a saved index, load that index for RAG answer generation:

# load index from disk
# Location of the persisted index and the id it was saved under
PERSIST_DIR = "./storage"
# My local index id '3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5' uses the 4 speeches including a title that includes the date it was given
SAVED_INDEX_ID = '3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5'

# Rebuild the Faiss vector store and storage context from disk, then load the index by id
vector_store = FaissVectorStore.from_persist_dir(PERSIST_DIR)
storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context=storage_context, index_id=SAVED_INDEX_ID)

INFO:root:Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
INFO:llama_index.core.indices.loading:Loading indices with ids: ['3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5']
Loading indices with ids: ['3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5']


In [11]:
# # Optional- if you'd like to query your index
# # Set up query and chat engines with the index
# query_engine = index.as_query_engine(similarity_top_k=10)
# chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [None]:
# # Example query and response with Gemini and query_engine
# query = "What has the President done related to healthcare?"
# response = query_engine.query(query) 
# print(response.response)

In [None]:
# # Get ranked scores for top k RAG source nodes
# for node in response.source_nodes:
#     print(f"{node.get_score()} -> {node.text}")

In [None]:
# # Example of using the chat engine with our index
# query = "You are an expert speech analyst and specialize in analyzing Presidential State of the Union speeches. Could you please analyze the speeches and generate 2 questions and answers from each speech, providing the document filename of each speech that relates to each question?"
# response = chat_engine.chat(query) 
# print(response.response)

In [22]:
# # View chat history
# chat_engine.chat_history

In [20]:
# Code for RAGAS evaluation library to work with Gemini and our local RAG setup

In [21]:
# Example of generating synthetic dataset with RAGAS

# In a synthetic dataset, columns generated are 'question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', and 'episode_done'
# Ground truth is supposed to be the 'human' level answer vs the RAG answer

# Notes: 
# - We have to generate the answer separately with our RAG, which then generates new context used.
# - I use the context that was used to generate the answer for the metrics calculation, while still saving the old contexts column.
# - The best thing to do would be to generate the answer when creating the synthetic test dataset, but this is not available in RAGAS.
# - From a Github issue: Since you use the same LLM to generate your synthetic dataset ground_truth and your answer, 
# - it is possible the results of the RAG evaluation might be biased. This has not been studied.

In [10]:
# Load documents for use in generating synthetic dataset with RAGAS
loader = DirectoryLoader("./Speeches/titleedits") # Loads all docs in the directory; there are parameters for ignoring or matching certain files
documents = loader.load()

In [11]:
# RAGAS needs a 'filename' metadata entry on each document; reuse the loader's 'source' path for it
for doc in documents:
    doc.metadata.update(filename=doc.metadata['source'])

In [38]:
# Synthetic testset generator with Gemini models
generator_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=240) # Other notable parameters: temperature=0.7, transport="rest"
critic_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", timeout=240) 
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", request_options={"timeout": 240}) 

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

In [39]:
# Increase the timeout settings with RAGAS's RunConfig class

# Note: For Gemini, the RAGAS internal RunConfig settings do a decent job at limiting the 429 resource exhausted warnings
# (max_workers=1 still can send more requests to Gemini than the 15 requests per minute it allows)
# Still very difficult to have the testset generation run successfully with Gemini free tier
# I also tried the ratelimit and backoff libraries in Python, but I still got so many 429 warnings that the generation failed
# Sometimes even the 1 max worker will not finish, but it will finish occasionally

run_config = RunConfig(timeout=240, max_retries=20, max_wait=240, max_workers=1)

In [21]:
# Generate the synthetic dataset/testset
def generate_testset_rate(docs=None, *, documents=None, run_config=None, test_size=50, distributions=None):
    """
    Calls the LLM and embeddings model to generate the synthetic dataset with a rate-limit run_config.

    Args:
        docs: LangChain source documents to generate questions from (positional form).
        documents: Keyword alias for `docs`; used only when `docs` is not given.
        run_config: RunConfig passed to the generator. When omitted, the notebook-level
            `run_config` variable is used.
        test_size: Number of test samples to generate.
        distributions: Mapping of evolution type to its share of the generated questions.
            Defaults to 50% simple, 25% reasoning, 25% multi-context.

    Returns:
        The testset object returned by `generator.generate_with_langchain_docs`.

    Raises:
        ValueError: If neither `docs` nor `documents` is provided.
    """
    source_docs = docs if docs is not None else documents
    if source_docs is None:
        raise ValueError("Pass the LangChain documents via `docs` (or the `documents` keyword).")

    if run_config is None:
        # The parameter shadows the notebook-level variable of the same name,
        # so look the notebook's RunConfig up explicitly.
        run_config = globals()["run_config"]

    if distributions is None:
        distributions = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

    testset = generator.generate_with_langchain_docs(documents=source_docs, # LangChain source documents
                                                     test_size=test_size, # number of test samples to generate
                                                     distributions=distributions,
                                                     is_async=False,
                                                     raise_exceptions=True,
                                                     run_config=run_config)
    return testset

In [None]:
# Pass the documents positionally: the function's first parameter is `docs`, and it picks up
# the notebook-level `run_config` on its own.
testset = generate_testset_rate(documents)
testset_pd = testset.to_pandas()

In [34]:
# # Save generated testset to csv 
# testset_pd.to_csv('datasets/testset_flash_pro15.csv', index=False)

In [23]:
# Now generate answers for the testset, as answers are not automatically generated at creation

# testset_pd = pd.read_csv("datasets/testset_flash_pro15.csv", index_col = None)

# Note: When saving, the 'contexts' column is saved as a string but needs to be a list
# If you are importing testset_pd from a csv file, use the below code to change the column to a list

# testset_pd['contexts'] = testset_pd['contexts'].apply(ast.literal_eval)

In [None]:
# Generate answers
# Run every testset question through the RAG query engine (top 10 retrieved nodes per query)
query_engine = index.as_query_engine(similarity_top_k=10)
answers = list(map(query_engine.query, testset_pd['question']))

In [25]:
# Pull the answer text and the retrieved context passages out of each RAG response
answers_new = [result.response for result in answers]
context_new = [
    [hit.node.get_content() for hit in result.source_nodes]
    for result in answers
]

# Keep the contexts used for testset/query generation under 'contexts_gt' (gt = ground truth),
# then attach the contexts and answers produced by the RAG pipeline
testset_pd = (
    testset_pd
    .rename(columns={"contexts":"contexts_gt"})
    .assign(contexts=context_new, answer=answers_new)
)

# Save complete synthetically created dataset/testset
# testset_pd.to_csv('datasets/ragas_full_testset_flash_pro15.csv', index=False)

In [None]:
# Evaluate a dataset with RAGAS

In [15]:
# Read in dataset for evaluation and map its column names onto the ones used for evaluation.
# RAGAS expects the following columns (rename in dataset as needed) : "question", "answer", "ground_truth", "source_file", "contexts"
eval_column_map = {"Query": "question", "Answer": "answer", "Expected_Output": "ground_truth", "Contexts": "orig_contexts", "Source_File":"source_file", "Document": "contexts"}
testset_pd = (
    pd.read_csv("datasets/unlabeled_dataset/unlabeled_dataset.csv", index_col = None)
    .rename(columns=eval_column_map)
)

# Note: When saving a synthetic dataset, the 'contexts' column is saved as a string but needs to be a list for evaluation
# Parsing the stored string back into a Python list is required whenever testset_pd comes from a csv file
testset_pd = testset_pd.assign(contexts=testset_pd['contexts'].apply(ast.literal_eval))

In [16]:
testset_pd

Unnamed: 0,question,answer,ground_truth,orig_contexts,source_file,contexts
0,Identify specific examples of government inves...,The transcontinental railroad and the intersta...,The speech highlights several examples: the tr...,; discovering vaccines; gave us the Internet a...,Speeches/titleedits/state_of_the_union_042921.txt,"[Throughout our history, if you think about it..."
1,"Does the American Jobs Plan, a large-scale inv...",The plan seeks to create jobs by modernizing i...,The American Jobs Plan aims to create jobs by ...,; discovering vaccines; gave us the Internet a...,Speeches/titleedits/state_of_the_union_042921.txt,[The American Jobs Plan creates jobs replacing...
2,Considering the significant impact of cancer o...,Investing in cancer research is a priority bec...,Investing in cancer research is a priority bec...,"But so many of us have deceased sons, daughter...",Speeches/titleedits/state_of_the_union_042921.txt,"[But so many of us have deceased sons, daughte..."
3,How does the President's viewpoint on infrastr...,The President believes that infrastructure inv...,The President emphasizes that infrastructure i...,"But so many of us have deceased sons, daughter...",Speeches/titleedits/state_of_the_union_042921.txt,"[Investments in jobs and infrastructure, like ..."
4,Analyze the potential economic consequences of...,"A progressive tax structure, where higher earn...",The President advocates for raising taxes on c...,you should be able to become a billionaire an...,Speeches/titleedits/state_of_the_union_042921.txt,[When you hear someone say that they don’t wan...
...,...,...,...,...,...,...
795,Explain how the president's call for aid to Ga...,The president emphasizes the importance of inc...,The president's call for aid to Gaza is direct...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
796,"If a ceasefire were to fail, how would the Pre...",The President's proposed humanitarian efforts ...,The President's proposal for a temporary pier ...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
797,What steps is the US President taking to achie...,The US President is directing the military to ...,The US President is working to achieve a cease...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
798,Identify the central theme of the President's ...,The President strongly advocates for reproduct...,The President's opening remarks regarding repr...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt,"[Like most Americans, I believe Roe v. Wade go..."


In [17]:
# At least for RAGAS v0.1.15, we need to convert the pandas testset into Dataset format for the evaluate function to work
# Note: I am also dropping the original contexts column here
testset_ds = Dataset.from_pandas(testset_pd.drop("orig_contexts", axis=1))

In [18]:
testset_ds

Dataset({
    features: ['question', 'answer', 'ground_truth', 'source_file', 'contexts'],
    num_rows: 800
})

In [19]:
# Note: I'm using the normal LLM, not the RAG context-loaded query engine
# There is code at the bottom of the notebook for using the query engine, which should be the way to go
# However, that code appears to be broken from RAGAS right now, so I was forced to use the regular Gemini LLM

# Note: The RAGAS evaluate function (below) may re-run the query and give new answers and contexts
# See this issue: https://github.com/explodinggradients/ragas/issues/1211
# In testing, the results still output the same answers and contexts as I started with, so I'm not concerned by this

ragas_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=600) 
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [20]:
# Increase the timeout settings
run_config = RunConfig(timeout=300, max_wait=3000, max_workers=1, max_retries=10)

In [None]:
# Two coding options for running evaluate:
# 1) Bulk run with the evaluate function, as intended. 
# Unfortunately rate-limiting does not work well with this, 
# and 2/3 of my 800 example dataset received NaN results because of rate limiting issues.
# Ex: Evaluating 1 example for 1 metric resulted in 10 API calls.

# 2) Run the evaluation in small batches
# This allowed me to finish the evaluation of the entire dataset without rate limiting errors.

# Bulk evaluation of the dataset
# Score the whole dataset in one call with all four metrics.
# (The comma after context_precision was missing, which made this cell a SyntaxError.)
evalresult = evaluate(
    metrics = [
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall
    ],
    dataset = testset_ds,
    llm = ragas_llm,
    embeddings=embeddings,
    run_config=run_config
)

# Optional parameter: in_ci: bool, Whether the evaluation is running in CI or not. 
# If set to True then some metrics will be run to increase the reproducibility of the evaluations. 
# This will increase the runtime and cost of evaluations. Default is False.
# In practice, setting in_ci = True resulted in a lot of timeouts / no score calculated / NaN

In [61]:
# Example result:
# {'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}
evalresult

{'context_precision': 0.5979, 'faithfulness': 1.0000, 'answer_relevancy': 0.6533, 'context_recall': 0.8000}

In [34]:
# evalresult.to_pandas() # Returns the dataset with scores for each example

In [21]:
testset_results = pd.DataFrame()

In [22]:
scores = []

In [23]:
# Iterate through dataset for smaller batches to be evaluated
# Evaluate the dataset in small batches to avoid the rate-limit failures of the bulk run.
# `scores` and `testset_results` are extended after every batch, so an interrupted run
# keeps everything that has already been scored.
# NOTE(review): the saved outputs below only show context_precision; trim the metrics
# list if that is all you need, since each extra metric adds API calls.
batch_start, batch_stop, batch_size = 747, 800, 1  # rows [batch_start, batch_stop), batch_size rows per call
for i in range(batch_start, batch_stop, batch_size):
    tempdataset = testset_ds.select(range(i, min(i + batch_size, batch_stop)))
    print(i)
    evalresult = evaluate(
        metrics = [
            context_precision,  # this comma was missing, which made the cell a SyntaxError
            faithfulness,
            answer_relevancy,
            context_recall
        ],
        dataset = tempdataset,
        llm = ragas_llm,
        embeddings=embeddings,
        run_config=run_config
    )
    scores.append(evalresult['context_precision'])
    testset_results = pd.concat([testset_results, evalresult.to_pandas()])
    time.sleep(60)  # pause between batches to stay under the per-minute request quota

747


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

748


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

749


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

750


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

751


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

752


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

753


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

754


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

755


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

756


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

757


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

758


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

759


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

760


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

761


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

762


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

763


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

764


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

765


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

766


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

767


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

768


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

769


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

770


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

771


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

772


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

773


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

774


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

775


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

776


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

777


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

778


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

779


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

780


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

781


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

782


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

783


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

784


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

785


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

786


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

787


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

788


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

789


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

790


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

791


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

792


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

793


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

794


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

795


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

796


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

797


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

798


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

799


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

In [51]:
evalresult.to_pandas()

Unnamed: 0,question,answer,ground_truth,source_file,contexts,context_precision
0,Compare the American Jobs Plan with the Americ...,The American Jobs Plan focuses on creating job...,The American Jobs Plan focuses on infrastructu...,Speeches/titleedits/state_of_the_union_042921.txt,"[Look, the American Jobs Plan will help millio...",0.0


In [25]:
#testset_results = pd.concat([testset_results, evalresult.to_pandas()])
pd.DataFrame(scores, index=None).mean()

0    0.287356
dtype: float64

In [24]:
len(testset_results)

53

In [26]:
testset_results

Unnamed: 0,question,answer,ground_truth,source_file,contexts,context_precision
0,Compare the president's claims of economic pro...,The president highlights economic progress ach...,The president emphasizes positive economic ind...,Speeches/titleedits/state_of_the_union_030724.txt,"[To my friends across the aisle, don’t keep fa...",0.0
0,"President Biden, in his State of the Union Add...","Yes, that is correct. \n","President Biden, in his State of the Union Add...",Speeches/titleedits/state_of_the_union_030724.txt,[Not since President Lincoln and the Civil War...,0.0
0,Summarize President Biden's message to Preside...,President Biden's message to President Putin i...,President Biden's message to President Putin i...,Speeches/titleedits/state_of_the_union_030724.txt,"[My message to President Putin is simple., I s...",0.0
0,Imagine the impact on voting rights if the eve...,The political landscape today could be vastly ...,"The absence of the Voting Rights Act, born fro...",Speeches/titleedits/state_of_the_union_030724.txt,[to help shake the nation’s conscience. Five m...,0.833333
0,Explain the connection between the Selma march...,A pivotal event in the fight for voting rights...,"The Selma march, a pivotal moment in the Civil...",Speeches/titleedits/state_of_the_union_030724.txt,[A transformational moment in our history happ...,0.25
0,Identify the economic achievements mentioned i...,The speech highlights significant economic gro...,The speech highlights several economic achieve...,Speeches/titleedits/state_of_the_union_030724.txt,[With historic job growth and small business g...,0.111111
0,How does the emphasis on 'Buy American' and th...,"The president's focus on ""Buy American"" and th...","The emphasis on 'Buy American,' the creation o...",Speeches/titleedits/state_of_the_union_030724.txt,[Well instead of having to import semiconducto...,0.866667
0,"How are Pell Grants, increased investments in ...",The speaker is advocating for policies that ai...,The speaker mentions increasing Pell Grants fo...,Speeches/titleedits/state_of_the_union_030724.txt,[Let’s continue increasing Pell Grants for wor...,1.0
0,How does the speaker's stance on fair taxation...,The speaker believes that wealthy individuals ...,The speaker believes in fair taxation for the ...,Speeches/titleedits/state_of_the_union_030724.txt,[That’s why I’ve proposed a minimum tax of 25%...,0.514286
0,Compare the speaker's belief in America and th...,The speaker expresses unwavering faith in the ...,The speaker strongly believes in both America ...,Speeches/titleedits/state_of_the_union_030724.txt,"[I believe in you the American people., My fel...",0.0


In [27]:
testset_results.to_csv("results/results_ragas_unlabeled_smallruns_6.csv", index=False)

In [33]:
testset_results1 = pd.read_csv("results/results_ragas_unlabeled_smallruns_1.csv", index_col=None)

In [34]:
testset_results2 = pd.read_csv("results/results_ragas_unlabeled_smallruns_2.csv", index_col=None)

In [35]:
testset_results3 = pd.read_csv("results/results_ragas_unlabeled_smallruns_3.csv", index_col=None)

In [36]:
testset_results4 = pd.read_csv("results/results_ragas_unlabeled_smallruns_4.csv", index_col=None)

In [37]:
testset_results5 = pd.read_csv("results/results_ragas_unlabeled_smallruns_5.csv", index_col=None)

In [38]:
testset_results6 = pd.read_csv("results/results_ragas_unlabeled_smallruns_6.csv", index_col=None)

In [40]:
testset_results_all = pd.concat([testset_results1, testset_results2, testset_results3, testset_results4, testset_results5, testset_results6], ignore_index=True)

In [42]:
testset_results_all['contexts'] = testset_results_all['contexts'].apply(ast.literal_eval)

In [49]:
testset_results_all.to_csv("results/results_ragas_unlabeled.csv", index=False)

In [58]:
testset_results_all[testset_results_all['context_precision'] == 0]

Unnamed: 0,question,answer,ground_truth,source_file,contexts,context_precision
1,"Does the American Jobs Plan, a large-scale inv...",The plan seeks to create jobs by modernizing i...,The American Jobs Plan aims to create jobs by ...,Speeches/titleedits/state_of_the_union_042921.txt,[The American Jobs Plan creates jobs replacing...,0.0
4,Analyze the potential economic consequences of...,"A progressive tax structure, where higher earn...",The President advocates for raising taxes on c...,Speeches/titleedits/state_of_the_union_042921.txt,[When you hear someone say that they don’t wan...,0.0
6,Compare the author's proposed investment in th...,The author emphasizes the American Jobs Plan a...,The author draws parallels between the propose...,Speeches/titleedits/state_of_the_union_042921.txt,[That’s why I proposed the American Jobs Plan ...,0.0
7,Using examples of government-funded initiative...,"Yes, the author provides several examples of g...",The author provides various examples of govern...,Speeches/titleedits/state_of_the_union_042921.txt,"[Throughout our history, if you think about it...",0.0
9,Explain how the Violence Against Women Act add...,Empty Response,"The Violence Against Women Act, which has been...",Speeches/titleedits/state_of_the_union_042921.txt,[Another thing: Let’s authorize the Violence A...,0.0
...,...,...,...,...,...,...
790,How does the speaker's plan to lower prescript...,The speaker's plan to lower prescription drug ...,The speaker's plan lowers prescription drug co...,Speeches/titleedits/state_of_the_union_030724.txt,[For years people have talked about it but I f...,0.0
791,What policies is the speaker advocating for to...,The speaker is advocating for policies that wo...,The speaker advocates for policies to help the...,Speeches/titleedits/state_of_the_union_030724.txt,[Now I want to cap prescription drug costs at ...,0.0
793,Analyze the impact of the Voting Rights Act on...,The provided text focuses on the historical si...,The speech highlights the importance of the Vo...,Speeches/titleedits/state_of_the_union_030724.txt,[to help shake the nation’s conscience. Five m...,0.0
798,Identify the central theme of the President's ...,The President strongly advocates for reproduct...,The President's opening remarks regarding repr...,Speeches/titleedits/state_of_the_union_030724.txt,"[Like most Americans, I believe Roe v. Wade go...",0.0


In [32]:
tempdataset['question']

['What steps is the US President taking to achieve a ceasefire in the Israeli-Palestinian conflict, including the deployment of humanitarian aid and the prioritization of protecting innocent civilians, while also advocating for a two-state solution as the long-term path to peace?',
 "Identify the central theme of the President's opening remarks regarding reproductive rights.",
 "Analyze the speech's rhetorical strategies in defending reproductive rights amidst legal and social backlash."]

In [None]:
# Note: received warning for the answer where there was no response from the llm, definitely reduced faithfulness score

# Results:
# Using contexts generated when produced answers from LLM:
# new testset_answer_newcontext_flash_pro15.csv result, with contexts_gt (aka contexts generated with ground truth generation) column removed
# {'context_precision': 0.4171, 'faithfulness': 0.9167, 'answer_relevancy': 0.6509, 'context_recall': 0.8000}
# reran
# {'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}
# reran
# {'context_precision': 0.5979, 'faithfulness': 1.0000, 'answer_relevancy': 0.6533, 'context_recall': 0.8000}

# Compared to using contexts generated for ground truth (probably not correct):
# new testset_answer_newcontext_flash_pro15.csv result, using old contexts
# {'context_precision': 0.7500, 'faithfulness': 0.7392, 'answer_relevancy': 0.6041, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.8500, 'faithfulness': 0.7123, 'answer_relevancy': 0.5934, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.7500, 'faithfulness': 0.6556, 'answer_relevancy': 0.5638, 'context_recall': 1.0000}

In [None]:
# Evaluation results on metrics:

# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics
# I don't have example ranges to compare anything to, so below is my best guess.

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# 0.9167 - 1.0000 indicates that the LLM is staying true to the facts provided in the context for answering the question.
# There is another Faithfulness metric: from ragas.metrics import FaithulnesswithHHEM
# This uses a huggingface model to help detect hallucination : https://huggingface.co/vectara/hallucination_evaluation_model
# See below for code : {'faithfulness_with_hhem': 0.6319} 
# This doesn't really agree with the RAGAS faithfulness score... may need to dive in further another time.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# At 0.4171 - 0.5979, suggests that the context isn't particularly relevant to the question.
# Answer_relevancy - Measures how relevant the answer is to the question.
# 0.6509 - 0.6533 seems moderately low, just going off of the number.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.
# 0.8 indicates that the llm context is decently good and can typically answer the question or most of it. 

In [105]:
# Test run, just compare to using contexts_gt column instead of the newer context generated with the answer
# The original contexts were renamed to 'contexts_gt' when the answers were parsed, so that is the
# column to promote back to 'contexts' (renaming 'contexts_old' matched nothing and left the
# dataset without a 'contexts' column).
testset_ds_oldcontext = Dataset.from_pandas(
    testset_pd.drop("contexts", axis=1).rename(columns={'contexts_gt':'contexts'})
)

In [None]:
evalresult_old2 = evaluate(
    testset_ds_oldcontext,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm = ragas_llm,
    embeddings=embeddings, 
    run_config=run_config
)

In [None]:
# new testset_answer_newcontext_flash_pro15.csv result, using old contexts
# {'context_precision': 0.7500, 'faithfulness': 0.7392, 'answer_relevancy': 0.6041, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.8500, 'faithfulness': 0.7123, 'answer_relevancy': 0.5934, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.7500, 'faithfulness': 0.6556, 'answer_relevancy': 0.5638, 'context_recall': 1.0000}
evalresult_old2

In [None]:
# RAGAS also offers an additional "Faithfulness with HHEM" metric, which uses a
# HuggingFace model (Vectara's hallucination evaluation model) to detect hallucinations.
# The class name below is spelled exactly as RAGAS spells it (yes - it is misspelled
# in their documentation), so do not "correct" it.
# Note: the HuggingFace model page states that the "token indices sequence length"
# error is normal and just an artifact, so the error printed below is ignored.
# https://huggingface.co/vectara/hallucination_evaluation_model
from ragas.metrics import FaithulnesswithHHEM

faithfulness_with_hhem = FaithulnesswithHHEM()
result_faithfulness_hhem = evaluate(
    testset_ds,
    metrics=[faithfulness_with_hhem],
    llm=ragas_llm,
    embeddings=embeddings,
    run_config=run_config,
)

In [158]:
# Recorded Faithfulness-with-HHEM scores:
# - With the context from answer generation:
#   {'faithfulness_with_hhem': 0.6319}
# - Testing with the context from ground truth / synthetic testset generation:
#   {'faithfulness_with_hhem': 0.5241}
# This seems to agree with the RAGAS faithfulness score in that the answers seem to be partially made up.
result_faithfulness_hhem

{'faithfulness_with_hhem': 0.6319}

In [84]:
# Extra non-working code:

In [85]:
# Code to use the query_engine in the evaluation 
# Modeled after this tutorial: https://docs.ragas.io/en/latest/howtos/applications/compare_llms.html

# Does not currently work: for some metrics, RAGAS does not find the 'ground_truth' column in the dataset.
# For other metrics, it appears to run but logs the errors listed below and returns 'nan' for the results.

In [64]:
# Start of testing: trying to get the RAG query engine working with evaluate
def generate_responses(query_engine, test_questions, test_answers=None):
    """Query the engine with every test question and bundle the results into a dataset.

    Args:
        query_engine: Object exposing ``query(question)``. Each returned response must
            provide ``.response`` and ``.source_nodes``, where every source node
            exposes ``.node.get_content()``.
        test_questions: Iterable of questions; each one is passed to
            ``query_engine.query`` unchanged.
        test_answers: Optional reference answers, stored as-is under the
            ``"ground_truth"`` column. When ``None`` (the default) the column is omitted.

    Returns:
        The object produced by ``Dataset.from_dict`` with the columns ``question``,
        ``answer`` and ``contexts``, plus ``ground_truth`` when ``test_answers`` is given.
    """
    # Materialise once: a one-shot iterable (e.g. a generator) would otherwise be
    # exhausted by the querying step and end up empty in the "question" column.
    questions = list(test_questions)
    responses = [query_engine.query(question) for question in questions]

    dataset_dict = {
        "question": questions,
        "answer": [reply.response for reply in responses],
        # One list of retrieved context strings per question.
        "contexts": [
            [source.node.get_content() for source in reply.source_nodes]
            for reply in responses
        ],
    }
    if test_answers is not None:
        dataset_dict["ground_truth"] = test_answers
    return Dataset.from_dict(dataset_dict)

# Pull the questions and the reference answers out of the test-set frame.
test_questions = testset_pd["question"].values.tolist()
# NOTE(review): every answer is wrapped in its own single-element list here; confirm
# that this is the shape the installed RAGAS version expects for `ground_truth`.
test_answers = [[answer] for answer in testset_pd["answer"].values.tolist()]

# Run the query engine over every question and collect the evaluation dataset.
result_ds = generate_responses(query_engine, test_questions, test_answers)

In [None]:
# Note: This evaluate function that uses the query_engine does not return results (nan for all metrics)
# Errors (below are repeated many times):
# WARNING:ragas.llms.base:n values greater than 1 not support for LlamaIndex LLMs
# n values greater than 1 not support for LlamaIndex LLMs
# INFO:ragas.llms.base:callbacks not supported for LlamaIndex LLMs, ignoring callbacks
# callbacks not supported for LlamaIndex LLMs, ignoring callbacks
# ERROR:ragas.executor:Exception raised in Job[5]: TimeoutError()
# Exception raised in Job[5]: TimeoutError()
# ERROR:ragas.executor:Exception raised in Job[19]: AttributeError('ChatGoogleGenerativeAI' object has no attribute 'acomplete')
# Exception raised in Job[19]: AttributeError('ChatGoogleGenerativeAI' object has no attribute 'acomplete')
#
# NOTE(review): the AttributeError shows that `acomplete` is being called on a
# `ChatGoogleGenerativeAI` object, which does not have it. Presumably this integration
# expects a LlamaIndex LLM (and embeddings) rather than the objects passed below -
# confirm against the RAGAS LlamaIndex integration docs.

# Import under an alias: the other evaluation cells in this notebook call `evaluate`
# with the dataset as the first argument, so rebinding the bare name `evaluate` here
# would silently break them whenever they are re-run after this cell.
from ragas.integrations.llama_index import evaluate as llama_index_evaluate

eval_qe2 = llama_index_evaluate(
    query_engine=query_engine,
    dataset=result_ds,
    metrics=[faithfulness, answer_relevancy, context_utilization],
    llm=ragas_llm,
    embeddings=embeddings,
    run_config=run_config,
)

In [63]:
# Show the result of the query_engine-based evaluation (every metric came back as nan).
eval_qe2

{'faithfulness': nan, 'answer_relevancy': nan, 'context_utilization': nan}