In [1]:
# Tutorial for setting up a small RAG system using Faiss
# and evaluating it using the Gemini 1.5 Flash LLM and the ARES library
# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# ARES: https://github.com/stanford-futuredata/ARES

# ARES should work with several common LLMs (current support for API keys from OpenAI, Anthropic, and Together) as well as vllm
# I was able to get it working with Google Gemini (not currently supported by the library) with moderate effort 
# in adapting the source code for Gemini and installing locally 
# See ARES_files folder for full files changed; main one was RAG_Automatic_Evaluation/Evaluation_Functions.py with adapted API specific functions

# Metrics available in ARES:   
# Context Relevance: Determines if the retrieved information is pertinent to the query.
# Answer Faithfulness: Checks if the response generated by the language model is properly grounded in the retrieved context and does not include hallucinated or extraneous information.
# Answer Relevance: Evaluates whether the generated response is relevant to the query, addressing all aspects of the question appropriately.

In [2]:
# Set up environment

In [1]:
import os
from IPython.display import display, Markdown
import pandas as pd

import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore

import faiss

In [2]:
def to_markdown(text):
    """Format ``text`` as a Markdown block quote for display in the notebook.

    Bullet characters ('•') are rewritten as Markdown list markers, and every
    line (including blank ones) is prefixed with '> '.

    Args:
        text: Plain string to format, e.g. the text of an LLM response.

    Returns:
        A ``Markdown`` display object wrapping the quoted text.
    """
    # Imported locally because the notebook's import cell never imports
    # textwrap; without it every call to this function raised NameError.
    import textwrap

    text = text.replace('•', '  *')
    # The always-true predicate makes indent() prefix blank lines as well,
    # so the block quote is not broken up by empty lines.
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [3]:
# Set up local API key
# The key is read from the GOOGLE_API_KEY environment variable so it never has
# to be written into the notebook; os.environ[...] raises KeyError if it is unset.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [None]:
# Establish RAG pipeline with Gemini

In [5]:
# # Create a Faiss vector store for RAG
# # If you already have an index created, skip a few coding cells to the LLM / embeddings setup

# # Example of creating a small vector store
# # Using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online, edited to include a title with the date of the speech
# # Example from 2024:
# # https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/

# # load and parse files
# sotu = []
# newfiles = ["./Speeches/titleedits/state_of_the_union_042921.txt", "./Speeches/titleedits/state_of_the_union_030122.txt", "./Speeches/titleedits/state_of_the_union_020723.txt", "./Speeches/titleedits/state_of_the_union_030724.txt"]
# for i in newfiles:
#     with open(i) as file:
#         for line in file:
#             nl = line.rstrip()
#             if nl != '':
#                 sotu.append(nl)

# # convert into Document format
# documents = [Document(text=line) for line in sotu]

In [7]:
# # Example of a loaded Document line
# documents[-1]

Document(id_='4a26868c-5225-4612-9338-005a366b96e5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [4]:
# # Set up the faiss index
# d = 768 # dimensionality of the embedding vectors produced by the embedding model that we're going to use; in this case, the google embedding model
# faiss_index = faiss.IndexFlatL2(d)
# print(faiss_index.is_trained) # sanity check; a flat L2 index needs no training, so this should print True

True


In [5]:
# Models used by the RAG pipeline: a Google embedding model for the documents
# and a Gemini chat model for generation.
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")  # any LangChain embedding model can be substituted
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")  # any LangChain Chat Model can be substituted

# Register both on LlamaIndex's Settings (used for the LlamaIndex FaissVectorStore)
Settings.llm = llm
Settings.embed_model = doc_embeddings

In [12]:
# # Uncomment for when you need to re-embed and vectorize documents

# vector_store = FaissVectorStore(faiss_index=faiss_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context, show_progress=True
# )

# # Save index to disk
# index.storage_context.persist()

# # Save/remember index id for loading next time
# index.index_id

In [7]:
# Once an index has been saved, reload it from disk for RAG answer generation.

# Directory the index was persisted to
PERSIST_DIR = "./storage"
# My local index id; this index uses the 4 speeches, each including a title that includes the date it was given
INDEX_ID = '3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5'

vector_store = FaissVectorStore.from_persist_dir(PERSIST_DIR)
storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context=storage_context, index_id=INDEX_ID)

In [8]:
# # Optional- if you'd like to query your index
# # Set up query and chat engines with the index
# query_engine = index.as_query_engine(similarity_top_k=10)
# chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [13]:
# # Example query and response with Gemini and query_engine
# query = "What does the President say about his administration's first 100 days and covid-19?"
# response = query_engine.query(query) 
# print(response.response)

In [21]:
# Code for ARES RAG evaluation library to work with Gemini and our local RAG setup
# ARES has options to run a traditional metrics evaluation (their UES/IDP function for context relevance, answer relevance, and answer faithfulness)
# or a Prediction Powered Inference evaluation to generate a confidence interval for a given metric
# ARES can also synthetically generate data (Queries/Answers/Context) from specified documents 

In [13]:
from ares import ARES

vLLM not imported.
vLLM not imported.
vLLM not imported.
vLLM not imported.


In [16]:
# UES/IDP evaluation: scores an unlabeled evaluation set (UES) with the help of in-domain prompts (IDP)
ues_idp_config = {
    # Small set of labeled few-shot examples for scoring context relevance,
    # answer faithfulness, and/or answer relevance in your RAG system
    "in_domain_prompts_dataset": "ARES_files/nq_few_shot_prompt_for_judge_scoring.tsv",
    # Larger set of unlabeled query-document-answer triples output by your RAG system for scoring
    "unlabeled_evaluation_set": "ARES_files/nq_unlabeled_output.tsv",
    "model_choice": "models/gemini-1.5-flash",
    # Delay (in seconds) between requests to the API
    "request_delay": 60,
    # Number of documents to be evaluated. Default is 0, which means all
    # documents in the evaluation set will be evaluated
    "documents": 0,
}

ares = ARES(ues_idp=ues_idp_config)

In [15]:
# Run the UES/IDP evaluation on the `ares` instance and print what it returns
# (a dict of per-metric scores, per the captured output of this cell).
results = ares.ues_idp()
print(results)

Evaluating large subset with models/gemini-1.5-flash:   0%|          | 0/3 [00:00<?, ?it/s]

configured gemini for context relevance
Testing candidates
test_response
[[Yes]]

configured gemini for answer relevance
Testing response.text
[[Yes]]

configured gemini for answer faithfulness
Testing response.text
[[Yes]]

configured gemini for context relevance
Testing candidates
test_response
[[Yes]]

configured gemini for answer relevance
Testing response.text
[[Yes]]

configured gemini for answer faithfulness
Testing response.text
[[Yes]]

configured gemini for context relevance
Testing candidates
Attempt 1 failed with error: list index out of range
Testing candidates
Attempt 2 failed with error: list index out of range
Testing candidates
Attempt 3 failed with error: list index out of range
Testing candidates
Attempt 4 failed with error: list index out of range
Testing candidates
All attempts failed. Last error was: list index out of range
Number of times did not extract Yes or No: 0
{'Context Relevance Scores': 0.667, 'Answer Faithfulness Scores': 0.667, 'Answer Relevance Scores

In [18]:
# Display the current value of `results` using the notebook's rich output
results

{'Context Relevance Scores': 0.667,
 'Answer Faithfulness Scores': 0.667,
 'Answer Relevance Scores': 0.667}

In [14]:
# Synthetic data generation with ARES.
# According to ARES, a 'document' is a single line of a file listed in document_filepaths.
# I never tested it with the speeches broken up line by line / by paragraph / by sentence.
# This setup of 1 large document per line in the document file did not work well;
# recommend trying documents that are only a few sentences (or so) each.
synth_config = {
    # Source documents for generating synthetic queries; requires tsv file
    "document_filepaths": ["Speeches/titleedits/Speeches_Docs_ARES.tsv"],
    # Few shot labeled training data with queries and answers
    "few_shot_prompt_filename": "datasets/manual_dataset_complete_ares_synthetic.tsv",
    # Where to save resulting synthetic data file
    "synthetic_queries_filenames": ["results/synthetic_dataset_output_ARES.tsv"],
    "model_choice": "models/gemini-1.5-flash",
    # How many documents to sample when generating synthetic queries; ARES filters documents < 50 words
    "documents_sampled": 1000,
    "api_model": True,
}

ares_synth = ARES(synthetic_query_generator=synth_config)

In [None]:
# Generate the synthetic dataset with the `ares_synth` instance and print what the call returns.
# Note: this rebinds `results`, replacing any value stored by an earlier cell.
results = ares_synth.generate_synthetic_data()
print(results)

In [14]:
# Prediction Powered Inference (PPI): enhances the evaluation accuracy and
# estimates a confidence interval for the quality of each RAG system
ppi_config = {
    # Unlabeled data for evaluation
    "evaluation_datasets": ['datasets/unlabeled_dataset/unlabeled_dataset.tsv'],
    # A few full labeled examples to show the labeling schema and guide the evaluation
    "few_shot_examples_filepath": "datasets/manual_dataset_complete_ares.tsv",
    # LLM to use for evaluation; specify only if there is no checkpoint
    "llm_judge": "models/gemini-1.5-flash",
    # Which metric(s) in labeled dataset to evaluate
    "labels": ["Context_Relevance_Label"],
    # A fully labeled validation dataset with 50+ examples different than few shot dataset;
    # used to measure performance of the classifier
    "gold_label_paths": ["datasets/labeled_dataset/labeled_dataset_sotu.tsv"],
    # Optional settings (uncomment to use):
    # "num_trials": 10,  # Number of iterations used to estimate confidence intervals and other statistics utilized in PPI; Example given was 1000
    # "checkpoints": ["None"],  # If created, file is a saved state of the trained classifier used for evaluation; Use either checkpoint or llm_judge
    # "alpha": 0.05,  # Significance level for hypothesis testing and confidence interval; Default is 0.05
    # "request_delay": 0,  # Optional, not tested; Specifies the delay in seconds between each request to the LLM API
}

ares_ppi = ARES(ppi=ppi_config)

In [None]:
# Run the PPI evaluation with the `ares_ppi` instance and print what the call returns.
# Note: this rebinds `results`, replacing any value stored by an earlier cell.
results = ares_ppi.evaluate_RAG()
print(results)