In [1]:
# Tutorial for setting up a small RAG system using Faiss 
# and evaluating it using the Gemini Flash 1.5 LLM and the DeepEval library
# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# DeepEval: https://docs.confident-ai.com/docs/guides-rag-evaluation

# DeepEval v1.1.6 was fairly compatible with Google Gemini by creating a new LLM class that inherited from DeepEvalBaseLLM
# and adding methods that called Gemini's generation functions; it was a similar setup for the Embeddings, inheriting from DeepEvalBaseEmbeddingModel
# The only trick is that the LLM output needs to be in JSON format
# I used the pydantic and instructor libraries for this; the following gives good examples of how to use them
# Tutorial on using custom LLMs with DeepEval: https://docs.confident-ai.com/docs/guides-using-custom-llms

# Metrics available in DeepEval:
# Contextual Precision: Evaluates whether the reranker in your retriever ranks more relevant nodes in your retrieval context higher than irrelevant ones.
# Contextual Recall: Evaluates whether the embedding model in your retriever is able to accurately capture and retrieve relevant information based on the context of the input.
# Contextual Relevance: Evaluates whether the text chunk size and top-K of your retriever is able to retrieve information without much irrelevancies.
# Answer Relevancy: Evaluates whether the prompt template in your generator is able to instruct your LLM to output relevant and helpful outputs based on the retrieval_context.
# Faithfulness: Evaluates whether the LLM used in your generator can output information that does not hallucinate AND contradict any factual information presented in the retrieval_context.
# Other metrics are available for non-RAG systems; custom metrics can also be created (I did not test this)

In [2]:
# Set up environment

In [None]:
import os
from IPython.display import display, Markdown
import pandas as pd
from typing import List
from pydantic import BaseModel, ConfigDict # for JSON output from DeepEval
import instructor # for JSON output from DeepEval

# Replace these two Google Gemini imports with imports for your LLM
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss

import deepeval
from deepeval.models import DeepEvalBaseLLM, DeepEvalBaseEmbeddingModel
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.synthesizer import Synthesizer
from deepeval import evaluate
from deepeval.evaluate import TestResult, print_test_result
from deepeval.metrics import (
    AnswerRelevancyMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric
)
from deepeval.metrics.ragas import (
    RagasMetric,
    RAGASAnswerRelevancyMetric,
    RAGASFaithfulnessMetric, 
    RAGASContextualRecallMetric,
    RAGASContextualPrecisionMetric,
    RAGASContextualRelevancyMetric
) 

In [2]:
# Environment variable to opt out of DeepEval's telemetry data collection
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"

In [3]:
# Call DeepEval's telemetry_opt_out(); the recorded output for this cell shows it returned True
deepeval.telemetry_opt_out()

True

In [4]:
def to_markdown(text):
  """Render `text` as a Markdown block quote for notebook display.

  Bullet characters ('•') are rewritten as indented Markdown list markers
  ('  *'), and every line -- including blank ones, because the predicate
  always returns True -- is prefixed with '> '.

  Args:
    text: The string to render.

  Returns:
    An IPython.display.Markdown object wrapping the quoted text.
  """
  # Imported here because the notebook's import cell does not import textwrap;
  # without this the call below raises NameError.
  import textwrap

  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [5]:
# Configure the Gemini client with the API key read from the GOOGLE_API_KEY environment variable
# (os.environ[...] raises KeyError if the variable is not set)
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [6]:
# Establish RAG pipeline with Gemini

In [7]:
# # Create a Faiss vector store for RAG
# # If you already have an index created, skip a few coding cells to the LLM / embeddings setup

# # Example of creating a small vector store
# # Using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online, edited to include a title with the date of the speech
# # Example from 2024:
# # https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/

# # load and parse files
# sotu = []
# newfiles = ["./Speeches/titleedits/state_of_the_union_042921.txt", "./Speeches/titleedits/state_of_the_union_030122.txt", "./Speeches/titleedits/state_of_the_union_020723.txt", "./Speeches/titleedits/state_of_the_union_030724.txt"]
# for i in newfiles:
#     with open(i) as file:
#         for line in file:
#             nl = line.rstrip()
#             if nl != '':
#                 sotu.append(nl)

# # convert into Document format
# documents = [Document(text=line) for line in sotu]

In [9]:
# # Example of a loaded Document line
# documents[-1]

Document(id_='833ea164-b547-46ec-8854-cefdc83fbb10', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [10]:
# # Set up the faiss index
# d = 768 # dimensions of the input vector of the embedding model that we're going to use; in this case, the google embedding model
# faiss_index = faiss.IndexFlatL2(d)
# print(faiss_index.is_trained) # double check that the training worked

True


In [11]:
# Set up the LLM, embeddings, and LlamaIndex Settings used with the Faiss vector store
# (the same llm and doc_embeddings objects are also passed to the RAGAS metrics later in the notebook)
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") # Can substitute any LangChain Chat Model
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # Can substitute any LangChain embedding model
Settings.embed_model = doc_embeddings # used for LlamaIndex FaissVectorStore
Settings.llm = llm # used for LlamaIndex FaissVectorStore

In [13]:
# # Uncomment for when you need to re-embed and vectorize documents

# vector_store = FaissVectorStore(faiss_index=faiss_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context, show_progress=True
# )

# # Save index to disk
# index.storage_context.persist()

# # Save/remember index id for loading next time
# index.index_id

In [12]:
# Once an index has been saved, reload it here for RAG answer generation.

# Directory the index was persisted to
persist_dir = "./storage"
# My local index id uses the 4 speeches, each edited to include a title with the date it was given
saved_index_id = '3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5'

# Rebuild the Faiss vector store and its storage context from disk, then load the index by id
vector_store = FaissVectorStore.from_persist_dir(persist_dir)
storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir=persist_dir)
index = load_index_from_storage(storage_context=storage_context, index_id=saved_index_id)

In [13]:
# # Optional- if you'd like to query your index
# # Set up query and chat engines with the index
# query_engine = index.as_query_engine(similarity_top_k=10)
# chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [18]:
# # Example query and response
# query = "In detail, what has the President done to improve the economy over the four years of his speeches?"
# response = query_engine.query(query) 
# print(response.response)

The President has highlighted several key achievements in improving the economy during his time in office. These include:

* **Job creation:** He boasts of creating over 1.3 million jobs in his first 100 days, surpassing any previous president.  He also mentions creating a record 12 million new jobs in his first two years, exceeding the total job creation of any president in a four-year term.
* **Economic growth:** He points to a 5.7% economic growth rate last year, the strongest in nearly 40 years.  He also highlights the International Monetary Fund's prediction of over 6% growth this year, which would be the fastest pace in decades.
* **Shifting economic focus:** He emphasizes a move away from "trickle-down economics" and towards growing the economy from "the bottom and the middle out."

These achievements are presented as evidence of a significant turnaround from the economic crisis he inherited. 



In [17]:
# # Example of setting up a chat engine with our index
# chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')
# query = "You are an expert on analyzing Presidential State of the Union speeches. Could you please analyze the speeches and generate 2 questions and answers from each speech, providing the document filename of each speech that relates to each question?"
# response = chat_engine.chat(query) 
# print(response.response)

In [12]:
# Background on the ARES RAG evaluation library, for comparison (the code in this notebook uses DeepEval rather than ARES)

# ARES has options to run a traditional metrics evaluation (their UES/IDP function for context relevance, answer relevance, and answer faithfulness)
# or a Prediction Powered Inference evaluation to generate a confidence interval for a given metric
# ARES can also synthetically generate data (Queries/Answers/Context) from specified documents 

# Code for DeepEval RAG evaluation library to work with Gemini and our local RAG setup
# In DeepEval v1.1.6, an example Input/Output/Context is a 'LLMTestCase'
# You can evaluate LLMTestCases individually or in a large batch with the evaluate function
# Metric scores are produced as well as an LLM generated explanation for a given score 
# DeepEval can also synthetically generate data from documents - see Synthesizer and generate_goldens_from_docs, below. 
# Of the 3 libraries I tested, this function produced the most human-realistic queries.
# DeepEval also offers other functionality besides RAG evaluation, including "red teaming LLM applications for security vulnerabilities"

# https://docs.confident-ai.com/docs/guides-rag-evaluation

In [14]:
# DeepEval requires a JSON response from the LLM. In practice the LLM has still returned malformed JSON, even with a schema as simple as this one,
# but this schema and the custom LLM class can likely be refined to improve responses
class Response(BaseModel):
    # The schema's only field: a single string
    response: str

In [15]:
# Using a non-OpenAI model with DeepEval requires a custom LLM class
# Tutorial with example code on using custom LLMs with DeepEval: https://docs.confident-ai.com/docs/guides-using-custom-llms
class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEval LLM wrapper around Google's Gemini 1.5 Flash model.

    Replies are requested through `instructor` in Gemini JSON mode so that they
    are returned according to the pydantic schema supplied by the caller.
    """

    def __init__(self):
        """Create the underlying `genai.GenerativeModel` client."""
        self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

    def load_model(self):
        """Return the `genai.GenerativeModel` created in `__init__`."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send `prompt` to Gemini and return the reply structured by `schema`.

        Args:
            prompt: The prompt text; sent as a single user message.
            schema: The pydantic model passed to instructor as `response_model`.

        Returns:
            The value returned by instructor's `messages.create` for
            `response_model=schema`.
        """
        # A new instructor wrapper is built around the Gemini client on every call.
        instructor_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        user_message = {"role": "user", "content": prompt}
        return instructor_client.messages.create(
            messages=[user_message],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point that delegates to the synchronous `generate`.

        NOTE: because `generate` is called directly, the event loop is blocked
        while the request is in flight.
        """
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name of this model."""
        return "Gemini 1.5 Flash"

In [16]:
# Similarly, a custom embedding model class is required for non-OpenAI embeddings
# Tutorial on using custom embeddings with DeepEval: https://docs.confident-ai.com/docs/guides-using-custom-embedding-models 
class CustomGeminiEmbeddingModel(DeepEvalBaseEmbeddingModel):
    """DeepEval embedding wrapper around Google's `text-embedding-004` model.

    Every embed call obtains a LangChain `GoogleGenerativeAIEmbeddings` client
    from `load_model()` and forwards the request to it.
    """

    def __init__(self):
        # Nothing to initialise: the embedding client is created on demand in load_model().
        pass

    def load_model(self):
        """Return a newly constructed `GoogleGenerativeAIEmbeddings` client."""
        return GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004"
        )

    def embed_text(self, text: str) -> List[float]:
        """Embed a single string via the client's `embed_query`."""
        embedding_model = self.load_model()
        return embedding_model.embed_query(text)

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of strings via the client's `embed_documents`."""
        embedding_model = self.load_model()
        return embedding_model.embed_documents(texts)

    async def a_embed_text(self, text: str) -> List[float]:
        """Async counterpart of `embed_text`, awaiting the client's `aembed_query`."""
        embedding_model = self.load_model()
        return await embedding_model.aembed_query(text)

    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Async counterpart of `embed_texts`, awaiting the client's `aembed_documents`."""
        embedding_model = self.load_model()
        return await embedding_model.aembed_documents(texts)

    def get_model_name(self):
        """Return the display name of this embedding model."""
        # The name must be returned explicitly; a bare string expression here
        # would make the method return None.
        return "Custom Gemini Embeddings"

In [17]:
# Instantiate the custom LLM and embedding wrappers; these instances are what the DeepEval metrics and the Synthesizer receive
custom_geminiflash = CustomGeminiFlash()
custom_geminiembeddings = CustomGeminiEmbeddingModel()

In [18]:
# Example of evaluating one test case
# Retrieval metrics, each judged by the custom Gemini LLM
contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash)
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash)

# One hand-written example: the query, the RAG answer, the reference answer, and the retrieved passages
# NOTE(review): the trailing "s" after "decade." in actual_output looks like a typo; it is left as-is because it is part of the evaluated data
test_case = LLMTestCase(
    input="What measures does the speaker propose to lower prescription drug costs in America?",
    actual_output="The speaker proposes giving Medicare the power to negotiate lower prescription drug prices, capping prescription drug costs at $2,000 a year for everyone, and allowing Medicare to negotiate lower prices for 500 drugs over the next decade.s",
    expected_output="The speaker proposes that Medicare should be given the power to negotiate lower drug prescription prices. They argue that this would save hundreds of billions of dollars and lower prescription drug costs for everyone. The speaker also states that the money saved could be used to strengthen the Affordable Care Act and expand Medicare coverage benefits without costing taxpayers an additional penny.",
    retrieval_context=['Let’s do what we’ve always talked about for all the years I was down here in this — in this body — in Congress.  Let’s give Medicare the power to save hundreds of billions of dollars by negotiating lower drug prescription prices.  (Applause.)', 'In fact, we pay the highest prescription drug prices of anywhere in the world right here in America — nearly three times — for the same drug, nearly three times what other countries pay.  We have to change that, and we can.', 'And we’re finally giving Medicare the power to negotiate drug prices. Bringing down prescription drug costs doesn’t just save seniors money.', 'For years people have talked about it but I finally got it done and gave Medicare the power to negotiate lower prices for prescription drugs just like the VA does for our veterans.', 'And, by the way, that won’t just — that won’t just help people on Medicare; it will lower prescription drug costs for everyone.', 'Now I want to cap prescription drug costs at $2,000 a year for everyone!', 'We know how to do this.  The last President had that as an objective.  We all know how outrageously expensive drugs are in America.', 'Make no mistake, if you try to do anything to raise the cost of prescription drugs, I will veto it.', 'Now it’s time to go further and give Medicare the power to negotiate lower prices for 500 drugs over the next decade.', 'It will cut the federal deficit, saving tax payers hundreds of billions of dollars on the prescription drugs the government buys for Medicare.']
)

In [23]:
# For RAG systems, DeepEval recommends the following Retrieval and Generation metrics:
# Retrieval metrics -- score the single test case with each one in turn,
# printing the score and the LLM-generated reason straight after each measurement
retrieval_metrics = (
    ("Contextual Precision", contextual_precision),
    ("Contextual Recall", contextual_recall),
    ("Contextual Relevancy", contextual_relevancy),
)
for label, metric in retrieval_metrics:
    metric.measure(test_case)
    print(label + " Score: ", metric.score)
    print(label + " Reason: ", metric.reason)

Output()

Output()

Contextual Precision Score:  0.9095238095238096
Contextual Precision Reason:  The score is 0.91 because the first five nodes are relevant and directly address the speaker's proposal to lower drug prices through Medicare negotiation.  However, the sixth node, focusing on capping prescription drug costs,  is a separate proposal and doesn't directly align with the initial proposal, making it a relevant node ranked lower. The seventh and eighth nodes don't mention specific measures, making them less relevant compared to the first five nodes that directly discuss the speaker's proposed measure.


Output()

Contextual Recall Score:  1.0
Contextual Recall Reason:  The score is 1.00 because the speaker proposes that Medicare should be given the power to negotiate lower drug prescription prices, and the node(s) in retrieval context) support this with phrases like 'Let’s give Medicare the power to save hundreds of billions of dollars by negotiating lower drug prescription prices.'.


Contextual Relevancy Score:  0.5
Contextual Relevancy Reason:  The score is 0.50 because the reasons for irrelevancy indicate that the context doesn't specifically outline measures to lower prescription drug costs, but it does discuss the speaker's position and impact on costs. "The context discusses the high prescription drug prices in America and suggests changing them, but it doesn't mention any specific measures the speaker proposes to lower costs." and "The context only states the speaker's position on raising the cost of prescription drugs, not any measures to lower it." highlight this, suggesting a partial relevance as the context doesn't ignore the topic entirely.


In [29]:
# Generation metrics:
answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash)

# Score the single test case with each metric in turn, printing the score and
# the LLM-generated reason straight after each measurement
generation_metrics = (
    ("Answer Relevancy", answer_relevancy),
    ("Faithfulness", faithfulness),
)
for label, metric in generation_metrics:
    metric.measure(test_case)
    print(label + " Score: ", metric.score)
    print(label + " Reason: ", metric.reason)

Output()

Output()

Answer Relevancy Score:  1.0
Answer Relevancy Reason:  The score is 1.00 because the input asks for measures to lower prescription drug costs and the provided JSON correctly represents that information. Great job!


Faithfulness Score:  1.0
Faithfulness Reason:  The score is 1.00 because there are no contradictions, this is great!


In [19]:
# Example of measuring metrics for multiple test cases / a full dataset

# Load manually curated dataset from CSV, mapping its columns onto test-case fields
# NOTE(review): this CSV path is the one written by the dataset-conversion cells at the end of the notebook; on a fresh run those need to be executed first
evaldataset = EvaluationDataset()
evaldataset.add_test_cases_from_csv_file(
    file_path="datasets/unlabeled_dataset/unlabeled_dataset_deepeval.csv",
    input_col_name="Input",
    actual_output_col_name="Actual_Output",
    expected_output_col_name="Expected_Output",
#    context_col_name="context",
#    context_col_delimiter= ",",
    # NOTE(review): presumably each Retrieval_Context cell is split on the delimiter below, so commas inside a passage would split it into fragments -- confirm against the CSV
    retrieval_context_col_name="Retrieval_Context",
    retrieval_context_col_delimiter= ","
#    additional_metadata_col_name="source_file"
)

In [20]:
# Metric instances for the dataset-level evaluation, all judged by the custom Gemini LLM
# Retriever metrics:
contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash)
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash) # Note: this was the only metric that would not finish execution for the manually curated dataset (too many 429 errors)

# Generation metrics:
answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash)

In [None]:
# Two options for metrics evaluation:

# 1) Iterating through the test cases seems to work better than bulk evaluation with evaluate,
# because an error raised inside evaluate(...) means no results are returned at all.
# Looping at least keeps the partial results gathered before the error.
# I sometimes hit this with contextual_relevancy and contextual_precision on the test dataset (429 errors or Invalid JSON errors),
# yet typically got results when iterating through individual test_cases.
# For future: https://github.com/confident-ai/deepeval/issues/964 may assist with incorrect json errors like what was being returned

# Example of evaluating one test case at a time
contextprecision_results = []
for single_case in evaldataset.test_cases:
    # throttle_value is for rate limiting, in seconds between queries
    single_case_results = evaluate(
        test_cases=[single_case],
        metrics=[contextual_precision],
        throttle_value=10,
    )
    # Keep only the first element of what evaluate returned for this one test case
    contextprecision_results.append(single_case_results[0])

# 2) Evaluate all test_cases in bulk
# In testing, at least the faithfulness and contextual_precision metrics worked this way with a small, manually curated dataset

# throttle_value is for rate limiting, in seconds between queries
# test_precision = evaluate(test_cases=evaldataset.test_cases, metrics=[contextual_precision], throttle_value=90)

In [27]:
# Quick print for results after evaluation of single test cases, as needed
# Collect every result's score so the average can be calculated afterwards
scores = []
for entry in contextprecision_results:
    # Entries are normally TestResult objects; anything else is treated as a
    # sequence whose first element is the TestResult.
    result = entry if isinstance(entry, TestResult) else entry[0]
    # Record the score for both shapes of entry, so wrapped results are not
    # silently left out of the average. Only the first metric's score is
    # taken (each evaluation was run with a single metric).
    scores.append(result.metrics_data[0].score)
    print_test_result(result)

In [29]:
# Calculate the average for the metric
# The list of scores becomes a one-column DataFrame; mean() then reports that column's average
scoredata = pd.DataFrame(scores, index=None)
scoredata.mean()

0    0.833162
dtype: float64

In [73]:
# Save the scores
scores_path = "results/deepeval_contextprecision_unlabeled.csv"
# Create the output directory first: to_csv does not create missing parent directories
os.makedirs(os.path.dirname(scores_path), exist_ok=True)
scoredata.to_csv(scores_path, index=False)

In [82]:
# Example output from DeepEval: print the first five per-test-case results
for result in contextprecision_results[:5]:
    print_test_result(result)



Metrics Summary

  - ✅ Contextual Precision (score: 0.8928571428571428, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.89 because the first three nodes in the retrieval context strongly support the input by mentioning key details like the Selma march, Edmund Pettus Bridge, voting rights, and the Voting Rights Act.  However, the fourth node, mentioning John Lewis, is only loosely relevant; the fifth node is irrelevant as it discusses President Roosevelt; and the sixth node about January 6th is entirely unrelated.  While the seventh node again strengthens relevance by mentioning John Lewis, the presence of these three irrelevant nodes, especially the completely unrelated node about January 6th ranked lower than most relevant nodes, lowers the contextual precision score., error: None)

For test case:

  - input: What was the significance of the Selma March in Alabama, where hundreds of civil rights activists marched across the Edmund Pettus Bri

In [26]:
# DeepEval also has the RAGAS metrics available for evaluation

# Unfortunately the RAGAS metrics in DeepEval only accept LangChain chat models, so the Gemini DeepEvalBaseLLM class will not work with these metrics
# Need to use our LangChain LLM created earlier:  
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
# doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") 

ragasmetric = RagasMetric(model=llm, embeddings=doc_embeddings)
ragas_ar = RAGASAnswerRelevancyMetric(model=llm, embeddings=doc_embeddings)
ragas_f = RAGASFaithfulnessMetric(model=llm)
ragas_crecall = RAGASContextualRecallMetric(model=llm)
ragas_cp = RAGASContextualPrecisionMetric(model=llm)
ragas_crel = RAGASContextualRelevancyMetric(model=llm) # Note: This metric did not work in testing; returned errors related to the model

In [None]:
# Example of evaluation for RagasMetric, an average of RAGAS's Answer Relevancy, Faithfulness, Contextual Recall, and Contextual Precision metrics
# Only the first test case of the dataset is evaluated here
eval_ragas = evaluate(test_cases=[evaldataset.test_cases[0]], metrics=[ragasmetric], throttle_value=90)

In [None]:
# Example of evaluating a single RAGAS metric on its own (here faithfulness) across all test cases in the dataset
eval_ragas_f = evaluate(test_cases=evaldataset.test_cases, metrics=[ragas_f], throttle_value=90)

In [None]:
# Generate a synthetic dataset of "Goldens" (aka a dataset with 'input', 'context', 'source_file' columns -- not 'Retrieval_Context') with DeepEval
# The Synthesizer is given the custom Gemini LLM and the custom Gemini embedding model
dataset = EvaluationDataset()
synthesizer = Synthesizer(model=custom_geminiflash, embedder=custom_geminiembeddings)
# Request up to 3 goldens per speech file, each with an expected output
dataset.generate_goldens_from_docs(
    synthesizer=synthesizer,
    document_paths=['Speeches/titleedits/state_of_the_union_042921.txt', 'Speeches/titleedits/state_of_the_union_030122.txt', 
                    'Speeches/titleedits/state_of_the_union_020723.txt', 'Speeches/titleedits/state_of_the_union_030724.txt'],
    max_goldens_per_document=3,
    include_expected_output=True
)

# Save the generated dataset as CSV in the current directory
dataset.save_as(file_type="csv", directory=".")

In [18]:
# Load the source CSV whose columns are converted to the names used when building the DeepEval EvaluationDataset
unlabeled_dataset = pd.read_csv('datasets/unlabeled_dataset/unlabeled_dataset.csv', index_col=None)

In [21]:
# Rename Query/Answer/Document to the Input/Actual_Output/Retrieval_Context column names passed to add_test_cases_from_csv_file
unlabeled_dataset = unlabeled_dataset.rename(columns={"Query":"Input", "Answer":"Actual_Output", "Document":"Retrieval_Context"})

In [22]:
# Write the renamed dataset; this is the same path the EvaluationDataset CSV-loading cell reads from,
# so on a fresh run this conversion has to happen before that cell is executed
unlabeled_dataset.to_csv("datasets/unlabeled_dataset/unlabeled_dataset_deepeval.csv", index=False)