In [1]:
# RAG with Gemini Flash 1.5 LLM and DeepEval evaluation
# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# DeepEval: https://docs.confident-ai.com/docs/guides-rag-evaluation
# faithfulness

In [2]:
# Establish RAG pipeline with Gemini

In [3]:
import os
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from IPython.display import display
from IPython.display import Markdown
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import pandas as pd
import faiss
import instructor
import deepeval

In [4]:
# Environment variable to opt out of DeepEval telemetry tracking
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"

In [5]:
deepeval.telemetry_opt_out()

True

In [6]:
def to_markdown(text):
    """Render ``text`` as a Markdown block quote for notebook display.

    Bullet characters ('•') are rewritten to indented Markdown list markers
    ('  *') and every line is prefixed with '> '.

    Args:
        text: Plain text to display (e.g. a model response string).

    Returns:
        An ``IPython.display.Markdown`` object wrapping the quoted text.
    """
    # Imported locally: the notebook's import cell does not import textwrap,
    # so without this the helper fails with NameError on a fresh kernel.
    import textwrap

    text = text.replace('•', '  *')
    # predicate=lambda _: True forces the '> ' prefix onto every line,
    # including whitespace-only lines that textwrap.indent skips by default.
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [7]:
# set up local API key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [8]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [9]:
# create document database
# using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online
# Example from 2024:
# https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/
# Collected non-empty lines (trailing whitespace removed) from every speech, in file order.
sotu = []
newfiles = [
    "./Speeches/titleedits/state_of_the_union_042921.txt",
    "./Speeches/titleedits/state_of_the_union_030122.txt",
    "./Speeches/titleedits/state_of_the_union_020723.txt",
    "./Speeches/titleedits/state_of_the_union_030724.txt",
]
for speech_path in newfiles:
    # Explicit UTF-8: the speeches contain typographic quotes and dashes, and relying on
    # the platform default encoding can mis-decode them or raise UnicodeDecodeError.
    with open(speech_path, encoding="utf-8") as speech_file:
        for raw_line in speech_file:
            stripped = raw_line.rstrip()
            if stripped:  # skip blank lines so they do not become empty Documents
                sotu.append(stripped)

In [10]:
documents = [Document(text=line) for line in sotu]

In [11]:
# Example of a loaded Document line
documents[-1]

Document(id_='12476fb7-47ee-41a0-a4da-e40dbfcfea37', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [12]:
# Set up the faiss index
# NOTE: this index object is only consumed by the (currently commented-out) re-embedding cell;
# the active path loads a persisted index from ./storage instead.
d = 768 # dimensions of text-embedding-004, the embedding model that we're going to use
faiss_index = faiss.IndexFlatL2(d)
# Sanity check: prints the index's is_trained flag
print(faiss_index.is_trained)

True


In [13]:
# Set up the embeddings and register both models on the llama_index global Settings object
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # optional: task_type="RETRIEVAL_DOCUMENT"
# Presumably these become the defaults used by the index and engines created later -- TODO confirm
Settings.embed_model = doc_embeddings
Settings.llm = llm

In [14]:
## uncomment for when you need to re-embed and vectorize documents
## otherwise, doing local loading below
#vector_store = FaissVectorStore(faiss_index=faiss_index)
#storage_context = StorageContext.from_defaults(vector_store=vector_store)
#index = VectorStoreIndex.from_documents(
#    documents, storage_context=storage_context, show_progress=True
#)

In [15]:
# save index to disk
#index.storage_context.persist()
#index

In [16]:
# Load the previously persisted index from ./storage instead of re-embedding the documents
vector_store = FaissVectorStore.from_persist_dir("./storage")
# The storage context is pointed at the same directory as the vector store
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
# index id 'cef7ae30-ff1e-404a-bce6-85d59ca4b376' uses the speeches with a title that includes the date it was given
# NOTE(review): this id is specific to the local ./storage contents; it must be updated if the index is rebuilt
index = load_index_from_storage(storage_context=storage_context, index_id='cef7ae30-ff1e-404a-bce6-85d59ca4b376')

In [17]:
# Set up query and chat engines on top of the loaded index.
# Both are configured with similarity_top_k=10; the chat engine uses chat_mode='context'.
query_engine = index.as_query_engine(similarity_top_k=10)
chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [55]:
# Example query and response
query = "In detail, what has the President done to improve the economy over the four years of his speeches?"
response = query_engine.query(query)

In [56]:
print(response.response)

The President highlights the creation of over 1.3 million jobs in the first 100 days of his term, a record 12 million jobs created in two years, and a strong economic growth rate of 5.7% in the previous year. He also emphasizes the International Monetary Fund's prediction of an economic growth rate exceeding 6% for the current year. 



In [18]:
# Start of DeepEval implementation, following their guide for RAG
# https://docs.confident-ai.com/docs/guides-rag-evaluation

In [19]:
from pydantic import BaseModel
from deepeval.models import DeepEvalBaseLLM

In [20]:
# DeepEval requires a JSON response. In practice this has led to errors, even with a schema as simple as this one.
# (Documented with comments rather than a docstring so the pydantic model itself is left unchanged.)
class Response(BaseModel):
    # Single string field carrying the model's answer text
    response: str

In [21]:
# Non-OpenAI models require a custom LLM class for use with DeepEval
class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEval LLM adapter for Gemini 1.5 Flash that returns schema-shaped output.

    The underlying ``genai.GenerativeModel`` is created once at construction;
    each generation call wraps it with ``instructor`` in GEMINI_JSON mode so the
    reply is returned as an instance of the requested response model.
    """

    def __init__(self):
        self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

    def load_model(self):
        """Return the Gemini model created in ``__init__``."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send ``prompt`` as a single user message and return the structured reply.

        Args:
            prompt: Text to send to the model.
            schema: Response model handed to instructor as ``response_model``.

        Returns:
            Whatever the instructor client returns for ``response_model=schema``.
        """
        user_message = {"role": "user", "content": prompt}
        structured_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        return structured_client.messages.create(
            messages=[user_message],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates directly to the synchronous ``generate``."""
        # Note: this call is not awaited work -- it runs the blocking request inline.
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name reported for this evaluation model."""
        return "Gemini 1.5 Flash"

In [22]:
# similarly, a custom embedding model class is required for non Open-AI embeddings
from typing import List, Optional
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from deepeval.models import DeepEvalBaseEmbeddingModel

class CustomGeminiEmbeddingModel(DeepEvalBaseEmbeddingModel):
    """DeepEval embedding adapter backed by Google's ``text-embedding-004`` model.

    Every embed call obtains a ``GoogleGenerativeAIEmbeddings`` client through
    ``load_model`` and forwards to the matching sync or async method on it.
    """

    def __init__(self):
        pass

    def load_model(self):
        """Create and return a ``GoogleGenerativeAIEmbeddings`` client."""
        return GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004"
        )

    def embed_text(self, text: str) -> List[float]:
        """Embed a single string via ``embed_query``."""
        embedding_model = self.load_model()
        return embedding_model.embed_query(text)

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of strings via ``embed_documents``."""
        embedding_model = self.load_model()
        return embedding_model.embed_documents(texts)

    async def a_embed_text(self, text: str) -> List[float]:
        """Async counterpart of ``embed_text`` (awaits ``aembed_query``)."""
        embedding_model = self.load_model()
        return await embedding_model.aembed_query(text)

    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Async counterpart of ``embed_texts`` (awaits ``aembed_documents``)."""
        embedding_model = self.load_model()
        return await embedding_model.aembed_documents(texts)

    def get_model_name(self):
        """Return the display name of this embedding model."""
        # Fix: the original had only a bare string expression here, so the
        # method implicitly returned None instead of the model name.
        return "Custom Gemini Embeddings"

In [23]:
custom_geminiflash = CustomGeminiFlash()
custom_geminiembeddings = CustomGeminiEmbeddingModel()

In [22]:
# test example for using the custom LLM model class
# test = custom_geminiflash.generate(prompt="How many different types of clouds are there?", schema=Response)
# test.response

In [25]:
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)

contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash)
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash)

test_case = LLMTestCase(
    input="What measures does the speaker propose to lower prescription drug costs in America?",
    actual_output="The speaker proposes giving Medicare the power to negotiate lower prescription drug prices, capping prescription drug costs at $2,000 a year for everyone, and allowing Medicare to negotiate lower prices for 500 drugs over the next decade.s",
    expected_output="The speaker proposes that Medicare should be given the power to negotiate lower drug prescription prices. They argue that this would save hundreds of billions of dollars and lower prescription drug costs for everyone. The speaker also states that the money saved could be used to strengthen the Affordable Care Act and expand Medicare coverage benefits without costing taxpayers an additional penny.",
    retrieval_context=['Let’s do what we’ve always talked about for all the years I was down here in this — in this body — in Congress.  Let’s give Medicare the power to save hundreds of billions of dollars by negotiating lower drug prescription prices.  (Applause.)', 'In fact, we pay the highest prescription drug prices of anywhere in the world right here in America — nearly three times — for the same drug, nearly three times what other countries pay.  We have to change that, and we can.', 'And we’re finally giving Medicare the power to negotiate drug prices. Bringing down prescription drug costs doesn’t just save seniors money.', 'For years people have talked about it but I finally got it done and gave Medicare the power to negotiate lower prices for prescription drugs just like the VA does for our veterans.', 'And, by the way, that won’t just — that won’t just help people on Medicare; it will lower prescription drug costs for everyone.', 'Now I want to cap prescription drug costs at $2,000 a year for everyone!', 'We know how to do this.  The last President had that as an objective.  We all know how outrageously expensive drugs are in America.', 'Make no mistake, if you try to do anything to raise the cost of prescription drugs, I will veto it.', 'Now it’s time to go further and give Medicare the power to negotiate lower prices for 500 drugs over the next decade.', 'It will cut the federal deficit, saving tax payers hundreds of billions of dollars on the prescription drugs the government buys for Medicare.']
)

In [29]:
# Example of measuring metrics individually for one test_case

# Retrieval metrics: each metric is measured against the same test_case, then its
# score and reason are printed before moving on to the next metric.
retrieval_metrics = (
    ("Contextual Precision", contextual_precision),
    ("Contextual Recall", contextual_recall),
    ("Contextual Relevancy", contextual_relevancy),
)
for metric_label, retrieval_metric in retrieval_metrics:
    retrieval_metric.measure(test_case)
    print(metric_label + " Score: ", retrieval_metric.score)
    print(metric_label + " Reason: ", retrieval_metric.reason)

Output()

Output()

Contextual Precision Score:  0.9095238095238096
Contextual Precision Reason:  The score is 0.91 because the first five nodes in the retrieval context are relevant, while the last four are not. The 'no' verdicts should be ranked lower as they do not explicitly propose measures to lower prescription drug costs, with the sixth node focusing on a consequence instead of a measure, the seventh node mentioning the high cost but not offering a solution, and the eighth and ninth nodes mentioning opposition to raising costs and the potential savings but not a specific measure.


Output()

Contextual Recall Score:  1.0
Contextual Recall Reason:  The score is 1.00 because the speaker proposes Medicare to negotiate lower drug prices and this is supported by the first node in the retrieval context.  This node directly discusses Medicare's power to negotiate lower drug prices and the potential savings.


Contextual Relevancy Score:  0.4
Contextual Relevancy Reason:  The score is 0.4 because the context mentions lowering drug costs but doesn't mention any "specific measures" to achieve that, as per the reasons for irrelevancy. For example, "The context discusses the high prescription drug prices in America compared to other countries, but it doesn't mention any specific measures to lower costs."


In [30]:
# Generation metrics:
# FaithfulnessMetric is imported here because no earlier cell imports it; without this
# import the cell raises NameError when the notebook is run top to bottom.
from deepeval.metrics import FaithfulnessMetric

answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash)

# Measure each metric on the single example test_case, then print its score and reason
answer_relevancy.measure(test_case)
print("Answer Relevancy Score: ", answer_relevancy.score)
print("Answer Relevancy Reason: ", answer_relevancy.reason)

faithfulness.measure(test_case)
print("Faithfulness Score: ", faithfulness.score)
print("Faithfulness Reason: ", faithfulness.reason)

Output()

Output()

Answer Relevancy Score:  1.0
Answer Relevancy Reason:  The score is 1.00 because the output provides the perfect JSON response based on the given schema, which is exactly what is needed.


Faithfulness Score:  1.0
Faithfulness Reason:  Amazing! You've got no contradictions, which means the actual output perfectly aligns with the retrieval context. Keep up the great work!


In [26]:
# Example of measuring metrics in bulk for multiple test_cases / a full dataset

from deepeval.dataset import EvaluationDataset

# Load manually curated dataset: one test case per CSV row, mapped by the column names below
evaldataset = EvaluationDataset()
evaldataset.add_test_cases_from_csv_file(
    file_path="datasets/manual_dataset_complete.csv",
    input_col_name="Input",
    actual_output_col_name="Actual_Output",
    expected_output_col_name="Expected_Output",
#    context_col_name="context",
#    context_col_delimiter= ",",
    retrieval_context_col_name="Retrieval_Context",
    # NOTE(review): the retrieval context is split on ","; speech sentences themselves contain
    # commas, so chunks may be split mid-sentence -- confirm against the CSV's actual format
    retrieval_context_col_delimiter= ","
#    additional_metadata_col_name="source_file"
)

In [99]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

# For RAG systems, DeepEval recommends the following metrics:
# Retriever metrics:
contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash)
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash) # this was the only metric that would not work on the manually curated dataset (429 errors)

# Generation metrics:
answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash)

In [109]:
# 2 Options for Metrics Evaluation: 

# 1) Iterating through test cases seems to work better than bulk evaluation with evaluate,
# as errors encountered with evaluate(...) cause no results to be returned
# Looping at least saves partial results until an error occurs
# Encountered this sometimes with contextual_relevancy and contextual_precision on the test dataset (429 errors or Invalid JSON errors),
# yet typically was fine if iterated through individual test_cases
# For future: https://github.com/confident-ai/deepeval/issues/964 may assist with incorrect json errors like what was being returned

# Example for faithfulness metric
# faithfulness_results = []
# for i in range(len(evaldataset.test_cases)):
#     eval_faithfulness = evaluate(test_cases=[evaldataset.test_cases[i]], metrics=[faithfulness], throttle_value=90)
#     faithfulness_results.append(eval_faithfulness[0])

# 2) Evaluate through test_cases in bulk; In testing, at least faithfulness, contextual_precision metrics worked this way with the manually curated dataset

# bulk evaluation of test_cases; throttle_value is for rate limiting- in seconds between queries
test_precision = evaluate(test_cases=evaldataset.test_cases, metrics=[contextual_precision], throttle_value=90)

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |                                   |  0% (0/1) [Time Taken: 01:30, ?test case/s]


InstructorRetryException: RetryError[<Future at 0x7f7b803d48d0 state=finished raised ResourceExhausted>]

In [103]:
from deepeval.evaluate import TestResult
from deepeval.evaluate import print_test_result

# Quick print of results after evaluation, as needed.
# Add further evaluate(...) outputs to this list to print them as well.
evaluation_runs = [test_precision]
for run_results in evaluation_runs:
    for entry in run_results:
        # An entry is either a TestResult or a sequence whose first element is the result.
        # isinstance (rather than type(...) ==) also accepts TestResult subclasses.
        result = entry if isinstance(entry, TestResult) else entry[0]
        print_test_result(result)



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 1.00 because all relevant nodes are ranked higher than the irrelevant node, which is the last node. The last node is irrelevant because it's a general statement about presidential speeches, not about economic growth. All other nodes are relevant because they either explicitly state or imply the President's plans to grow the economy, and provide examples of economic successes. For instance, the first node states that the economy created more than 1,300,000 new jobs in 100 days, which indicates growth, and the second node quotes the President saying 'trickle-down economics has never worked and it’s time to grow the economy from the bottom and the middle out,' which directly supports the input., error: None)

For test case:

  - input: How does the President plan to grow the economy in his first speech, and what are examples of his successes in

In [31]:
# test RAGAS metrics
from deepeval import evaluate
from deepeval.metrics.ragas import RagasMetric
from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric
from deepeval.metrics.ragas import RAGASFaithfulnessMetric
from deepeval.metrics.ragas import RAGASContextualRecallMetric
from deepeval.metrics.ragas import RAGASContextualPrecisionMetric

ragasmetric = RagasMetric(model=custom_geminiflash, embeddings=custom_geminiembeddings)
ragas_ar = RAGASAnswerRelevancyMetric(model=custom_geminiflash, embeddings=custom_geminiembeddings)
ragas_f = RAGASFaithfulnessMetric(model=custom_geminiflash)
ragas_cr = RAGASContextualRecallMetric(model=custom_geminiflash)
ragas_cp = RAGASContextualPrecisionMetric(model=custom_geminiflash)

In [None]:
# example of evaluation for all ragas metrics
eval_ragas = evaluate(test_cases=evaldataset.test_cases, metrics=[ragasmetric], throttle_value=90)

In [None]:
# example of evaluation for each ragas metric individually
eval_ragas_all = evaluate(test_cases=evaldataset.test_cases, metrics=[ragas_ar, ragas_f, ragas_cr, ragas_cp], throttle_value=90)

In [46]:
# Generate a synthetic dataset of "Goldens" (aka 'input', 'context', 'source_file' columns -- not 'Retrieval_Context') with DeepEval
from deepeval.dataset import EvaluationDataset
from deepeval.synthesizer import Synthesizer

# Separate dataset object for the synthetic goldens (distinct from the manually curated `evaldataset`)
dataset = EvaluationDataset()
# The synthesizer uses the custom Gemini LLM and embedding wrappers defined above
synthesizer = Synthesizer(model=custom_geminiflash, embedder=custom_geminiembeddings)
# Generate at most 3 goldens per speech file
dataset.generate_goldens_from_docs(
    synthesizer=synthesizer,
    document_paths=['Speeches/titleedits/state_of_the_union_042921.txt', 'Speeches/titleedits/state_of_the_union_030122.txt', 
                    'Speeches/titleedits/state_of_the_union_020723.txt', 'Speeches/titleedits/state_of_the_union_030724.txt'],
    max_goldens_per_document=3
)

# Write the generated goldens as CSV into the current working directory
dataset.save_as(file_type="csv", directory=".")

Output()

In [37]:
# Additional ways to form json response

In [36]:
# Example schema with two typed fields for structured-output experiments.
# (Documented with comments rather than a docstring so the pydantic model itself is left unchanged.)
class User(BaseModel):
    name: str  # string field
    age: int   # integer field

In [61]:
# NOTE(review): `test` is not assigned in any visible cell; presumably it is the result of
# custom_geminiflash.generate(prompt=..., schema=User) from a cell that was removed --
# confirm and restore that call before running this cell.
assert isinstance(test, User)
# User declares `name` and `age`; it has no `resp` field, so the previous `test.resp`
# check would raise AttributeError instead of validating the name.
assert test.name == "Tiki"
assert test.age == 25