In [1]:
# RAG with Gemini Flash 1.5 LLM and DeepEval evaluation
# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# DeepEval: https://docs.confident-ai.com/docs/guides-rag-evaluation

In [2]:
# Establish RAG pipeline

In [3]:
import os
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from IPython.display import display
from IPython.display import Markdown
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import pandas as pd
import faiss
import instructor
import deepeval



In [4]:
# Environmental variable to opt out of DeepEval tracking telemetry data
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"

In [5]:
deepeval.telemetry_opt_out()

True

In [6]:
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [7]:
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [8]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [9]:
# create document database
# using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online
# Example from 2024:
# https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/
sotu = []
files = ["./Speeches/state_of_the_union_042921.txt", "./Speeches/state_of_the_union_030122.txt", "./Speeches/state_of_the_union_020723.txt", "./Speeches/state_of_the_union_030724.txt"]
for i in files:
    with open(i) as file:
        for line in file:
            nl = line.rstrip()
            if nl != '':
                sotu.append(nl)

In [10]:
documents = [Document(text=line) for line in sotu]

In [12]:
# Example of a loaded Document line
documents[-1]

Document(id_='62482d96-1351-4486-9125-dffce3bb02e2', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [13]:
# Set up the faiss index
d = 768 # dimensions of ___, the embedding model that we're going to use
faiss_index = faiss.IndexFlatL2(d)
print(faiss_index.is_trained)

True


In [14]:
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # optional: task_type="RETRIEVAL_DOCUMENT"
Settings.embed_model = doc_embeddings
Settings.llm = llm

In [15]:
## uncomment for when you need to re-embed and vectorize documents
## otherwise, doing local loading below
#vector_store = FaissVectorStore(faiss_index=faiss_index)
#storage_context = StorageContext.from_defaults(vector_store=vector_store)
#index = VectorStoreIndex.from_documents(
#    documents, storage_context=storage_context, show_progress=True
#)
## save index to disk
#index.storage_context.persist()
#index

In [16]:
# load index from disk
vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
index = load_index_from_storage(storage_context=storage_context)

In [17]:
query_engine = index.as_query_engine(similarity_top_k=10)
chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [None]:
# Example query and response
query = "What has the President done related to healthcare?"
response = query_engine.query(query)

In [19]:
print(response.response)

The President has enacted several initiatives related to healthcare, including establishing a special sign-up period for the Affordable Care Act, enacting tax credits to reduce health care premiums, and re-igniting the Cancer Moonshot. 



In [18]:
# Start of DeepEval implementation
# https://docs.confident-ai.com/docs/guides-rag-evaluation

In [19]:
from pydantic import BaseModel
from deepeval.models import DeepEvalBaseLLM

In [20]:
class Response(BaseModel):
    response: str

In [21]:
class CustomGeminiFlash(DeepEvalBaseLLM):
    def __init__(self):
        self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel: 
        client = self.load_model()
        instructor_client = instructor.from_gemini(
            client=client,
            mode=instructor.Mode.GEMINI_JSON,
        )
        resp = instructor_client.messages.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            response_model=schema,
        )
        return resp

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)

    def get_model_name(self):
        return "Gemini 1.5 Flash"

In [22]:
custom_geminiflash = CustomGeminiFlash()

In [33]:
test = custom_geminiflash.generate(prompt="How many different types of clouds are there?", schema=Response)

In [35]:
test.response

'There are many different ways to classify clouds, but generally, there are ten main types of clouds.  These are further divided into three groups based on altitude: high clouds, middle clouds, and low clouds.  There are also clouds that have vertical development and can extend from low to high altitudes.'

In [23]:
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

def test_answer_relevancy():
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5, model=custom_geminiflash)
    test_case = LLMTestCase(
        input="What if these shoes don't fit?",
        # Replace this with the actual output of your LLM application
        actual_output="We offer a 30-day full refund at no extra cost."
    )
    assert_test(test_case, [answer_relevancy_metric])

In [24]:
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)

contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash)
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash)

test_case = LLMTestCase(
    input="What measures does the speaker propose to lower prescription drug costs in America?",
    actual_output="The speaker proposes giving Medicare the power to negotiate lower prescription drug prices, capping prescription drug costs at $2,000 a year for everyone, and allowing Medicare to negotiate lower prices for 500 drugs over the next decade.s",
    expected_output="The speaker proposes that Medicare should be given the power to negotiate lower drug prescription prices. They argue that this would save hundreds of billions of dollars and lower prescription drug costs for everyone. The speaker also states that the money saved could be used to strengthen the Affordable Care Act and expand Medicare coverage benefits without costing taxpayers an additional penny.",
    retrieval_context=['Let’s do what we’ve always talked about for all the years I was down here in this — in this body — in Congress.  Let’s give Medicare the power to save hundreds of billions of dollars by negotiating lower drug prescription prices.  (Applause.)', 'In fact, we pay the highest prescription drug prices of anywhere in the world right here in America — nearly three times — for the same drug, nearly three times what other countries pay.  We have to change that, and we can.', 'And we’re finally giving Medicare the power to negotiate drug prices. Bringing down prescription drug costs doesn’t just save seniors money.', 'For years people have talked about it but I finally got it done and gave Medicare the power to negotiate lower prices for prescription drugs just like the VA does for our veterans.', 'And, by the way, that won’t just — that won’t just help people on Medicare; it will lower prescription drug costs for everyone.', 'Now I want to cap prescription drug costs at $2,000 a year for everyone!', 'We know how to do this.  The last President had that as an objective.  We all know how outrageously expensive drugs are in America.', 'Make no mistake, if you try to do anything to raise the cost of prescription drugs, I will veto it.', 'Now it’s time to go further and give Medicare the power to negotiate lower prices for 500 drugs over the next decade.', 'It will cut the federal deficit, saving tax payers hundreds of billions of dollars on the prescription drugs the government buys for Medicare.']
)

In [25]:
# Evaluation of the retriever
contextual_precision.measure(test_case)
print("Contextual Precision Score: ", contextual_precision.score)
print("Contextual Precision Reason: ", contextual_precision.reason)

contextual_recall.measure(test_case)
print("Contextual Recall Score: ", contextual_recall.score)
print("Contextual Recall Reason: ", contextual_recall.reason)

contextual_relevancy.measure(test_case)
print("Contextual Relevancy Score: ", contextual_relevancy.score)
print("Contextual Relevancy Reason: ", contextual_relevancy.reason)

Output()

Output()

Contextual Precision Score:  0.9095238095238096
Contextual Precision Reason:  The score is 0.91 because the irrelevant nodes are ranked lower than the relevant nodes. The sixth node discusses capping costs, which is not the speaker's proposed solution. The seventh node acknowledges the issue but doesn't offer solutions. The eighth node states the speaker's opposition to raising prices but not their own proposals. All other nodes are relevant and are ranked higher than these three irrelevant nodes.


Output()

Contextual Recall Score:  0.4444444444444444
Contextual Recall Reason:  The score is 0.44 because the first sentence in the expected output is supported by the first node in the retrieval context, but the rest of the sentences are not supported by anything in the retrieval context.


Contextual Relevancy Score:  0.5
Contextual Relevancy Reason:  The score is 0.50 because the context touches on lowering drug costs, but lacks specific proposals, mentioning only the speaker's stance and impact on the deficit. "The context mentions that prescription drug prices in America are the highest in the world, but it does not propose any measures to lower these costs." and "The context discusses the impact of the policy on the federal deficit and Medicare spending, but it does not mention any specific measures to lower prescription drug costs in America." accurately describe this limitation.


In [None]:
#Contextual Precision Score:  0.9472222222222222
#Contextual Precision Reason:  The score is 0.95 because the 'no' verdicts are ranked lower than the 'yes' verdicts, since nodes 7 and 8 are irrelevant because node 7 discusses past attempts to address the issue but does not offer a solution, and node 8 focuses on the speaker's stance against any measures that might raise prescription drug costs, while the 'yes' verdicts provide information about how to lower drug costs, like node 1 mentioning "Medicare should be given the power to negotiate lower drug prescription prices". This is followed by other nodes, like node 2, which talks about paying "the highest prescription drug prices of anywhere in the world right here in America", while node 3 mentions "we’re finally giving Medicare the power to negotiate drug prices" and node 4 explains that "For years people have talked about it but I finally got it done and gave Medicare the power to negotiate lower prices for prescription drugs". Node 5 states that "And, by the way, that won’t just — that won’t just help people on Medicare; it will lower prescription drug costs for everyone". Node 6 proposes capping prescription drug costs at "$2,000 a year for everyone!". Node 9 proposes to "give Medicare the power to negotiate lower prices for 500 drugs over the next decade." Lastly, node 10 states that "It will cut the federal deficit, saving tax payers hundreds of billions of dollars on the prescription drugs the government buys for Medicare." Therefore, the score is 0.95 because the 'no' verdicts are ranked lower than the 'yes' verdicts.

#Contextual Recall Score:  1.0
#Contextual Recall Reason:  The score is 1.00 because all the sentences in the expected output are supported by node(s) in retrieval context, as they accurately reflect information presented in the retrieval context.  For example, the speaker's proposal to give Medicare the power to negotiate lower prescription drug prices is supported by multiple nodes in the retrieval context, and the potential savings and impact on drug costs for everyone is also accurately conveyed.  

#Contextual Relevancy Score:  0.4
#Contextual Relevancy Reason:  The score is 0.40 because the context discusses lowering prescription drug costs but doesn't elaborate on specific measures to achieve that, focusing instead on the speaker's stance on drug pricing. For example, the context states, "The context only mentions giving Medicare the power to negotiate drug prices, but doesn't elaborate on specific measures proposed to lower costs." and "The context only states that the speaker will veto anything that tries to raise the cost of prescription drugs, but it does not mention any measures to lower prescription drug costs."

# Rerun
#Score:  0.9472222222222222
#Reason:  The score is 0.95 because the relevant nodes are ranked higher than the irrelevant ones. For example, the first node directly mentions the speaker's proposal, making it highly relevant. The eighth node, however, only discusses a potential veto of legislation and is not directly related to measures to lower prescription drug costs. This makes the eighth node less relevant than the first, which is a reason why the contextual precision score is 0.95 and not higher.

#Score:  1.0
#Reason:  The score is 1.00 because all the information in the expected output is directly reflected in the retrieval context. The first node in the retrieval context contains multiple sentences that cover the core points of the expected output, such as giving Medicare power to negotiate lower drug prices, saving hundreds of billions of dollars, and lowering costs for everyone. The fourth node confirms that the speaker finally gave Medicare the power to negotiate lower prices, and the fifth node emphasizes that this will not only help Medicare recipients but also lower costs for everyone. The ninth node provides additional support by highlighting the impact on the federal deficit and taxpayers. Overall, the retrieval context perfectly matches the expected output with clear and consistent information.

#Score:  0.3
#Reason:  The score is 0.3 because the context discusses lowering prescription drug costs but lacks specific measures. The context only mentions giving Medicare the power to negotiate prices, but doesn't explain how this would be done or propose any other measures. As stated in the reasons, "The context mentions lowering prescription drug costs, but it doesn't provide any specific measures." and "The context only mentions giving Medicare the power to negotiate drug prices, but it does not elaborate on specific measures to lower prescription drug costs."

# Rerun
#Contextual Precision Score:  0.9095238095238096
#Contextual Precision Reason:  The score is 0.91 because the irrelevant nodes are ranked lower than the relevant nodes. The sixth node discusses capping costs, which is not the speaker's proposed solution. The seventh node acknowledges the issue but doesn't offer solutions. The eighth node states the speaker's opposition to raising prices but not their own proposals. All other nodes are relevant and are ranked higher than these three irrelevant nodes.

#Contextual Recall Score:  0.4444444444444444
#Contextual Recall Reason:  The score is 0.44 because the first sentence in the expected output is supported by the first node in the retrieval context, but the rest of the sentences are not supported by anything in the retrieval context.

#Contextual Relevancy Score:  0.5
#Contextual Relevancy Reason:  The score is 0.50 because the context touches on lowering drug costs, but lacks specific proposals, mentioning only the speaker's stance and impact on the deficit. "The context mentions that prescription drug prices in America are the highest in the world, but it does not propose any measures to lower these costs." and "The context discusses the impact of the policy on the federal deficit and Medicare spending, but it does not mention any specific measures to lower prescription drug costs in America." accurately describe this limitation.


In [26]:
# bulk eval
from deepeval import evaluate

#evaluate(
#    test_cases=[test_case],
#    metrics=[contextual_precision, contextual_recall, contextual_relevancy]
#)

In [31]:
from deepeval.dataset import EvaluationDataset

evaldataset = EvaluationDataset()
evaldataset.add_test_cases_from_csv_file(
    file_path="datasets/testset_answer_newcontext_flash_pro15.csv",
    input_col_name="question",
    actual_output_col_name="answer",
    expected_output_col_name="ground_truth",
#    context_col_name="contexts",
#    context_col_delimiter= ",",
    retrieval_context_col_name="contexts",
    retrieval_context_col_delimiter= ","
)

In [38]:
evaldataset.test_cases

[LLMTestCase(input='What does the author argue is under attack regarding reproductive rights and what steps does the author call for to address this issue? \n', actual_output='The author argues that the constitutional right to choose, established by Roe v. Wade, is under attack. The author calls for Congress to codify Roe v. Wade to protect this right and ensure access to healthcare, including reproductive healthcare. \n', expected_output="The author argues that the constitutional right to abortion, established by Roe v. Wade, is under attack. To address this issue, the author calls for protecting access to health care and preserving a woman's right to choose.", context=[], retrieval_context=["['The constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before.'", " 'Congress must restore the right the Supreme Court took away last year and codify Roe v. Wade to protect every woman’s constitutional right to choose.'", " 'If we want to

In [39]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric

hallucination_metric = HallucinationMetric(threshold=0.3, model=custom_geminiflash)
answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5, model=custom_geminiflash)

# You can also call the evaluate() function directly
# No I can't.
#evaldataset_results = evaluate(evaldataset, [hallucination_metric, answer_relevancy_metric])
# Calling the evaluate function on a EvaluationDataset only permits the metrics parameter, not any others like throttle_value... so the below results in ratelimiting errors
#evaldataset_results = evaldataset.evaluate(metrics=[hallucination_metric, answer_relevancy_metric]) 

# Alternatively...
evaldataset_results = evaluate(test_cases=evaldataset.test_cases, metrics=[hallucination_metric, answer_relevancy_metric], throttle_value=60)

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 10 test case(s) in parallel: |████████████████████████████|100% (10/10) [Time Taken: 10:00, 60.06s/test case]



Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no factual contradictions or alignments to justify a hallucination score., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The author argues that "the very idea of bodily autonomy" is under attack regarding reproductive rights and calls for "a unified front" to fight for reproductive freedom and access to healthcare., error: None)

For test case:

  - input: What does the author argue is under attack regarding reproductive rights and what steps does the author call for to address this issue? 

  - actual output: The author argues that the constitutional right to choose, established by Roe v. Wade, is under attack. The author calls for Congress to codify Roe v. Wade to protect this right and ensure access to healthcare, including reproductive healthcar




In [40]:
# 
======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no factual contradictions or alignments to justify a hallucination score., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The author argues that "the very idea of bodily autonomy" is under attack regarding reproductive rights and calls for "a unified front" to fight for reproductive freedom and access to healthcare., error: None)

For test case:

  - input: What does the author argue is under attack regarding reproductive rights and what steps does the author call for to address this issue? 

  - actual output: The author argues that the constitutional right to choose, established by Roe v. Wade, is under attack. The author calls for Congress to codify Roe v. Wade to protect this right and ensure access to healthcare, including reproductive healthcare. 

  - expected output: The author argues that the constitutional right to abortion, established by Roe v. Wade, is under attack. To address this issue, the author calls for protecting access to health care and preserving a woman's right to choose.
  - context: []
  - retrieval context: ["['The constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before.'", " 'Congress must restore the right the Supreme Court took away last year and codify Roe v. Wade to protect every woman’s constitutional right to choose.'", " 'If we want to go forward—not backward—we must protect access to health care. Preserve a woman’s right to choose. And let’s continue to advance maternal health care in America.'", " 'There are state laws banning the right to choose", ' criminalizing doctors', " and forcing survivors of rape and incest to leave their states as well to get the care they need.'", " 'The Vice President and I are doing everything we can to protect access to reproductive health care and safeguard patient privacy. But already", " more than a dozen states are enforcing extreme abortion bans.'", " 'To my friends across the aisle", " don’t keep families waiting any longer. Guarantee the right to IVF nationwide!'", " 'Many of you in this Chamber and my predecessor are promising to pass a national ban on reproductive freedom.'", " 'I see a future where we restore the right to choose and protect other freedoms not take them away.'", " 'In its decision to overturn Roe v. Wade the Supreme Court majority wrote", " “Women are not without –'", " 'Advancing liberty and justice also requires protecting the rights of women.']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no contradictions between the actual output and the contexts., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 1.00 because the JSON response accurately reflects the provided schema and is a valid JSON object. , error: None)

For test case:

  - input: In the speech from President Biden, why has "trickle-down economics" not worked, and what economic approach does he advocate for instead?
  - actual output: The speaker believes that "trickle-down economics" has failed because it has led to lower wages, larger deficits, and a wider gap between the wealthy and everyone else. The speaker advocates for an economic approach that builds from the bottom up and the middle out, investing in all Americans to ensure everyone has a fair shot. 

  - expected output: President Biden states that "trickle-down economics" has never worked because it has not resulted in economic growth from the bottom and middle out. Instead, he advocates for an economic approach that grows the economy from the bottom and middle out.
  - context: []
  - retrieval context: ["['My fellow Americans", " trickle-down — trickle-down economics has never worked and it’s time to grow the economy from the bottom and the middle out. (Applause.)'", " 'But that trickle-down theory led to weaker economic growth", ' lower wages', ' bigger deficits', " and the widest gap between those at the top and everyone else in nearly a century.'", " 'For the past 40 years we were told that if we gave tax breaks to those at the very top", " the benefits would trickle down to everyone else.'", " 'To build an economy from the bottom up and the middle out", ' not from the top down. Because when the middle class does well', " the poor have a ladder up and the wealthy still do very well. We all do well.'", " 'Vice President Harris and I ran for office with a new economic vision for America.'", " 'America’s comeback is building a future of American possibilities", ' building an economy from the middle out and the bottom up', ' not the top down', ' investing in all of America', " in all Americans to make sure everyone has a fair shot and we leave no one behind!'", " 'A future where the days of trickle-down economics are over and the wealthy and biggest corporations no longer get all the breaks.'", " 'When we invest in our workers", ' when we build the economy from the bottom up and the middle out together', " we can do something we haven’t done in a long time: build a better America.'", " 'And unlike the $2 Trillion tax cut passed in the previous administration that benefitted the top 1% of Americans", " the American Rescue Plan helped working people—and left no one behind.'", " 'Our economy grew at a rate of 5.7% last year", ' the strongest growth in nearly 40 years', " the first step in bringing fundamental change to an economy that hasn’t worked for the working people of this nation for too long.']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no contradictions between the actual output and the contexts., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The speaker's stance on the "Buy American" policy is not provided in the input, so it is impossible to determine their stance or how it relates to past administrations. Therefore, it is not possible to generate a JSON response., error: None)

For test case:

  - input: What is the speaker's stance on the "Buy American" policy and how does it relate to past administrations? 

  - actual output: The speaker strongly advocates for the "Buy American" policy, believing it should be strictly enforced. They criticize past administrations for circumventing the policy, which they view as a betrayal of the law and a detriment to American jobs. The speaker emphasizes that this administration will prioritize American products and create jobs by utilizing taxpayer dollars to purchase them. 

  - expected output: The speaker believes in the "Buy American" policy and states that it has been the law of the land since the 1930s. The speaker criticizes past administrations, including his predecessor, for failing to enforce this policy. He asserts that under his watch, federal projects will utilize American products and create jobs for American workers.
  - context: []
  - retrieval context: ["['Past administrations including my predecessor failed to Buy American.'", " 'Buy American has been the law of the land since 1933. But for too long", " past administrations have found ways to get around it.'", " 'And I might note", " parenthetically — (applause) — that does not — that does not violate any trade agreement.  It’s been the law since the ’30s: Buy American.'", " 'American tax dollars are going to be used to buy American products made in America to create American jobs.  That’s the way it’s supposed to be and it will be in this administration.  (Applause.)'", " 'When we use taxpayer dollars to rebuild America – we are going to Buy American: buy American products to support American jobs.'", " 'And all the investments in the American Jobs Plan will be guided by one principle: Buy American.  (Applause.)  Buy American.'", " 'And I made it clear to all my Cabinet people.  Their ability to give exemptions has been exstrenuously [sic] limited.  It will be American products.'", " 'We will buy American to make sure everything from the deck of an aircraft carrier to the steel on highway guardrails are made in America.'", " 'For too many decades", " we imported products and exported jobs.'", " 'Buy American has been the law of the land since the 1930s.']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no factual contradictions or alignments., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 1.00 because the model understood the request and provided a valid JSON response, which is expected given the input and the provided schema.  It's a good start!  Keep up the good work!  , error: None)

For test case:

  - input: What measures does the speaker propose to lower prescription drug costs in America? 

  - actual output: The speaker proposes giving Medicare the power to negotiate lower prescription drug prices, capping prescription drug costs at $2,000 a year for everyone, and allowing Medicare to negotiate lower prices for 500 drugs over the next decade. 

  - expected output: The speaker proposes that Medicare should be given the power to negotiate lower drug prescription prices. They argue that this would save hundreds of billions of dollars and lower prescription drug costs for everyone. The speaker also states that the money saved could be used to strengthen the Affordable Care Act and expand Medicare coverage benefits without costing taxpayers an additional penny.
  - context: []
  - retrieval context: ["['Let’s do what we’ve always talked about for all the years I was down here in this — in this body — in Congress.  Let’s give Medicare the power to save hundreds of billions of dollars by negotiating lower drug prescription prices.  (Applause.)'", " 'In fact", ' we pay the highest prescription drug prices of anywhere in the world right here in America — nearly three times — for the same drug', ' nearly three times what other countries pay.  We have to change that', " and we can.'", " 'And we’re finally giving Medicare the power to negotiate drug prices. Bringing down prescription drug costs doesn’t just save seniors money.'", " 'For years people have talked about it but I finally got it done and gave Medicare the power to negotiate lower prices for prescription drugs just like the VA does for our veterans.'", " 'And", ' by the way', " that won’t just — that won’t just help people on Medicare; it will lower prescription drug costs for everyone.'", " 'Now I want to cap prescription drug costs at $2", "000 a year for everyone!'", " 'We know how to do this.  The last President had that as an objective.  We all know how outrageously expensive drugs are in America.'", " 'Make no mistake", ' if you try to do anything to raise the cost of prescription drugs', " I will veto it.'", " 'Now it’s time to go further and give Medicare the power to negotiate lower prices for 500 drugs over the next decade.'", " 'It will cut the federal deficit", " saving tax payers hundreds of billions of dollars on the prescription drugs the government buys for Medicare.']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no contradictions between the actual output and the contexts., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: President Biden has called for a de-escalation of violence and a ceasefire. He has also stated that the United States is committed to Israel's security and will continue to provide military aid. In terms of long-term solutions, Biden has called for a two-state solution to the Israeli-Palestinian conflict, but has not provided specific details on how this would be achieved., error: None)

For test case:

  - input: What are President Biden's proposals for addressing the ongoing conflict between Israel and Hamas, including both immediate and long-term solutions?
  - actual output: The President proposes a two-state solution as the long-term solution to the conflict. As an immediate solution, he proposes establishing a temporary pier in the Mediterranean on the Gaza coast to receive food, water, medicine, and temporary shelters. He also calls for Hamas to release hostages, lay down arms, and surrender those responsible for the recent attacks.  He acknowledges the need for humanitarian assistance in Gaza and emphasizes the importance of protecting innocent civilians. 

  - expected output: President Biden proposes a six-week ceasefire to address the immediate crisis, including the release of hostages and relief of the humanitarian situation. He also emphasizes the need for Israel to allow more humanitarian aid into Gaza and ensure the safety of humanitarian workers.  In the long term, Biden advocates for a two-state solution, arguing that it is the only path to secure Israel's future, guarantee peace and dignity for Palestinians, and ensure peace between Israel and its Arab neighbors, including Saudi Arabia.
  - context: []
  - retrieval context: ["['As we look to the future", " the only real solution is a two-state solution.'", " 'Hamas could end this conflict today by releasing the hostages", ' laying down arms', " and surrendering those responsible for October 7th.'", " 'Tonight", ' I’m directing the U.S. military to lead an emergency mission to establish a temporary pier in the Mediterranean on the Gaza coast that can receive large ships carrying food', ' water', " medicine and temporary shelters.'", " 'Israel has an added burden because Hamas hides and operates among the civilian population. But Israel also has a fundamental responsibility to protect innocent civilians in Gaza.'", " 'The United States has been leading international efforts to get more humanitarian assistance into Gaza.'", " 'I know the last five months have been gut-wrenching for so many people", ' for the Israeli people', ' the Palestinian people', " and so many here in America.'", " 'There is no other path that guarantees Palestinians can live with peace and dignity.'", " 'Israel has a right to go after Hamas.'", " 'There is no other path that guarantees Israel’s security and democracy.'", " 'Israel must allow more aid into Gaza and ensure that humanitarian workers aren’t caught in the cross fire.']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no factual contradictions or alignments., error: None)
  - ❌ Answer Relevancy (score: 0.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because the response is empty and does not address the input question about Biden's comparison of a threat to American democracy to terrorism and his plan to address it., error: None)

For test case:

  - input: How does Biden compare a threat to American democracy to terrorism, and what's his plan to address it?
  - actual output: Empty Response
  - expected output: The context discusses the threat to American democracy, but it does not explicitly compare it to terrorism. The speech focuses on the attack on the Capitol and the need to protect voting rights. The speaker emphasizes the need to restore faith in democracy and to address the challenges facing the nation. However, it does not directly draw a comparison to terrorism.
  - context: []
  - retrieval context: ["['January 6th and the lies about the 2020 election", ' and the plots to steal the election', " posed the gravest threat to our democracy since the Civil War.'", " 'And two years ago", " our democracy faced its greatest threat since the Civil War.'", " 'For the last few years our democracy has been threatened", ' attacked', " and put at risk.'", " 'Not since President Lincoln and the Civil War have freedom and democracy been under assault here at home as they are today.'", " 'We must all speak out. There is no place for political violence in America. In America", ' we must protect the right to vote', ' not suppress that fundamental right. We honor the results of our elections', " not subvert the will of the people. We must uphold the rule of the law and restore trust in our institutions of democracy.'", " 'But we must be honest the threat remains and democracy must be defended.'", " 'Insurrectionists stormed this very Capitol and placed a dagger at the throat of American democracy.'", " 'And we won’t ignore what our intelligence agencies have determined to be the most lethal terrorist threat to the homeland today: White supremacy is terrorism.  We’re not going to ignore that either.'", " 'What makes our moment rare is that freedom and democracy are under attack", ' both at home and overseas', " at the very same time.'", " 'But the struggle is far from over.  The question of whether our democracy will long endure is both ancient and urgent", " as old as our Republic — still vital today.']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no factual alignments or contradictions., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 1.00 because the JSON response is relevant and complete, providing the exact information requested in the input. It is a perfect answer! , error: None)

For test case:

  - input: What gun violence prevention actions did President Biden mention after discussing the recent gun safety law in his State of the Union address?
  - actual output: President Biden called for Congress to pass universal background checks, close the "boyfriend" loophole in the Violence Against Women Act, and enact red flag laws. 

  - expected output: President Biden mentioned the need to ban assault weapons after discussing the recent gun safety law in his State of the Union address. He stated that the United States had previously banned assault weapons in 1994 and that mass shootings had decreased during that time. After the ban expired, he argued, mass shootings had tripled.
  - context: []
  - retrieval context: ["['And I will do everything in my power to protect the American people from this epidemic of gun violence", " but it’s time for Congress to act as well.  (Applause.)'", " 'I did do something by establishing the first-ever Office of Gun Violence Prevention in the White House that Vice President Harris is leading.'", " 'More than two weeks ago in the Rose Garden", " surrounded by some of the bravest people I know — the survivors and families who lost loved ones to gun violence — I laid out several of the Department of Justice a- — actions that are being taken to — impact on this epidemic.'", " 'Thank God we did", " passing the most sweeping gun safety law in three decades.'", " 'Passed a gun safety law making historic investments in mental health.'", " 'Another thing: Let’s authorize the Violence Against Women Act", ' which has been law for 27 years.  (Applause.)  Twenty-seven years ago', ' I wrote it.  It’ll close the — the act that has to be authorized now will close the “boyfriend” loophole to keep guns out of the hands of abusers.  The court order said', " “This is an abuser.  You can’t own a gun.”  It’s to close that loophole that existed.'", " 'And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. Why should anyone on a terrorist list be able to purchase a weapon?'", " 'I’m proud we beat the NRA when I signed the most significant gun safety law in nearly 30 years!'", " 'That includes things that the majority of responsible gun owners support", " like enhanced background checks for 18 to 21-year-olds and red flag laws keeping guns out of the hands of people who are a danger to themselves and others.'", " 'These laws don’t infringe on the Second Amendment. They save lives.']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no contradictions between the actual output and the contexts., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: President Biden rejected "trickle-down economics" in his 2023 State of the Union address, using the example of the 1980s "Reaganomics" era to argue that it led to increased inequality and did not benefit the middle class. He instead advocated for policies that he claimed would benefit the middle class, such as investments in infrastructure and clean energy., error: None)

For test case:

  - input: What economic policy did President Biden reject in his 2023 State of the Union address, and what example did he use to support his alternative?
  - actual output: The speaker rejected trickle-down economics and supported growing the economy from the bottom and the middle out. They used the American Rescue Plan as an example of their alternative policy. 

  - expected output: President Biden rejected trickle-down economics in his 2023 State of the Union address, arguing that it has never worked and that the economy should be grown from the bottom and the middle out. He used the example of the 2017 tax cut, which was supposed to pay for itself but instead added $2 trillion to the deficit, as evidence that trickle-down economics fails.
  - context: []
  - retrieval context: ["['My fellow Americans", " trickle-down — trickle-down economics has never worked and it’s time to grow the economy from the bottom and the middle out. (Applause.)'", " 'Vice President Harris and I ran for office with a new economic vision for America.'", " 'So that’s my plan. It will grow the economy and lower costs for families.'", " 'But that trickle-down theory led to weaker economic growth", ' lower wages', ' bigger deficits', " and the widest gap between those at the top and everyone else in nearly a century.'", " 'Our economy grew at a rate of 5.7% last year", ' the strongest growth in nearly 40 years', " the first step in bringing fundamental change to an economy that hasn’t worked for the working people of this nation for too long.'", " 'And unlike the $2 Trillion tax cut passed in the previous administration that benefitted the top 1% of Americans", " the American Rescue Plan helped working people—and left no one behind.'", " 'You know", ' there’s a broad consensus of economists — left', ' right', " center — and they agree what I’m proposing will help create millions of jobs and generate historic economic growth.  These are among the highest-value investments we can make as a nation.'", " 'The investments I’ve proposed tonight also advance the foreign policy", ' in my view', ' that benefits the middle class.  That means making sure every nation plays by the same rules in the global economy', " including China.'", " '17 Nobel laureates in economics say my plan will ease long-term inflationary pressures. Top business leaders and most Americans support my plan. And here’s the plan:'", " 'We’re going to reform corporate taxes so they pay their fair share and help pay for the public investments their businesses will benefit from as well.  (Applause.)']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The hallucination score is 0.00 because there are no contradictions or factual misalignments between the actual output and the contexts., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The President's plan to address societal issues and veteran concerns is not provided in the input. Therefore, I cannot generate a JSON object that reflects that information.  Please provide me with the details of the plan., error: None)

For test case:

  - input: How does the President's plan address societal issues and veteran concerns?
  - actual output: The President's plan addresses societal issues by investing in families and children, aiming to cut child poverty in half, and providing assistance with job training, housing, and healthcare. It also addresses veteran concerns by expanding access to VA care, expanding mental health screenings, and recruiting veterans to help other veterans. Additionally, the plan aims to address systemic racism and provide real opportunities for all Americans, including expanding eligibility for veterans suffering from respiratory cancers. 

  - expected output: The President's plan addresses societal issues by focusing on immigration reform, protecting women's rights, and advancing LGBTQ+ rights. He also emphasizes the need for unity and addresses the opioid epidemic, mental health, and veteran concerns.  His plan includes increasing funding for prevention, treatment, and recovery for opioid addiction, providing mental health services for children and adults, and supporting veterans with job training, housing, and medical care.
  - context: []
  - retrieval context: ["['My administration is providing assistance with job training and housing", " and now helping lower-income veterans get VA care debt-free.'", " 'We’re making one of the largest one-time ever investments — ever — in improving healthcare for veterans.  Critical investments to address the opioid crisis.  And", ' maybe most importantly', ' thanks to the American Rescue Plan', " we’re on track to cut child poverty in America in half this year.  (Applause.)'", " 'The VA is doing everything it can", " including expanding mental health screenings and a proven program that recruits veterans to help other veterans understand what they’re going through and get the help they need.'", " 'So", ' let’s get to work.  I wanted to lay out', ' before the Congress', " my plan before we got into the deep discussions.  I’d like to meet with those who have ideas that are different — they think are better.  I welcome those ideas.'", " 'In addition to my Families Plan", ' I’m going to work with Congress to address', ' this year', " other critical priorities for American families.'", " 'To win that competition for the future", ' in my view', ' we also need to make a once-in-a-generation investment in our families and our children.  That’s why I’ve introduced the American Families Plan tonight', ' which addresses four of the biggest challenges facing American families and', ' in turn', " America.'", " '17 Nobel laureates in economics say my plan will ease long-term inflationary pressures. Top business leaders and most Americans support my plan. And here’s the plan:'", " 'And we cannot go on losing 17 veterans a day to the silent scourge of suicide.'", " 'The country supports this reform", ' and Congress should act — should act.  We have a giant opportunity to bend to the arc of the moral universe towards justice — real justice.  And with the plans outlined tonight', ' we have a real chance to root out systemic racism that plagues America and American lives in other ways; a chance to deliver real equity — good jobs', ' good schools', ' affordable housing', ' clean air', ' clean water', ' being able to generate wealth and pass it down two generations because you have an access to purchase a house.  Real opportunities in the lives of more Americans — Black', ' white', ' Latino', ' Asian Americans', " Native Americans.'", " 'And tonight", " I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers.']"]

======================================================================

Metrics Summary

  - ✅ Hallucination (score: 0.0, threshold: 0.3, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 0.00 because there are no factual contradictions or alignments., error: None)
  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Gemini 1.5 Flash, reason: The score is 1.00 because the input asks a complex question about Biden's foreign policy, and the model is appropriately responding with a JSON object that adheres to the schema, ready for further processing and analysis. This showcases the model's understanding of the task and its ability to structure output in a useful format.  , error: None)

For test case:

  - input: How will Biden address China's trade practices and maintain a strong Indo-Pacific presence?
  - actual output: The speaker will address unfair trade practices by China, such as subsidies to state-owned operations, theft of technology, and intellectual property. He will also maintain a strong military presence in the Indo-Pacific to prevent conflict. 

  - expected output: Biden stated that he will defend America's interests and stand up to unfair trade practices, including subsidies from state-owned operations and enterprises and the theft of American technology and intellectual property. He also stated that the US will maintain a strong military presence in the Indo-Pacific, just as it does with NATO in Europe, not to start a conflict, but to prevent one.
  - context: []
  - retrieval context: ["['I also told President Xi that we’ll maintain a strong military presence in the Indo-Pacific", ' just as we do with NATO in Europe — not to start a conflict', " but to prevent one.  (Applause.)'", " 'In my discussions — in my discussions with President Xi", ' I told him', ' “We welcome the competition.  We’re not looking for conflict.”  But I made absolutely clear that we will defend America’s interests across the board.  America will stand up to unfair trade practices that undercut American workers and American industries', " like subsidies from state — to state-owned operations and enterprises and the theft of American technology and intellectual property.'", " 'The investments I’ve proposed tonight also advance the foreign policy", ' in my view', ' that benefits the middle class.  That means making sure every nation plays by the same rules in the global economy', " including China.'", " 'I am committed to work with China where it can advance American interests and benefit the world.'", " 'It is going to transform America and put us on a path to win the economic competition of the 21st Century that we face with the rest of the world—particularly with China.'", " 'I’ve made clear with President Xi that we seek competition", " not conflict.'", " 'I will make no apologies that we are investing to make America strong. Investing in American innovation", ' in industries that will define the future', " and that China’s government is intent on dominating.'", " 'But to compete for the best jobs of the future", " we also need to level the playing field with China and other competitors.'", " 'Today", " we’re in the strongest position in decades to compete with China or anyone else in the world.'", " 'And bridges are forming between partners in the Pacific and those in the Atlantic. And those who bet against America are learning just how wrong they are.']"]

======================================================================

Overall Metric Pass Rates

Hallucination: 100.00% pass rate
Answer Relevancy: 90.00% pass rate

======================================================================


🎉 Tests finished ✅! Run 'deepeval login' to view evaluation results on Confident AI. ‼️ NOTE: You can also run 
evaluations on ALL of deepeval's metrics directly on Confident AI instead.



NameError: name 'evaldatasaset_results' is not defined

In [27]:
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

In [30]:
# "generic" evaluation of generation
answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash)
                                 
answer_relevancy.measure(test_case)
print("Answer Relevancy Score: ", answer_relevancy.score)
print("Answer Relevancy Reason: ", answer_relevancy.reason)

faithfulness.measure(test_case)
print("Faithfulness Score: ", faithfulness.score)
print("Faithfulness Reason: ", faithfulness.reason)

Output()

Output()

Answer Relevancy Score:  1.0
Answer Relevancy Reason:  The score is 1.00 because the output provides the perfect JSON response based on the given schema, which is exactly what is needed.


Faithfulness Score:  1.0
Faithfulness Reason:  Amazing! You've got no contradictions, which means the actual output perfectly aligns with the retrieval context. Keep up the great work!


In [None]:
#Answer Relevancy Score:  1.0
#Answer Relevancy Reason:  The score is 1.00 because the provided context is missing the speaker's proposed measures to lower prescription drug costs, making it impossible to provide a relevant response.

#Faithfulness Score:  1.0
#Faithfulness Reason:  Great job! The 'actual output' perfectly aligns with the information in the 'retrieval context'. Keep up the amazing work! 

# rerun
#Answer Relevancy Score:  1.0
#Answer Relevancy Reason:  The speaker proposes the following measures to lower prescription drug costs in America:  * Allow Medicare to negotiate drug prices.  * Cap out-of-pocket drug costs for seniors.  *  Limit price increases on drugs to the rate of inflation.  * Encourage generic drug competition.

#Faithfulness Score:  1.0
#Faithfulness Reason:  The score is 1.00 because there are no contradictions

In [37]:
# Additional ways to form json response

In [36]:
class User(BaseModel):
    name: str
    age: int

In [61]:
assert isinstance(test, User)
assert test.resp == "Tiki"
assert test.age == 25