In [63]:
# RAG with the Gemini 1.5 Flash LLM and DeepEval evaluation
# Google Gemini: https://ai.google.dev/gemini-api/docs/models/gemini
# DeepEval: https://docs.confident-ai.com/docs/guides-rag-evaluation

In [64]:
# Establish RAG pipeline

In [3]:
import os
import textwrap

import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from IPython.display import display
from IPython.display import Markdown
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import pandas as pd
import faiss
import instructor
import deepeval

In [4]:
# Environment variable that opts this session out of DeepEval's telemetry data collection
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"

In [5]:
# Check the opt-out set in the previous cell; the recorded run returned True
# (presumably meaning telemetry is disabled -- confirm against the DeepEval docs).
deepeval.telemetry_opt_out()

True

In [6]:
def to_markdown(text):
  """Render ``text`` as a Markdown block quote for notebook display.

  Bullet characters ("•") are replaced with an indented Markdown list
  marker ("  *"), and every line -- blank lines included, because the
  predicate always returns True -- is prefixed with "> ".

  Args:
    text: The plain-text string to format.

  Returns:
    The result of ``Markdown(...)`` applied to the quoted text.
  """
  bulleted = text.replace('•', '  *')
  # Relies on `import textwrap` in the notebook's import cell; it was missing
  # before, so this function raised NameError on first use.
  return Markdown(textwrap.indent(bulleted, '> ', predicate=lambda _: True))

In [7]:
# Read the Gemini API key from the environment (raises KeyError if GOOGLE_API_KEY is unset)
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [8]:
# Chat model used for RAG answers; registered as the LlamaIndex default LLM in a later cell
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [9]:
# Create the document database
# using 4 State of the Union speeches, all text from whitehouse.gov briefing room speeches posted online
# Example from 2024:
# https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/
sotu = []
#files = ["./Speeches/state_of_the_union_042921.txt", "./Speeches/state_of_the_union_030122.txt", "./Speeches/state_of_the_union_020723.txt", "./Speeches/state_of_the_union_030724.txt"]
# retrained 9/9 with dates in title of each speech
newfiles = [
    "./Speeches/titleedits/state_of_the_union_042921.txt",
    "./Speeches/titleedits/state_of_the_union_030122.txt",
    "./Speeches/titleedits/state_of_the_union_020723.txt",
    "./Speeches/titleedits/state_of_the_union_030724.txt",
]
for speech_path in newfiles:
    # Read explicitly as UTF-8: the speeches contain curly quotes and em dashes,
    # which would be mis-decoded (or raise) where the platform default encoding differs.
    with open(speech_path, encoding="utf-8") as file:
        for line in file:
            # Strip trailing whitespace/newline and keep only non-empty lines,
            # so each remaining line becomes one retrievable unit.
            nl = line.rstrip()
            if nl != '':
                sotu.append(nl)

In [10]:
# One LlamaIndex Document per non-empty speech line collected in `sotu`
documents = [Document(text=line) for line in sotu]

In [11]:
# Example of a loaded Document: the last non-empty line of the last speech file
documents[-1]

Document(id_='d0c4cf64-b4ca-44b0-b2dc-4037ed1230a9', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [12]:
# Set up the FAISS index (flat L2 index, so no training step is required -- is_trained prints True).
# Only the commented-out re-embedding cell below consumes faiss_index;
# the active path loads a persisted index from ./storage instead.
d = 768 # dimensions of text-embedding-004, the embedding model that we're going to use (its default output size -- confirm if the model changes)
faiss_index = faiss.IndexFlatL2(d)
print(faiss_index.is_trained)

True


In [13]:
# Register the Gemini embedding model and chat LLM as the LlamaIndex defaults.
# Both objects come from langchain_google_genai and are assigned directly to llama_index Settings.
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # optional: task_type="RETRIEVAL_DOCUMENT"
Settings.embed_model = doc_embeddings
Settings.llm = llm

In [14]:
## uncomment for when you need to re-embed and vectorize documents
## otherwise, doing local loading below
#vector_store = FaissVectorStore(faiss_index=faiss_index)
#storage_context = StorageContext.from_defaults(vector_store=vector_store)
#index = VectorStoreIndex.from_documents(
#    documents, storage_context=storage_context, show_progress=True
#)

In [15]:
## save index to disk
#index.storage_context.persist()
#index

In [16]:
# Load the previously persisted FAISS-backed index from ./storage instead of re-embedding
vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
# index_id is hard-coded to one specific persisted index.
# NOTE(review): re-running the commented-out embedding/persist cells presumably yields a new ID -- confirm and update here.
index = load_index_from_storage(storage_context=storage_context, index_id='cef7ae30-ff1e-404a-bce6-85d59ca4b376')

In [18]:
# Both engines retrieve the 10 most similar nodes per request.
# query_engine is re-created with the same settings in the dataset-building cell further down;
# chat_engine is not used anywhere else in the visible notebook.
query_engine = index.as_query_engine(similarity_top_k=10)
chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [55]:
# Example query and response: one end-to-end RAG call through the query engine
query = "In detail, what has the President done to improve the economy over the four years of his speeches?"
response = query_engine.query(query)

In [56]:
# Show only the generated answer text (not the retrieved source nodes)
print(response.response)

The President highlights the creation of over 1.3 million jobs in the first 100 days of his term, a record 12 million jobs created in two years, and a strong economic growth rate of 5.7% in the previous year. He also emphasizes the International Monetary Fund's prediction of an economic growth rate exceeding 6% for the current year. 



In [19]:
# Start of DeepEval implementation
# https://docs.confident-ai.com/docs/guides-rag-evaluation

In [20]:
from pydantic import BaseModel
from deepeval.models import DeepEvalBaseLLM

In [21]:
# Minimal structured-output schema with a single free-text field.
# Passed as `schema` to CustomGeminiFlash.generate in the cloud-question smoke test below.
# (Documented with comments rather than a docstring so the pydantic schema itself is unchanged.)
class Response(BaseModel):
    response: str  # the model's answer text

In [22]:
class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEval LLM adapter that answers prompts with Gemini 1.5 Flash.

    Structured output comes from wrapping the Gemini model with ``instructor``
    in ``GEMINI_JSON`` mode, so ``generate`` returns whatever ``instructor``
    produces for the requested ``schema``.
    """

    def __init__(self):
        # Create the underlying google.generativeai model once per adapter instance.
        self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

    def load_model(self):
        """Return the GenerativeModel created in ``__init__``."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send ``prompt`` as a single user message and return the structured reply.

        Args:
            prompt: Text sent as the content of one "user" message.
            schema: Passed to instructor as ``response_model``.

        Returns:
            The object returned by instructor's ``messages.create`` call.
        """
        user_turn = {
            "role": "user",
            "content": prompt,
        }
        # A fresh instructor wrapper is built around the shared model on every call.
        structured_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        return structured_client.messages.create(
            messages=[user_turn],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        # Nothing is awaited here: the request runs synchronously inside the coroutine.
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name of this model."""
        return "Gemini 1.5 Flash"

In [23]:
from typing import List, Optional
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from deepeval.models import DeepEvalBaseEmbeddingModel

class CustomGeminiEmbeddingModel(DeepEvalBaseEmbeddingModel):
    """DeepEval embedding adapter backed by Gemini ``text-embedding-004``.

    Each embed call obtains a client from ``load_model`` and forwards to the
    matching ``GoogleGenerativeAIEmbeddings`` method.
    """

    def __init__(self):
        pass

    def load_model(self):
        """Create and return a new ``GoogleGenerativeAIEmbeddings`` client."""
        return GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004"
        )

    def embed_text(self, text: str) -> List[float]:
        """Embed one string via ``embed_query``."""
        embedding_model = self.load_model()
        return embedding_model.embed_query(text)

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Embed several strings via ``embed_documents``."""
        embedding_model = self.load_model()
        return embedding_model.embed_documents(texts)

    async def a_embed_text(self, text: str) -> List[float]:
        """Async variant of ``embed_text`` (awaits ``aembed_query``)."""
        embedding_model = self.load_model()
        return await embedding_model.aembed_query(text)

    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Async variant of ``embed_texts`` (awaits ``aembed_documents``)."""
        embedding_model = self.load_model()
        return await embedding_model.aembed_documents(texts)

    def get_model_name(self):
        """Return the display name of this embedding model."""
        # Fix: the body used to be a bare string expression, so the method returned None.
        return "Custom Gemini Embeddings"

In [24]:
# Shared adapter instances passed as `model=` / `embeddings=` / `embedder=` to the DeepEval objects below
custom_geminiflash = CustomGeminiFlash()
custom_geminiembeddings = CustomGeminiEmbeddingModel()

In [22]:
# Smoke test of the custom LLM wrapper: ask for a structured Response on an unrelated question
test = custom_geminiflash.generate(prompt="How many different types of clouds are there?", schema=Response)

In [23]:
# Inspect the single `response` field of the structured reply
test.response

'There are many different ways to classify clouds, but generally, there are ten main types of clouds. These are further categorized into three families, based on their altitude. \n\n**High-level clouds:**\n* **Cirrus (Ci):** Wispy, detached clouds made of ice crystals. They often appear as delicate strands or feathers.\n* **Cirrocumulus (Cc):** Small, white, puffy clouds arranged in rows or patches. They resemble ripples on a lake.\n* **Cirrostratus (Cs):** Thin, sheet-like clouds that can cover the entire sky. They often produce a halo around the sun or moon.\n\n**Mid-level clouds:**\n* **Altocumulus (Ac):** White or gray patches of clouds arranged in rounded masses or layers. They can resemble cotton balls.\n* **Altostratus (As):** Gray or bluish sheet-like clouds that can cover the entire sky. They often obscure the sun or moon, but do not produce a halo.\n* **Nimbostratus (Ns):** Dark gray clouds that produce steady rain or snow. They often cover the entire sky and can be quite thi

In [25]:
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

def test_answer_relevancy():
    """Check answer relevancy on a canned shoe-return question/answer pair.

    Builds one LLMTestCase and hands it to ``assert_test`` together with an
    AnswerRelevancyMetric (threshold 0.5) judged by ``custom_geminiflash``.
    """
    shoe_case = LLMTestCase(
        input="What if these shoes don't fit?",
        # Replace this with the actual output of your LLM application
        actual_output="We offer a 30-day full refund at no extra cost."
    )
    relevancy = AnswerRelevancyMetric(threshold=0.5, model=custom_geminiflash)
    assert_test(shoe_case, [relevancy])

In [26]:
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)

# Retriever metrics, each judged by the custom Gemini Flash wrapper
contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash)
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash)

# Single hand-built test case: question, RAG answer, reference answer, and the 10 retrieved passages.
# NOTE(review): actual_output ends with "decade.s" -- looks like a stray character; left as-is because it is test data.
test_case = LLMTestCase(
    input="What measures does the speaker propose to lower prescription drug costs in America?",
    actual_output="The speaker proposes giving Medicare the power to negotiate lower prescription drug prices, capping prescription drug costs at $2,000 a year for everyone, and allowing Medicare to negotiate lower prices for 500 drugs over the next decade.s",
    expected_output="The speaker proposes that Medicare should be given the power to negotiate lower drug prescription prices. They argue that this would save hundreds of billions of dollars and lower prescription drug costs for everyone. The speaker also states that the money saved could be used to strengthen the Affordable Care Act and expand Medicare coverage benefits without costing taxpayers an additional penny.",
    retrieval_context=['Let’s do what we’ve always talked about for all the years I was down here in this — in this body — in Congress.  Let’s give Medicare the power to save hundreds of billions of dollars by negotiating lower drug prescription prices.  (Applause.)', 'In fact, we pay the highest prescription drug prices of anywhere in the world right here in America — nearly three times — for the same drug, nearly three times what other countries pay.  We have to change that, and we can.', 'And we’re finally giving Medicare the power to negotiate drug prices. Bringing down prescription drug costs doesn’t just save seniors money.', 'For years people have talked about it but I finally got it done and gave Medicare the power to negotiate lower prices for prescription drugs just like the VA does for our veterans.', 'And, by the way, that won’t just — that won’t just help people on Medicare; it will lower prescription drug costs for everyone.', 'Now I want to cap prescription drug costs at $2,000 a year for everyone!', 'We know how to do this.  The last President had that as an objective.  We all know how outrageously expensive drugs are in America.', 'Make no mistake, if you try to do anything to raise the cost of prescription drugs, I will veto it.', 'Now it’s time to go further and give Medicare the power to negotiate lower prices for 500 drugs over the next decade.', 'It will cut the federal deficit, saving tax payers hundreds of billions of dollars on the prescription drugs the government buys for Medicare.']
)

In [29]:
# Evaluation of the retriever: run each contextual metric on the single test case,
# then print its score followed by the judge's explanation.
retriever_reports = (
    (contextual_precision, "Contextual Precision Score: ", "Contextual Precision Reason: "),
    (contextual_recall, "Contextual Recall Score: ", "Contextual Recall Reason: "),
    (contextual_relevancy, "Contextual Relevancy Score: ", "Contextual Relevancy Reason: "),
)
for metric, score_label, reason_label in retriever_reports:
    metric.measure(test_case)
    print(score_label, metric.score)
    print(reason_label, metric.reason)

Output()

Output()

Contextual Precision Score:  0.9095238095238096
Contextual Precision Reason:  The score is 0.91 because the first five nodes in the retrieval context are relevant, while the last four are not. The 'no' verdicts should be ranked lower as they do not explicitly propose measures to lower prescription drug costs, with the sixth node focusing on a consequence instead of a measure, the seventh node mentioning the high cost but not offering a solution, and the eighth and ninth nodes mentioning opposition to raising costs and the potential savings but not a specific measure.


Output()

Contextual Recall Score:  1.0
Contextual Recall Reason:  The score is 1.00 because the speaker proposes Medicare to negotiate lower drug prices and this is supported by the first node in the retrieval context.  This node directly discusses Medicare's power to negotiate lower drug prices and the potential savings.


Contextual Relevancy Score:  0.4
Contextual Relevancy Reason:  The score is 0.4 because the context mentions lowering drug costs but doesn't mention any "specific measures" to achieve that, as per the reasons for irrelevancy. For example, "The context discusses the high prescription drug prices in America compared to other countries, but it doesn't mention any specific measures to lower costs."


In [None]:
#Contextual Precision Score:  0.9472222222222222
#Contextual Precision Reason:  The score is 0.95 because the 'no' verdicts are ranked lower than the 'yes' verdicts, since nodes 7 and 8 are irrelevant because node 7 discusses past attempts to address the issue but does not offer a solution, and node 8 focuses on the speaker's stance against any measures that might raise prescription drug costs, while the 'yes' verdicts provide information about how to lower drug costs, like node 1 mentioning "Medicare should be given the power to negotiate lower drug prescription prices". This is followed by other nodes, like node 2, which talks about paying "the highest prescription drug prices of anywhere in the world right here in America", while node 3 mentions "we’re finally giving Medicare the power to negotiate drug prices" and node 4 explains that "For years people have talked about it but I finally got it done and gave Medicare the power to negotiate lower prices for prescription drugs". Node 5 states that "And, by the way, that won’t just — that won’t just help people on Medicare; it will lower prescription drug costs for everyone". Node 6 proposes capping prescription drug costs at "$2,000 a year for everyone!". Node 9 proposes to "give Medicare the power to negotiate lower prices for 500 drugs over the next decade." Lastly, node 10 states that "It will cut the federal deficit, saving tax payers hundreds of billions of dollars on the prescription drugs the government buys for Medicare." Therefore, the score is 0.95 because the 'no' verdicts are ranked lower than the 'yes' verdicts.

#Contextual Recall Score:  1.0
#Contextual Recall Reason:  The score is 1.00 because all the sentences in the expected output are supported by node(s) in retrieval context, as they accurately reflect information presented in the retrieval context.  For example, the speaker's proposal to give Medicare the power to negotiate lower prescription drug prices is supported by multiple nodes in the retrieval context, and the potential savings and impact on drug costs for everyone is also accurately conveyed.  

#Contextual Relevancy Score:  0.4
#Contextual Relevancy Reason:  The score is 0.40 because the context discusses lowering prescription drug costs but doesn't elaborate on specific measures to achieve that, focusing instead on the speaker's stance on drug pricing. For example, the context states, "The context only mentions giving Medicare the power to negotiate drug prices, but doesn't elaborate on specific measures proposed to lower costs." and "The context only states that the speaker will veto anything that tries to raise the cost of prescription drugs, but it does not mention any measures to lower prescription drug costs."

# Rerun
#Score:  0.9472222222222222
#Reason:  The score is 0.95 because the relevant nodes are ranked higher than the irrelevant ones. For example, the first node directly mentions the speaker's proposal, making it highly relevant. The eighth node, however, only discusses a potential veto of legislation and is not directly related to measures to lower prescription drug costs. This makes the eighth node less relevant than the first, which is a reason why the contextual precision score is 0.95 and not higher.

#Score:  1.0
#Reason:  The score is 1.00 because all the information in the expected output is directly reflected in the retrieval context. The first node in the retrieval context contains multiple sentences that cover the core points of the expected output, such as giving Medicare power to negotiate lower drug prices, saving hundreds of billions of dollars, and lowering costs for everyone. The fourth node confirms that the speaker finally gave Medicare the power to negotiate lower prices, and the fifth node emphasizes that this will not only help Medicare recipients but also lower costs for everyone. The ninth node provides additional support by highlighting the impact on the federal deficit and taxpayers. Overall, the retrieval context perfectly matches the expected output with clear and consistent information.

#Score:  0.3
#Reason:  The score is 0.3 because the context discusses lowering prescription drug costs but lacks specific measures. The context only mentions giving Medicare the power to negotiate prices, but doesn't explain how this would be done or propose any other measures. As stated in the reasons, "The context mentions lowering prescription drug costs, but it doesn't provide any specific measures." and "The context only mentions giving Medicare the power to negotiate drug prices, but it does not elaborate on specific measures to lower prescription drug costs."

# Rerun
#Contextual Precision Score:  0.9095238095238096
#Contextual Precision Reason:  The score is 0.91 because the irrelevant nodes are ranked lower than the relevant nodes. The sixth node discusses capping costs, which is not the speaker's proposed solution. The seventh node acknowledges the issue but doesn't offer solutions. The eighth node states the speaker's opposition to raising prices but not their own proposals. All other nodes are relevant and are ranked higher than these three irrelevant nodes.

#Contextual Recall Score:  0.4444444444444444
#Contextual Recall Reason:  The score is 0.44 because the first sentence in the expected output is supported by the first node in the retrieval context, but the rest of the sentences are not supported by anything in the retrieval context.

#Contextual Relevancy Score:  0.5
#Contextual Relevancy Reason:  The score is 0.50 because the context touches on lowering drug costs, but lacks specific proposals, mentioning only the speaker's stance and impact on the deficit. "The context mentions that prescription drug prices in America are the highest in the world, but it does not propose any measures to lower these costs." and "The context discusses the impact of the policy on the federal deficit and Medicare spending, but it does not mention any specific measures to lower prescription drug costs in America." accurately describe this limitation.


In [70]:
from deepeval.dataset import EvaluationDataset

# Load the manually written questions plus the RAG answers/contexts saved by the
# `pddataset.to_csv('datasets/manual_dataset_complete.csv', ...)` cell further down.
evaldataset = EvaluationDataset()
evaldataset.add_test_cases_from_csv_file(
    file_path="datasets/manual_dataset_complete.csv",
    input_col_name="Input",
    actual_output_col_name="Actual_Output",
    expected_output_col_name="Expected_Output",
#    context_col_name="context",
#    context_col_delimiter= "\n",
    retrieval_context_col_name="Retrieval_Context",
    # NOTE(review): the Retrieval_Context column is written from Python lists of passages,
    # and the passages themselves contain commas, so splitting on "," presumably does not
    # recover the original passage boundaries -- inspect test_cases[0].retrieval_context to confirm.
    retrieval_context_col_delimiter= ","
#    additional_metadata_col_name="source_file"
)

In [72]:
# Inspect the LLMTestCase objects parsed from the CSV
evaldataset.test_cases

[LLMTestCase(input='How does the President plan to grow the economy in his first speech, and what are examples of his successes in this area as shared in his next three speeches?', actual_output='The President plans to grow the economy by focusing on the bottom and middle class, rather than trickle-down economics.  His success in this area includes creating more than 1,300,000 new jobs in his first 100 days, a record 12 million new jobs in his first two years, and a 5.7% growth in the economy last year, which is the strongest growth in nearly 40 years. \n', expected_output='In his first speech, the President proposes that his American Jobs Plan will create millions of new jobs and trillions of dollars of economic growth to the economy by bringing Americans back to the workforce. He also says America should grow its economy from the bottom and middle out but does not say how to grow the economy that way. In his following speeches, he says that the American Rescue Plan has created over 6

In [74]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

# Earlier experiments, kept for reference:
#hallucination_metric = HallucinationMetric(threshold=0.3, model=custom_geminiflash)
#answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5, model=custom_geminiflash)
# Retriever metrics (the same three names are also created in the single-test-case cell above):
contextual_precision = ContextualPrecisionMetric(model=custom_geminiflash)
contextual_recall = ContextualRecallMetric(model=custom_geminiflash)
contextual_relevancy = ContextualRelevancyMetric(model=custom_geminiflash)

# Generation metrics:
answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash)

# The docs say "You can also call the evaluate() function directly" -- that did not work here:
#evaldataset_results = evaluate(evaldataset, [hallucination_metric, answer_relevancy_metric])
# Calling the evaluate function on an EvaluationDataset only permits the metrics parameter, not any others like throttle_value, so the call below results in rate-limiting errors:
#evaldataset_results = evaldataset.evaluate(metrics=[hallucination_metric, answer_relevancy_metric]) 

In [75]:
# Run log for the dataset-wide evaluation attempts:
# - Hit errors for all 5 metrics with throttle_value=60 and throttle_value=90.
# - Trying retriever metrics first now; later try answer_relevancy, faithfulness.
# - Started getting "resource exhausted" errors with contextual_relevancy, so tried just contextual_precision and contextual_recall.
# - Got through 4 test cases before "resource exhausted" errors on contextual_precision, so trying the metrics separately.

# For contextual precision, got a JSON validation error at the last test case:
# ValidationError: 1 validation error for Verdicts
# Invalid JSON: expected `,` or `}` at line 1 column 251 [type=json_invalid, input_value='{"verdicts": [{"verdict"... wrong they are.\' "}]}', input_type=str]

# Same type of Verdicts error for contextual_recall:
#ValidationError: 1 validation error for Verdicts
#  Invalid JSON: EOF while parsing a list at line 1 column 31251 [type=json_invalid, input_value='{"verdicts": [{"verdict"...d the middle out...\'"}', input_type=str]

# eval_contextprecision = evaluate(test_cases=evaldataset.test_cases, metrics=[contextual_precision], throttle_value=90)
# Trying without reason
# NOTE(review): contextual_recall is constructed with only `model=` in the visible cells, so nothing here
# visibly disables the reason -- confirm whether a reason-less metric was intended.
eval_contextrecall = evaluate(test_cases=evaldataset.test_cases, metrics=[contextual_recall], throttle_value=90)
#eval_contextrelevancy = evaluate(test_cases=evaldataset.test_cases, metrics=[contextual_relevancy], throttle_value=90)

# Still getting JSON errors (the recorded run below ends in InstructorRetryException); try working with BaseModel next week
# https://docs.confident-ai.com/docs/guides-using-custom-llms

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 11 test case(s) in parallel: |████████████████████████▌  | 91% (10/11) [Time Taken: 17:35, 105.56s/test case]


InstructorRetryException: RetryError[<Future at 0x7f248bdd2a10 state=finished raised ValidationError>]

In [31]:
# test RAGAS metrics
from deepeval import evaluate
from deepeval.metrics.ragas import RagasMetric
from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric
from deepeval.metrics.ragas import RAGASFaithfulnessMetric
from deepeval.metrics.ragas import RAGASContextualRecallMetric
from deepeval.metrics.ragas import RAGASContextualPrecisionMetric

# RAGAS-based metrics from deepeval.metrics.ragas, all using the custom Gemini wrappers.
# ragasmetric and ragas_ar also receive the custom embedding model; the other three take only the LLM.
ragasmetric = RagasMetric(model=custom_geminiflash, embeddings=custom_geminiembeddings)
ragas_ar = RAGASAnswerRelevancyMetric(model=custom_geminiflash, embeddings=custom_geminiembeddings)
ragas_f = RAGASFaithfulnessMetric(model=custom_geminiflash)
ragas_cr = RAGASContextualRecallMetric(model=custom_geminiflash)
ragas_cp = RAGASContextualPrecisionMetric(model=custom_geminiflash)

In [None]:
# TODO: stopped here -- this call failed with the combined RagasMetric (no output was recorded for this cell).
eval_ragas = evaluate(test_cases=evaldataset.test_cases, metrics=[ragasmetric], throttle_value=90)

In [None]:
# Same dataset, but with the four individual RAGAS metrics instead of the combined one (not yet run: no execution count)
eval_ragas_all = evaluate(test_cases=evaldataset.test_cases, metrics=[ragas_ar, ragas_f, ragas_cr, ragas_cp], throttle_value=90)

In [27]:
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

In [30]:
# "generic" evaluation of generation: score the single test case's answer for
# relevancy and faithfulness, printing each score and the judge's explanation.
answer_relevancy = AnswerRelevancyMetric(model=custom_geminiflash)
faithfulness = FaithfulnessMetric(model=custom_geminiflash)

generation_reports = (
    (answer_relevancy, "Answer Relevancy Score: ", "Answer Relevancy Reason: "),
    (faithfulness, "Faithfulness Score: ", "Faithfulness Reason: "),
)
for metric, score_label, reason_label in generation_reports:
    metric.measure(test_case)
    print(score_label, metric.score)
    print(reason_label, metric.reason)

Output()

Output()

Answer Relevancy Score:  1.0
Answer Relevancy Reason:  The score is 1.00 because the output provides the perfect JSON response based on the given schema, which is exactly what is needed.


Faithfulness Score:  1.0
Faithfulness Reason:  Amazing! You've got no contradictions, which means the actual output perfectly aligns with the retrieval context. Keep up the great work!


In [None]:
#Answer Relevancy Score:  1.0
#Answer Relevancy Reason:  The score is 1.00 because the provided context is missing the speaker's proposed measures to lower prescription drug costs, making it impossible to provide a relevant response.

#Faithfulness Score:  1.0
#Faithfulness Reason:  Great job! The 'actual output' perfectly aligns with the information in the 'retrieval context'. Keep up the amazing work! 

# rerun
#Answer Relevancy Score:  1.0
#Answer Relevancy Reason:  The speaker proposes the following measures to lower prescription drug costs in America:  * Allow Medicare to negotiate drug prices.  * Cap out-of-pocket drug costs for seniors.  *  Limit price increases on drugs to the rate of inflation.  * Encourage generic drug competition.

#Faithfulness Score:  1.0
#Faithfulness Reason:  The score is 1.00 because there are no contradictions

In [46]:
# Try generating a synthetic dataset with DeepEval
from deepeval.dataset import EvaluationDataset
from deepeval.synthesizer import Synthesizer

# Synthetic dataset generation with DeepEval's Synthesizer, using the custom Gemini LLM and embedder.
dataset = EvaluationDataset()
synthesizer = Synthesizer(model=custom_geminiflash, embedder=custom_geminiembeddings)
# Generation is commented out, so on a fresh run `dataset` stays as constructed above;
# the later `dataset.goldens` cell relies on goldens produced when this call was last executed.
#dataset.generate_goldens_from_docs(
#    synthesizer=synthesizer,
#    document_paths=['Speeches/titleedits/state_of_the_union_042921.txt', 'Speeches/titleedits/state_of_the_union_030122.txt', 
#                    'Speeches/titleedits/state_of_the_union_020723.txt', 'Speeches/titleedits/state_of_the_union_030724.txt'],
#    max_goldens_per_document=3
#)

Output()

In [52]:
#dataset.save_as(file_type="csv", directory=".")

Evaluation dataset saved at ./20240909_121535.csv!


'./20240909_121535.csv'

In [56]:
# Load the hand-written questions and expected answers; Actual_Output and Retrieval_Context are filled in below.
# The recorded head() shows an "Unnamed: 0" column, i.e. the CSV carries a saved index column that is read as data here.
pddataset = pd.read_csv('datasets/manual_dataset.csv', sep=',', index_col=None)

In [57]:
# Preview before filling in the RAG outputs (Actual_Output and Retrieval_Context are still empty)
pddataset.head()

Unnamed: 0,Input,Actual_Output,Expected_Output,Retrieval_Context
0,How does the President plan to grow the econom...,,"In his first speech, the President proposes th...",
1,Given Biden's statement about bipartisan coope...,,Some examples of bipartisan cooperation during...,
2,What does the President say his administration...,,The President does not note any instances of p...,
3,How has the administration improved housing is...,,The American Rescue Plan provided guards to ke...,


In [58]:
# Run every manual question through the RAG pipeline, one query per row of the Input column.
# query_engine is re-created here with the same top-10 setting as in the pipeline-setup cell.
query_engine = index.as_query_engine(similarity_top_k=10)
output = [query_engine.query(q) for q in pddataset['Input']]

In [59]:
# Split each query result into its answer text and the text content of its retrieved source nodes
answers_r = [result.response for result in output]
context_n = [
    [scored.node.get_content() for scored in result.source_nodes]
    for result in output
]

In [60]:
# Fill the two empty columns in place; each Retrieval_Context cell holds a Python list of passage strings.
pddataset['Retrieval_Context'] = context_n
pddataset['Actual_Output'] = answers_r

In [61]:
# Preview after filling in the RAG answers and retrieved contexts
pddataset.head()

Unnamed: 0,Input,Actual_Output,Expected_Output,Retrieval_Context
0,How does the President plan to grow the econom...,The President plans to grow the economy by foc...,"In his first speech, the President proposes th...","[And in the process, while this was all going ..."
1,Given Biden's statement about bipartisan coope...,The President highlights several instances of ...,Some examples of bipartisan cooperation during...,"[Investments in jobs and infrastructure, like ..."
2,What does the President say his administration...,The President mentions that the United States ...,The President does not note any instances of p...,"[And finally, the American Jobs Plan will be t..."
3,How has the administration improved housing is...,The administration has implemented several ini...,The American Rescue Plan provided guards to ke...,[I’ve cut red tape so more builders can get fe...


In [62]:
# Persist the completed dataset; this is the file read by evaldataset.add_test_cases_from_csv_file.
# Retrieval_Context cells are lists at this point, so they are written as their string form.
pddataset.to_csv('datasets/manual_dataset_complete.csv', index=False)

In [29]:
# TODO (stopped here 9/9): look into context vs retrieval_context.
# Loads the synthesizer-generated CSV using only the `context` column (split on ";");
# the retrieval_context columns are left commented out.
dataset2 = EvaluationDataset()
dataset2.add_test_cases_from_csv_file(
    file_path="datasets/synthesizer_speeches_dataset.csv",
    input_col_name="input",
    actual_output_col_name="actual_output",
    expected_output_col_name="expected_output",
    context_col_name="context",
    context_col_delimiter= ";",
#    retrieval_context_col_name="retrieval_context",
#    retrieval_context_col_delimiter= ";"
)

In [32]:
# Inspect the parsed test cases; in the recorded output actual_output and expected_output are nan
dataset2.test_cases

[LLMTestCase(input="How does the Violence Against Women Act, originally written by Biden, aim to prevent abusers from obtaining firearms through the 'boyfriend loophole'?", actual_output=nan, expected_output=nan, context=[' back.\n\nAnother thing: Let’s authorize the Violence Against Women Act, which has been law for 27 years.  (Applause.)  Twenty-seven years ago, I wrote it.  It’ll close the — the act that has to be authorized now will close the “boyfriend” loophole to keep guns out of the hands of abusers.  The court order said, “This is an abuser.  You can’t own a gun.”  It’s to close that loophole that existed. \n\nYou know, it’s estimated that 50 women are shot and killed by an intimate partner every month in America — 50 a month.  Let’s pass it and save some lives.  (Applause.)\n\nAnd I need not — I need not tell anyone this, but gun violence is becoming an epidemic in America.\n\nThe flag at the White House was still flying at half-mast for the 8 victims in the mass shooting in 

In [60]:
# NOTE(review): this passes `dataset.goldens` as test_cases. The recorded run shows 0/16 cases evaluated,
# and the goldens displayed below have actual_output=None and expected_output=None, so they presumably
# need converting to LLMTestCase objects with generated outputs first -- confirm against the DeepEval docs.
eval_goldens_recall = evaluate(test_cases=dataset.goldens, metrics=[contextual_recall], throttle_value=90)

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 16 test case(s) in parallel: |                                 |  0% (0/16) [Time Taken: 24:01, ?test case/s]



Overall Metric Pass Rates








In [61]:
# Display the synthesized dataset (its test cases and goldens).
# Fix: the cell previously read `dataset.` -- a dangling attribute access that is a SyntaxError.
dataset

EvaluationDataset(test_cases=[], goldens=[Golden(input="How does the Violence Against Women Act, originally written by Biden, aim to prevent abusers from obtaining firearms through the 'boyfriend loophole'?", actual_output=None, expected_output=None, context=[' back.\n\nAnother thing: Let’s authorize the Violence Against Women Act, which has been law for 27 years.  (Applause.)  Twenty-seven years ago, I wrote it.  It’ll close the — the act that has to be authorized now will close the “boyfriend” loophole to keep guns out of the hands of abusers.  The court order said, “This is an abuser.  You can’t own a gun.”  It’s to close that loophole that existed. \n\nYou know, it’s estimated that 50 women are shot and killed by an intimate partner every month in America — 50 a month.  Let’s pass it and save some lives.  (Applause.)\n\nAnd I need not — I need not tell anyone this, but gun violence is becoming an epidemic in America.\n\nThe flag at the White House was still flying at half-mast for 

In [37]:
# Additional ways to form json response

In [36]:
# Two-field schema for trying structured (JSON) extraction with the custom LLM wrapper.
# (Documented with comments rather than a docstring so the pydantic schema itself is unchanged.)
class User(BaseModel):
    name: str  # checked against "Tiki" in the assertion cell below
    age: int   # checked against 25 in the assertion cell below

In [61]:
# Request a structured User so the assertions have something to check.
# The cell that originally produced this object is missing from the notebook (the only visible
# `test` is a Response), so the prompt is reconstructed from the asserted values.
test = custom_geminiflash.generate(prompt="Extract the user: Tiki is 25 years old.", schema=User)
assert isinstance(test, User)
# Fix: User has no `resp` attribute; the field holding the name is `name`.
assert test.name == "Tiki"
assert test.age == 25