In [1]:
# Opt out of RAGAS usage tracking by setting its do-not-track environment variable.
# More info: https://github.com/explodinggradients/ragas/issues/49
# NOTE(review): presumably this has to run before `ragas` is imported — confirm.
import os
os.environ["RAGAS_DO_NOT_TRACK"] = "True"

In [2]:
import logging
import sys
import google.generativeai as genai
import pathlib
import textwrap
import ragas
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import pandas as pd
import faiss

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  """Wrap `text` in a Markdown blockquote for notebook display.

  Each '•' character is rewritten as '  *', then every line (blank lines
  included) is prefixed with '> ' and the result is wrapped in `Markdown`.
  """
  def _every_line(_line):
    # Quote blank lines too; textwrap.indent skips them by default.
    return True

  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=_every_line)
  return Markdown(quoted)

In [3]:
# Authenticate the google.generativeai client; os.environ[...] raises KeyError if GOOGLE_API_KEY is unset.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [4]:
# Display the RAGAS do-not-track flag (True in the recorded output).
# `_analytics` is a private module (leading underscore), so this call may break across ragas versions.
ragas._analytics.do_not_track()

True

In [7]:
# Gemini chat model; registered as the llama_index default LLM via Settings.llm in a later cell.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
# Smoke test (disabled):
#result = llm.invoke("Write me a party invitation to a one year's old's dinosaur birthday party.")

I0000 00:00:1724359789.530360     525 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


In [8]:
# create document database: one entry per non-blank line across the four speech transcripts
sotu = []
files = [
    "./Speeches/state_of_the_union_042921.txt",
    "./Speeches/state_of_the_union_030122.txt",
    "./Speeches/state_of_the_union_020723.txt",
    "./Speeches/state_of_the_union_030724.txt",
]
for speech_path in files:
    # The transcripts contain non-ASCII punctuation (curly quotes, em dashes), so decode
    # explicitly as UTF-8 rather than relying on the platform's default encoding.
    with open(speech_path, encoding="utf-8") as speech_file:
        for line in speech_file:
            stripped = line.rstrip()
            # Skip lines that are empty once trailing whitespace is removed.
            if stripped:
                sotu.append(stripped)

In [9]:
len(sotu)

1460

In [10]:
# One llama_index Document per non-blank speech line.
# NOTE: `documents` is rebound to LangChain documents in the RAGAS section further down
# (`documents = loader.load()`), so re-run this cell before using the commented-out index-build cell.
documents = [Document(text=line) for line in sotu]

In [11]:
documents[-1]

Document(id_='faf5696f-3134-41ad-86d5-b92e38911d62', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [12]:
# following these tutorials
# https://learnbybuilding.ai/tutorials/rag-chatbot-on-podcast-llamaindex-faiss-openai
# https://medium.com/@saurabhgssingh/understanding-rag-building-a-rag-system-from-scratch-with-gemini-api-b11ad9fc1bf7

# NOTE: this empty index is only consumed by the commented-out build cell below;
# the index that is actually queried is loaded from ./storage.
d = 768 # vector dimension of the FAISS index; must equal the output size of the embedding model (models/text-embedding-004, configured in a later cell) — TODO confirm it emits 768-dim vectors
faiss_index = faiss.IndexFlatL2(d)
print(faiss_index.is_trained)

True


In [13]:
# Register the Gemini embedding model as llama_index's global embed model.
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # optional: task_type="RETRIEVAL_DOCUMENT"
Settings.embed_model = doc_embeddings
# Quick sanity check (disabled); note the variable defined above is `doc_embeddings`:
#vector = doc_embeddings.embed_query("hello, world!")
#vector[:5]

In [14]:
# Send INFO-level log records to stdout.
# basicConfig already installs a stdout handler on the root logger; the extra
# StreamHandler that used to be added here made every record print twice
# (once as "INFO:logger:msg" and once as the bare message).
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [15]:
# Legacy llama_index configuration via ServiceContext, kept for reference only
# (note: `embededdings` below is a typo; the embedding model variable in this notebook is `doc_embeddings`):
#from llama_index import ServiceContext, set_global_service_context
#service_context = ServiceContext.from_defaults(llm=llm, embed_model=embededdings)
#set_global_service_context(service_context)

# Current approach: set the default LLM on the global Settings object.
Settings.llm = llm

In [16]:
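## Uncomment this cell when the documents need to be re-embedded and the index rebuilt
## (requires `documents` and `faiss_index` from the cells above); otherwise the next cell loads the persisted index from disk.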
#vector_store = FaissVectorStore(faiss_index=faiss_index)
#storage_context = StorageContext.from_defaults(vector_store=vector_store)
#index = VectorStoreIndex.from_documents(
#    documents, storage_context=storage_context, show_progress=True
#)
## save index to disk
#index.storage_context.persist()
#index

In [17]:
# load index from disk
# Single source of truth for the persisted-index location (previously repeated as a literal).
# Presumably this is where index.storage_context.persist() in the build cell writes by default — confirm.
PERSIST_DIR = "./storage"
vector_store = FaissVectorStore.from_persist_dir(PERSIST_DIR)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir=PERSIST_DIR
)
index = load_index_from_storage(storage_context=storage_context)

INFO:root:Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


In [18]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7f5c40847990>

In [19]:
# Query engine over the loaded index, configured with similarity_top_k=10.
# The same top-k value is repeated for the chat engine and the RAGAS answer-generation cell.
query_engine = index.as_query_engine(similarity_top_k=10)

In [None]:
query = "What has the President done related to healthcare?"
response = query_engine.query(query)

In [18]:
response.response

'The President has made significant efforts to improve healthcare access and affordability for Americans. They have expanded access to healthcare through the Affordable Care Act, lowered healthcare premiums for working families, and taken steps to reduce prescription drug costs. They have also re-ignited the Cancer Moonshot initiative, aimed at finding a cure for cancer. \n'

In [19]:
# Show each retrieved passage with its score.
# NOTE(review): in the recorded output the scores increase down the list, which suggests they are
# distances (lower = closer match), presumably L2 distances from the FAISS index — confirm.
for node in response.source_nodes:
    print(f"{node.get_score()} -> {node.text}")

0.7506474256515503 -> I’m pleased to say that more Americans have health insurance now than ever in history.
0.7599508166313171 -> During these 100 days, an additional 800,000 Americans enrolled in the Affordable Care Act when I established the special sign-up period to do that — 800,000 in that period.
0.7667317390441895 -> The Affordable Care Act has been a lifeline for millions of Americans, protecting people with preexisting conditions, protecting women’s health.  And the pandemic has demonstrated how badly — how badly it’s needed.  Let’s lower deductibles for working families on the Affordable Care — in the Affordable Care Act.  (Applause.)  And let’s lower prescription drug costs.  (Applause.)
0.796398401260376 -> A president, my predecessor, who failed the most basic duty. Any President owes the American people the duty to care.
0.801885724067688 -> Over one hundred million of you can no longer be denied health insurance because of pre-existing conditions.
0.8065693378448486 -> 

In [20]:
#result = genai.embed_content(
#    model="models/text-embedding-004",
#    content="What is the meaning of life?",
#    task_type="retrieval_document",
#    title="Embedding of single string")

In [20]:
# Chat engine over the same index using llama_index's 'context' chat mode with similarity_top_k=10.
# Presumably the retrieved passages are injected into the prompt for each message — confirm against the llama_index docs.
chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [21]:
query = "What does the President say about Congress during these 4 years?"
response = chat_engine.chat(query)

In [22]:
response.response

'The provided text doesn\'t offer any specific insights into the President\'s views on Congress over a 4-year period. \n\nHere\'s what we can glean from the excerpt:\n\n* **He acknowledges the potential for bipartisan cooperation:**  He states, "To my Republican friends, if we could work together in the last Congress, there is no reason we can’t work together in this new Congress." This suggests he believes collaboration is possible.\n* **He\'s optimistic about the future:**  He declares "the State of the Union is strong" and attributes this to the strength of the American people. This implies a positive outlook on the nation\'s future, which could be influenced by Congress\'s actions.\n\nHowever, the excerpt doesn\'t provide any specific criticisms or praise of Congress\'s actions during the past four years. To understand the President\'s full perspective, you\'d need to analyze more of his speeches, interviews, and official statements. \n'

In [23]:
print(response.response)

The provided text doesn't offer any specific insights into the President's views on Congress over a 4-year period. 

Here's what we can glean from the excerpt:

* **He acknowledges the potential for bipartisan cooperation:**  He states, "To my Republican friends, if we could work together in the last Congress, there is no reason we can’t work together in this new Congress." This suggests he believes collaboration is possible.
* **He's optimistic about the future:**  He declares "the State of the Union is strong" and attributes this to the strength of the American people. This implies a positive outlook on the nation's future, which could be influenced by Congress's actions.

However, the excerpt doesn't provide any specific criticisms or praise of Congress's actions during the past four years. To understand the President's full perspective, you'd need to analyze more of his speeches, interviews, and official statements. 



In [25]:
# List the passages retrieved for the chat response, one "score -> text" line each.
for source_node in response.source_nodes:
    score = source_node.get_score()
    print(f"{score} -> {source_node.text}")

0.7652876377105713 -> Mr. Speaker. Madam Vice President. Members of Congress. My Fellow Americans.
0.7749111652374268 -> Tonight I come to the same chamber to address the nation.
0.8328826427459717 -> A president, my predecessor, who failed the most basic duty. Any President owes the American people the duty to care.
0.8343376517295837 -> And I will always be a president for all Americans!
0.842502236366272 -> My Republican friends you owe it to the American people to get this bill done.
0.8441706895828247 -> And if my predecessor is watching instead of playing politics and pressuring members of Congress to block this bill, join me in telling Congress to pass it!
0.8499408960342407 -> And yes, my purpose tonight is to both wake up this Congress, and alert the American people that this is no ordinary moment either.
0.852456271648407 -> Meanwhile, my predecessor told the NRA he’s proud he did nothing on guns when he was President.
0.8647689819335938 -> I signed a bipartisan budget deal t

In [24]:
chat_engine.chat_history

[ChatMessage(role=<MessageRole.USER: 'user'>, content='What does the President say about Congress during these 4 years?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='The provided text doesn\'t offer any specific insights into the President\'s views on Congress over a 4-year period. \n\nHere\'s what we can glean from the excerpt:\n\n* **He acknowledges the potential for bipartisan cooperation:**  He states, "To my Republican friends, if we could work together in the last Congress, there is no reason we can’t work together in this new Congress." This suggests he believes collaboration is possible.\n* **He\'s optimistic about the future:**  He declares "the State of the Union is strong" and attributes this to the strength of the American people. This implies a positive outlook on the nation\'s future, which could be influenced by Congress\'s actions.\n\nHowever, the excerpt doesn\'t provide any specific criticisms or praise of Congress\'s action

In [25]:
# Follow-up in the same chat session (chat_engine keeps the earlier exchange in chat_history),
# this time stating explicitly that the corpus spans four years of speeches.
query2 = "The provided text are 4 years of speeches by the President. What does the President say about Congress during those 4 years?"
response2 = chat_engine.chat(query2)

In [26]:
print(response2.response)

You're right! I apologize for my previous response.  I was focusing too narrowly on the excerpt provided, not the entirety of the 4 years' worth of speeches. 

To accurately assess the President's views on Congress over four years, we would need to analyze a significant amount of text. Here's a framework for what we could look for:

* **Tone and Language:**  Does the President use positive or negative language when referring to Congress? Are there specific terms like "bipartisan," "gridlock," or "divided" that recur?
* **Specific Actions:** Does he praise or criticize specific actions taken by Congress, such as passing legislation, confirming appointments, or conducting investigations?
* **Calls to Action:**  Does he call on Congress to take specific actions or address particular issues?  
* **Legislative Successes:**  Does he highlight any legislation passed in collaboration with Congress? 
* **Themes and Priorities:**  Do his speeches reveal any overarching themes about how he views 

In [27]:
# Passages retrieved for the follow-up question, printed as "score -> text".
for retrieved in response2.source_nodes:
    print(f"{retrieved.get_score()} -> {retrieved.text}")

0.6518715620040894 -> Throughout our history, Presidents have come to this chamber to speak to Congress, to the nation, and to the world to declare war, to celebrate peace, to announce new plans and possibilities.
0.7356963157653809 -> Mr. Speaker. Madam Vice President. Members of Congress. My Fellow Americans.
0.748851478099823 -> So on this night, in our 245th year as a nation, I have come to report on the State of the Union.
0.7538517713546753 -> Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.
0.7632147073745728 -> Because the soul of this nation is strong, because the backbone of this nation is strong, because the people of this nation are strong, the State of the Union is strong.
0.7771207690238953 -> So I have come here to fulfil my constitutional duty to report on the state of the union. And here is my report.
0.7832998037338257 -> And my report is this: the State o

In [20]:
# Start of RAGAS implementation

In [21]:
# Generate synthetic test data

In [22]:
from langchain_community.document_loaders import DirectoryLoader

In [23]:
# Load the speech files from ./Speeches as LangChain documents for RAGAS test-set generation.
# NOTE: this rebinds `documents`, which earlier held the per-line llama_index Documents.
loader = DirectoryLoader("./Speeches")
documents = loader.load()

In [24]:
documents[3].metadata

{'source': 'Speeches/state_of_the_union_042921.txt'}

In [25]:
# Mirror each document's 'source' path under a 'filename' metadata key
# (presumably the key the RAGAS test-set generator looks for — confirm).
for doc in documents:
    doc.metadata.update(filename=doc.metadata['source'])

In [26]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
#from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [27]:
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo
import google.api_core.exceptions as google_exceptions



In [28]:
# Test-set generator backed by Gemini models.
# Note: this required a locally patched RAGAS (cloned, edited, then installed with `pip install -e`) because of a temperature issue:
# https://github.com/explodinggradients/ragas/pull/657/files

# ToDo: Is it acceptable to use the same model for generator and critic? Should these differ from the main RAG model?
# Third-party examples reuse the same model, so testing started that way.
# ToDo (future): redo test-set generation with different models than the one used for RAG.
generator_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=10) #, temperature=0.7, timeout=180, transport="rest"
critic_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", timeout=60)
# `embeddings` is also passed to ragas `evaluate(...)` near the end of the notebook.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") #transport="rest" #, request_options={"timeout": 10} #, request_options={"maxConcurrency": 5}

#LangchainLLMWrapper()
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

In [29]:
# Possible future improvement: add retry and request-rate limiting for Gemini so this runs with fewer quota (429) errors. References:
# https://github.com/google-gemini/cookbook/blob/dc168d968b61b0f1603b743bc2deb73723a159f7/quickstarts/Error_handling.ipynb
# https://medium.com/google-cloud/how-to-add-rate-limit-and-progress-bar-to-google-cloud-generative-ai-api-calls-502b89d35de8

In [30]:
# RAGAS Gemini github issues
#https://github.com/explodinggradients/ragas/issues/678
#https://github.com/explodinggradients/ragas/pull/657/files

In [31]:
import typing as t

In [32]:
from ragas.run_config import RunConfig
#from ragas.llms.base import llm_factory

# Conservative RAGAS run settings intended to reduce Gemini quota (429) errors:
# a single worker and up to 10 retries. timeout=60 and max_wait=180 are presumably
# seconds (per-call timeout and retry back-off cap respectively) — confirm against the RunConfig docs.
run_config = RunConfig(timeout=60, max_wait=180, max_workers=1, max_retries=10)  # timeout is 60 here (an earlier note said 120 seconds)

# Alternative (disabled): create the LLM with these settings via llm_factory
#llm = llm_factory(run_config=run_config)

In [33]:
# generate testset
#testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
#@on_exception(expo, google_exceptions.ResourceExhausted, max_tries=10) # if we receive exceptions from Google API, retry

#CALLS = 15
#RATE_LIMIT = 60
#@on_exception(expo, google_exceptions.ResourceExhausted, max_tries=10)
#@sleep_and_retry # If there are more request to this function than rate, sleep shortly
#@limits(calls=CALLS, period=RATE_LIMIT)
def generate_testset_rate(docs, test_size=10, distributions=None):
    """
    Generate a synthetic RAGAS test set from LangChain documents.

    Despite the name, no client-side rate limiting is applied: the
    ratelimit/backoff decorators above this function are commented out, so
    pacing comes only from the module-level `run_config`.

    Args:
        docs: documents passed to `generator.generate_with_langchain_docs`.
        test_size: number of test rows to request (default 10).
        distributions: mapping of RAGAS evolution type to its fraction of the
            test set. Defaults to 50% simple, 25% reasoning, 25% multi_context.

    Returns:
        Whatever `generator.generate_with_langchain_docs` returns.

    Raises:
        Errors from generation propagate, because `raise_exceptions=True`
        is passed through.
    """
    if distributions is None:
        # Built per call so the default dict is never shared or mutated across calls.
        distributions = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}
    testset = generator.generate_with_langchain_docs(
        docs,
        test_size=test_size,
        distributions=distributions,
        run_config=run_config,
        raise_exceptions=True,
        is_async=False,
    )
    return testset

In [32]:
# Dev log 8/21-8/22: switched to the run_config settings above following https://github.com/explodinggradients/ragas/issues/624.
# ToDo: This still produced many 429 "resource exhausted" responses, but it helped and the run finished. The request rate needs to be reduced further.
testset = generate_testset_rate(documents) # alternative considered: make_request_with_backoff()

embedding nodes:   0%|          | 0/80 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:ragas.testset.evolutions:seed question generated: What specific actions does the speaker propose to address gun violence in the United States? 

seed question generated: What specific actions does the speaker propose to address gun violence in the United States? 

INFO:ragas.testset.evolutions:rewritten question: "In President Biden's State of the Union address, what actions for gun violence prevention are mentioned after the speaker discusses the passage of the recent gun safety law?"
rewritten question: "In President Biden's State of the Union address, what actions for gun violence prevention are mentioned after the speaker discusses the passage of the recent gun safety law?"
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised Re

In [33]:
testset

TestDataset(test_data=[DataRow(question='What does the author argue is under attack regarding reproductive rights and what steps does the author call for to address this issue? \n', contexts=[' workers they need and families don’t wait decades to reunite.\n\nIt’s not only the right thing to do—it’s the economically smart thing to do.\n\nThat’s why immigration reform is supported by everyone from labor unions to religious leaders to the U.S. Chamber of Commerce.\n\nLet’s get it done once and for all.\n\nAdvancing liberty and justice also requires protecting the rights of women.\n\nThe constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before.\n\nIf we want to go forward—not backward—we must protect access to health care. Preserve a woman’s right to choose. And let’s continue to advance maternal health care in America.\n\nAnd for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state 

In [34]:
testset_pd = testset.to_pandas()

In [35]:
testset_pd

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What does the author argue is under attack reg...,[ workers they need and families don’t wait de...,The author argues that the constitutional righ...,simple,[{'source': 'Speeches/state_of_the_union_03012...,True
1,"In the speech from President Biden, why has ""t...",[ more than a million dollars a year and pay a...,"President Biden states that ""trickle-down econ...",simple,[{'source': 'Speeches/state_of_the_union_04292...,True
2,"What is the speaker's stance on the ""Buy Ameri...",[ right here in America where they belong!\n\n...,"The speaker believes in the ""Buy American"" pol...",simple,[{'source': 'Speeches/state_of_the_union_03072...,True
3,What measures does the speaker propose to lowe...,"[- a parent, a spouse, or child.\n\nAnd fourth...",The speaker proposes that Medicare should be g...,simple,[{'source': 'Speeches/state_of_the_union_04292...,True
4,What are President Biden's proposals for addre...,"[omes destroyed, neighborhoods in rubble, citi...",President Biden proposes a six-week ceasefire ...,simple,[{'source': 'Speeches/state_of_the_union_03072...,True
5,How does Biden compare a threat to American de...,"[’s argue over it, let’s debate it, but let’s ...",The context discusses the threat to American d...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True
6,What gun violence prevention actions did Presi...,[ job at another burger place to make a couple...,President Biden mentioned the need to ban assa...,reasoning,[{'source': 'Speeches/state_of_the_union_02072...,True
7,What economic policy did President Biden rejec...,"[- a parent, a spouse, or child.\n\nAnd fourth...",President Biden rejected trickle-down economic...,multi_context,[{'source': 'Speeches/state_of_the_union_04292...,True
8,How does the President's plan address societal...,[ workers they need and families don’t wait de...,The President's plan addresses societal issues...,multi_context,[{'source': 'Speeches/state_of_the_union_03012...,True
9,How will Biden address China's trade practices...,[ more than a million dollars a year and pay a...,Biden stated that he will defend America's int...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True


In [36]:
# Save the generated test set to CSV (at this point it has no `answer` column yet).
testset_pd.to_csv('testset_flash_pro15.csv', index=False)

In [37]:
# Print every column of the test set as its own Series.
for column_name in testset_pd.columns:
    column = testset_pd[column_name]
    print(column)

0    What does the author argue is under attack reg...
1    In the speech from President Biden, why has "t...
2    What is the speaker's stance on the "Buy Ameri...
3    What measures does the speaker propose to lowe...
4    What are President Biden's proposals for addre...
5    How does Biden compare a threat to American de...
6    What gun violence prevention actions did Presi...
7    What economic policy did President Biden rejec...
8    How does the President's plan address societal...
9    How will Biden address China's trade practices...
Name: question, dtype: object
0    [ workers they need and families don’t wait de...
1    [ more than a million dollars a year and pay a...
2    [ right here in America where they belong!\n\n...
3    [- a parent, a spouse, or child.\n\nAnd fourth...
4    [omes destroyed, neighborhoods in rubble, citi...
5    [’s argue over it, let’s debate it, but let’s ...
6    [ job at another burger place to make a couple...
7    [- a parent, a spouse, or chil

In [34]:
# Reload the test set (including answers) from CSV instead of regenerating it.
# NOTE: this file is written by the `to_csv('testset_answer_flash_pro15.csv', ...)` cell further down,
# so on a fresh Restart & Run All it must already exist on disk from a previous run.
testset_pd = pd.read_csv("testset_answer_flash_pro15.csv", index_col = None)

In [35]:
import ast
# Use after importing testset_pd from CSV: the 'contexts' column is read back as the
# string repr of a list, but it needs to be an actual list.
# Only strings are parsed, so re-running this cell (or running it on a frame that came
# straight from testset.to_pandas()) leaves existing lists untouched instead of raising
# ValueError from ast.literal_eval.
testset_pd['contexts'] = testset_pd['contexts'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [36]:
testset_pd

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done,answer
0,What does the author argue is under attack reg...,[ workers they need and families don’t wait de...,The author argues that the constitutional righ...,simple,[{'source': 'Speeches/state_of_the_union_03012...,True,The author argues that the constitutional righ...
1,"In the speech from President Biden, why has ""t...",[ more than a million dollars a year and pay a...,"President Biden states that ""trickle-down econ...",simple,[{'source': 'Speeches/state_of_the_union_04292...,True,"According to the speech, trickle-down economic..."
2,"What is the speaker's stance on the ""Buy Ameri...",[ right here in America where they belong!\n\n...,"The speaker believes in the ""Buy American"" pol...",simple,[{'source': 'Speeches/state_of_the_union_03072...,True,"The speaker believes that the ""Buy American"" p..."
3,What measures does the speaker propose to lowe...,"[- a parent, a spouse, or child.\n\nAnd fourth...",The speaker proposes that Medicare should be g...,simple,[{'source': 'Speeches/state_of_the_union_04292...,True,The speaker proposes giving Medicare the power...
4,What are President Biden's proposals for addre...,"[omes destroyed, neighborhoods in rubble, citi...",President Biden proposes a six-week ceasefire ...,simple,[{'source': 'Speeches/state_of_the_union_03072...,True,The President proposes a two-state solution as...
5,How does Biden compare a threat to American de...,"[’s argue over it, let’s debate it, but let’s ...",The context discusses the threat to American d...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True,Empty Response
6,What gun violence prevention actions did Presi...,[ job at another burger place to make a couple...,President Biden mentioned the need to ban assa...,reasoning,[{'source': 'Speeches/state_of_the_union_02072...,True,President Biden mentioned the following gun vi...
7,What economic policy did President Biden rejec...,"[- a parent, a spouse, or child.\n\nAnd fourth...",President Biden rejected trickle-down economic...,multi_context,[{'source': 'Speeches/state_of_the_union_04292...,True,"The President rejected trickle-down economics,..."
8,How does the President's plan address societal...,[ workers they need and families don’t wait de...,The President's plan addresses societal issues...,multi_context,[{'source': 'Speeches/state_of_the_union_03012...,True,The President's plan addresses societal issues...
9,How will Biden address China's trade practices...,[ more than a million dollars a year and pay a...,Biden stated that he will defend America's int...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True,The speaker will maintain a strong military pr...


In [37]:
testset_pd['contexts'][0]

[' workers they need and families don’t wait decades to reunite.\n\nIt’s not only the right thing to do—it’s the economically smart thing to do.\n\nThat’s why immigration reform is supported by everyone from labor unions to religious leaders to the U.S. Chamber of Commerce.\n\nLet’s get it done once and for all.\n\nAdvancing liberty and justice also requires protecting the rights of women.\n\nThe constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before.\n\nIf we want to go forward—not backward—we must protect access to health care. Preserve a woman’s right to choose. And let’s continue to advance maternal health care in America.\n\nAnd for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong.\n\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so y

In [38]:
# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# Answer_relevancy - Measures how relevant the answer is to the question.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

In [39]:
from ragas import evaluate
from datasets import Dataset

In [40]:
# per this ragas thread, need to convert pandas testset to Dataset format for evaluate to work
# https://github.com/explodinggradients/ragas/issues/803
testset_ds = Dataset.from_pandas(testset_pd)

In [41]:
testset_ds

Dataset({
    features: ['question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', 'episode_done', 'answer'],
    num_rows: 10
})

In [43]:
testset_pd['question'][5]

"How does Biden compare a threat to American democracy to terrorism, and what's his plan to address it?"

In [48]:
# Generate the `answer` column required for evaluation, per these two issues:
# https://github.com/explodinggradients/ragas/issues/1145
# https://github.com/explodinggradients/ragas/issues/1084#issuecomment-2248219601
# This re-creates the same query engine configuration used earlier in the notebook (similarity_top_k=10).
query_engine = index.as_query_engine(similarity_top_k=10)
#query = "What has the President done related to healthcare?"
#response = query_engine.query(query)

In [50]:
# Run every generated question through the RAG query engine, one query per question.
# ToDo (original note): use [llm.invoke(question).content for question in generator["question"]]
answers = [query_engine.query(q) for q in testset_pd['question']]

  warn_deprecated(


In [51]:
# Keep only the response text of each query result.
answers_r = [answer.response for answer in answers]

In [52]:
answers_r

['The author argues that the constitutional right to choose, established by Roe v. Wade, is under attack. They call for Congress to codify Roe v. Wade and restore the right to choose, protecting access to healthcare and advancing maternal health care. \n',
 'According to the speech, trickle-down economics has failed to work because it has led to lower wages, bigger deficits, and a wider gap between the wealthy and everyone else. Instead, the speaker advocates for an economic approach that focuses on building the economy from the bottom up and the middle out. \n',
 'The speaker believes that the "Buy American" policy should be strictly enforced and that past administrations have failed to do so. They argue that the policy has been in place since the 1930s and that taxpayer dollars should be used to purchase American products, which in turn will create American jobs. \n',
 'The speaker proposes giving Medicare the power to negotiate lower drug prices, capping prescription drug costs at $

In [53]:
# Attach the generated answers to the test set (mutates testset_pd in place).
# Note: the saved results show row 5 with the literal answer "Empty Response".
testset_pd['answer'] = answers_r

In [79]:
# Print every field of the row labelled 1 (the second test case), one column at a time.
for column_name in testset_pd.columns:
    print(testset_pd[column_name].loc[1])

In the speech from President Biden, why has "trickle-down economics" not worked, and what economic approach does he advocate for instead?
[' more than a million dollars a year and pay a lower tax rate on their capital gains than Americans who receive a paycheck. We’re only going to affect three tenths of 1 percent of all Americans by that action. Three tenths of 1 percent.\n\nAnd the IRS is going to crack down on millionaires and billionaires who cheat on their taxes. It’s estimated to be billions of dollars by think tanks that are left, right, and center.\n\nI’m not looking to punish anybody. But I will not add a tax burden — an additional tax burden to the middle class in this country. They’re already paying enough. I believe what I propose is fair — (applause) — fiscally responsible, and it raises revenue to pay for the plans I have proposed, and will create millions of jobs that will grow the economy and enhance our financial standing in the country.\n\nWhen you hear someone say th

In [62]:
# Save the test set with answers; this is the file the earlier `pd.read_csv("testset_answer_flash_pro15.csv", ...)` cell loads.
testset_pd.to_csv('testset_answer_flash_pro15.csv', index=False)

In [81]:
# ToDo: Check whether the RAG LLM should be given only the contexts stored in the dataset rather than the full retrieved context. Use the second example (row index 1) from testset.csv as a test case.

In [42]:
# Re-create the Dataset from the current testset_pd (ragas `evaluate` is given a Dataset, not a DataFrame).
testset_ds = Dataset.from_pandas(testset_pd)

In [43]:
testset_ds

Dataset({
    features: ['question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', 'episode_done', 'answer'],
    num_rows: 10
})

In [44]:
# Judge LLM for the RAGAS metrics. Embeddings are not re-created here: the `embeddings`
# object from the test-set generator cell is reused in the evaluate call.
ragas_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=10)
#embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [45]:
# Display the run configuration that is passed to evaluate() below
# (presumably created in an earlier cell — verify on a fresh kernel).
run_config

RunConfig(timeout=60, max_retries=10, max_wait=180, max_workers=1, exception_types=(<class 'Exception'>,), log_tenacity=False, seed=42)

In [60]:
# ToDo: Need to add rate limiting
# (the recorded run of this cell shows repeated "429 Resource has been exhausted" retries).
#
# Score the test set on the four RAGAS metrics, with ragas_llm as the LLM and the
# shared embeddings / run_config objects.
ragas_metrics = [context_precision, faithfulness, answer_relevancy, context_recall]
result = evaluate(
    testset_ds, metrics=ragas_metrics, llm=ragas_llm, embeddings=embeddings, run_config=run_config
)

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

No statements were generated from the answer.
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying 

In [61]:
# Results for the new test set (testset_flash_pro15.csv):
# questions generated by Gemini 1.5 Flash, critiqued by Gemini 1.5 Pro.
# Note: a warning was raised for the answer where there was no response from the LLM;
# that definitely reduced the faithfulness score.
result

{'context_precision': 0.7500, 'faithfulness': 0.5944, 'answer_relevancy': 0.5821, 'context_recall': 1.0000}

In [None]:
# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# 0.5944 indicates that some parts of the answers are being made up...
# which, given that the correct answer is supposedly always in the context, means the LLM is taking some liberties.
# There is another faithfulness metric: from ragas.metrics import FaithulnesswithHHEM
# See below for the code; its result was {'faithfulness_with_hhem': 0.5241}.
# This seems to agree with the RAGAS faithfulness score in that answers seem to be partially made up.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# At 0.75, seems fairly relevant but could be better. In my review, the context for each question is pretty long... 
# may just be because these are speeches? There's a lot of unneeded context in each one though.
# Answer_relevancy - Measures how relevant the answer is to the question.
# 0.5821 seems low, just going off of the number.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.
# This being 1.0 means that the context always contains the answer to the question... that could be an artifact of the question generation process.

In [46]:
# Second faithfulness metric, backed by the HHEM model. In the recorded run this cell
# downloaded the vectara/hallucination_evaluation_model files from Hugging Face
# (about 439 MB of weights), so the first execution needs network access.
# NOTE(review): "Faithulness" (sic) is the spelling that imported successfully here —
# check the installed ragas version before "correcting" it.
from ragas.metrics import FaithulnesswithHHEM
faithfulness_with_hhem = FaithulnesswithHHEM()

config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

configuration_hhem_v2.py:   0%|          | 0.00/760 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/vectara/hallucination_evaluation_model:
- configuration_hhem_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


modeling_hhem_v2.py:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/vectara/hallucination_evaluation_model:
- modeling_hhem_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



In [47]:
# Evaluate the same test set with only the HHEM-based faithfulness metric.
# Note: the Hugging Face model card says the "Token indices sequence length" warning
# is normal and an artifact, so it is being ignored here:
# https://huggingface.co/vectara/hallucination_evaluation_model
# NOTE(review): the recorded run also logged "Exception raised in Job[5]: IndexError(list index out of range)";
# confirm whether that failure is related to the warning and how it affects the reported score.
result_faithfulness_hhem = evaluate(
    testset_ds,
    metrics=[faithfulness_with_hhem],
    llm = ragas_llm,
    embeddings=embeddings, 
    run_config=run_config
)

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (997 > 512). Running this sequence through the model will result in indexing errors


ERROR:ragas.executor:Exception raised in Job[5]: IndexError(list index out of range)
Exception raised in Job[5]: IndexError(list index out of range)


In [49]:
# HHEM-based faithfulness for the test set; recorded value: {'faithfulness_with_hhem': 0.5241}
# This seems to agree with the RAGAS faithfulness score (0.5944) in that answers seem to be partially made up. :(
result_faithfulness_hhem

{'faithfulness_with_hhem': 0.5241}

In [60]:
# Results for the old test set (testset.csv), where everything was generated by Gemini 1.5 Flash.
# NOTE(review): the output shown was recorded from an earlier run. `result` is the same
# variable that holds the new-test-set scores, so re-running this cell displays whatever
# `result` currently contains rather than the old numbers.
result

{'context_precision': 0.8571, 'faithfulness': 0.8833, 'answer_relevancy': 0.7234, 'context_recall': 1.0000}

In [50]:
# 8/22 Note: From my review of some of the question/ground-truth/context/answer rows,
# I think the LLM is being given the full context of all speeches when producing its answers, rather than just the context in the dataset.
# I need to generate a new round of answers using only the context given in the dataset.
# This is probably best done back at the testset generation phase..... Start here tomorrow.

# Though there is a new issue about context/answers being recalculated, so maybe this doesn't even matter ;) lol
# Watch this issue: https://github.com/explodinggradients/ragas/issues/1211

In [None]:
# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# Answer_relevancy - Measures how relevant the answer is to the question.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.