In [26]:
# set do not track variable to RAGAS
# more info: https://github.com/explodinggradients/ragas/issues/49
import os
os.environ["RAGAS_DO_NOT_TRACK"] = "True"

In [2]:
import logging
import sys
import google.generativeai as genai
import pathlib
import textwrap
import ragas
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import pandas as pd
import faiss

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  """Wrap ``text`` in an IPython ``Markdown`` object rendered as a block quote.

  Bullet characters ('•') are rewritten as Markdown list markers, then every
  line (blank lines included, because the predicate always returns True) is
  prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [3]:
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [4]:
ragas._analytics.do_not_track()

True

In [5]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
#result = llm.invoke("Write me a party invitation to a one year's old's dinosaur birthday party.")

I0000 00:00:1724417035.249609     591 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


In [6]:
# create document database: one entry per non-blank line of every speech file
sotu = []
files = ["./Speeches/state_of_the_union_042921.txt", "./Speeches/state_of_the_union_030122.txt", "./Speeches/state_of_the_union_020723.txt", "./Speeches/state_of_the_union_030724.txt"]
for speech_path in files:
    # Decode explicitly instead of relying on the platform default encoding:
    # the transcripts contain curly quotes and em dashes, which are mis-decoded
    # (or raise) under a non-UTF-8 locale. Assumes the files are stored as UTF-8.
    with open(speech_path, encoding="utf-8") as file:
        for line in file:
            nl = line.rstrip()
            # skip blank lines so they do not become empty documents
            if nl != '':
                sotu.append(nl)

In [7]:
len(sotu)

1460

In [8]:
documents = [Document(text=line) for line in sotu]

In [9]:
documents[-1]

Document(id_='33453b68-a0a6-4927-ba92-32ebda65d7b5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='May God protect our troops.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [10]:
# following these tutorials
# https://learnbybuilding.ai/tutorials/rag-chatbot-on-podcast-llamaindex-faiss-openai
# https://medium.com/@saurabhgssingh/understanding-rag-building-a-rag-system-from-scratch-with-gemini-api-b11ad9fc1bf7

# Vector dimensionality of the FAISS index. It has to equal the output size of
# the embedding model used in this notebook ("models/text-embedding-004",
# configured in the next cell).
# NOTE(review): 768 is assumed to be that model's output size — confirm.
d = 768
# Flat (exhaustive) L2-distance index; printing is_trained shows whether it can accept vectors as-is.
faiss_index = faiss.IndexFlatL2(d)
print(faiss_index.is_trained)

True


In [11]:
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # optional: task_type="RETRIEVAL_DOCUMENT"
Settings.embed_model = doc_embeddings
#vector = embeddings.embed_query("hello, world!")
#vector[:5]

In [12]:
# Send INFO-level log records to stdout through a single root handler.
# basicConfig(stream=...) already installs a stdout StreamHandler on the root
# logger; attaching a second one by hand made every record print twice.
# force=True replaces handlers left over from a previous run of this cell,
# so re-running it does not stack additional handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [13]:
#from llama_index import ServiceContext, set_global_service_context
#service_context = ServiceContext.from_defaults(llm=llm, embed_model=embededdings)
#set_global_service_context(service_context)

Settings.llm = llm

In [14]:
## uncomment for when you need to re-embed and vectorize documents
## otherwise, doing local loading below
#vector_store = FaissVectorStore(faiss_index=faiss_index)
#storage_context = StorageContext.from_defaults(vector_store=vector_store)
#index = VectorStoreIndex.from_documents(
#    documents, storage_context=storage_context, show_progress=True
#)
## save index to disk
#index.storage_context.persist()
#index

In [15]:
# load index from disk
# Restores the persisted FAISS vector store and the index from ./storage
# instead of re-embedding the speeches (see the commented-out build cell above).
# NOTE(review): queries presumably still embed with Settings.embed_model, so it
# should be the same model that was used when the index was built — verify.
vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
index = load_index_from_storage(storage_context=storage_context)

INFO:root:Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


In [16]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7f63bc166990>

In [19]:
query_engine = index.as_query_engine(similarity_top_k=10)

In [None]:
query = "What has the President done related to healthcare?"
response = query_engine.query(query)

In [18]:
response.response

'The President has made significant efforts to improve healthcare access and affordability for Americans. They have expanded access to healthcare through the Affordable Care Act, lowered healthcare premiums for working families, and taken steps to reduce prescription drug costs. They have also re-ignited the Cancer Moonshot initiative, aimed at finding a cure for cancer. \n'

In [19]:
# Show each retrieved chunk next to the score reported by the retriever.
# NOTE(review): the index was built on faiss.IndexFlatL2, so these scores are
# presumably L2 distances (smaller = closer match), not similarities — confirm.
for node in response.source_nodes:
    print(f"{node.get_score()} -> {node.text}")

0.7506474256515503 -> I’m pleased to say that more Americans have health insurance now than ever in history.
0.7599508166313171 -> During these 100 days, an additional 800,000 Americans enrolled in the Affordable Care Act when I established the special sign-up period to do that — 800,000 in that period.
0.7667317390441895 -> The Affordable Care Act has been a lifeline for millions of Americans, protecting people with preexisting conditions, protecting women’s health.  And the pandemic has demonstrated how badly — how badly it’s needed.  Let’s lower deductibles for working families on the Affordable Care — in the Affordable Care Act.  (Applause.)  And let’s lower prescription drug costs.  (Applause.)
0.796398401260376 -> A president, my predecessor, who failed the most basic duty. Any President owes the American people the duty to care.
0.801885724067688 -> Over one hundred million of you can no longer be denied health insurance because of pre-existing conditions.
0.8065693378448486 -> 

In [20]:
#result = genai.embed_content(
#    model="models/text-embedding-004",
#    content="What is the meaning of life?",
#    task_type="retrieval_document",
#    title="Embedding of single string")

In [20]:
chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [21]:
query = "What does the President say about Congress during these 4 years?"
response = chat_engine.chat(query)

In [22]:
response.response

'The provided text doesn\'t offer any specific insights into the President\'s views on Congress over a 4-year period. \n\nHere\'s what we can glean from the excerpt:\n\n* **He acknowledges the potential for bipartisan cooperation:**  He states, "To my Republican friends, if we could work together in the last Congress, there is no reason we can’t work together in this new Congress." This suggests he believes collaboration is possible.\n* **He\'s optimistic about the future:**  He declares "the State of the Union is strong" and attributes this to the strength of the American people. This implies a positive outlook on the nation\'s future, which could be influenced by Congress\'s actions.\n\nHowever, the excerpt doesn\'t provide any specific criticisms or praise of Congress\'s actions during the past four years. To understand the President\'s full perspective, you\'d need to analyze more of his speeches, interviews, and official statements. \n'

In [23]:
print(response.response)

The provided text doesn't offer any specific insights into the President's views on Congress over a 4-year period. 

Here's what we can glean from the excerpt:

* **He acknowledges the potential for bipartisan cooperation:**  He states, "To my Republican friends, if we could work together in the last Congress, there is no reason we can’t work together in this new Congress." This suggests he believes collaboration is possible.
* **He's optimistic about the future:**  He declares "the State of the Union is strong" and attributes this to the strength of the American people. This implies a positive outlook on the nation's future, which could be influenced by Congress's actions.

However, the excerpt doesn't provide any specific criticisms or praise of Congress's actions during the past four years. To understand the President's full perspective, you'd need to analyze more of his speeches, interviews, and official statements. 



In [25]:
for node in response.source_nodes:
    print(f"{node.get_score()} -> {node.text}")

0.7652876377105713 -> Mr. Speaker. Madam Vice President. Members of Congress. My Fellow Americans.
0.7749111652374268 -> Tonight I come to the same chamber to address the nation.
0.8328826427459717 -> A president, my predecessor, who failed the most basic duty. Any President owes the American people the duty to care.
0.8343376517295837 -> And I will always be a president for all Americans!
0.842502236366272 -> My Republican friends you owe it to the American people to get this bill done.
0.8441706895828247 -> And if my predecessor is watching instead of playing politics and pressuring members of Congress to block this bill, join me in telling Congress to pass it!
0.8499408960342407 -> And yes, my purpose tonight is to both wake up this Congress, and alert the American people that this is no ordinary moment either.
0.852456271648407 -> Meanwhile, my predecessor told the NRA he’s proud he did nothing on guns when he was President.
0.8647689819335938 -> I signed a bipartisan budget deal t

In [24]:
chat_engine.chat_history

[ChatMessage(role=<MessageRole.USER: 'user'>, content='What does the President say about Congress during these 4 years?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='The provided text doesn\'t offer any specific insights into the President\'s views on Congress over a 4-year period. \n\nHere\'s what we can glean from the excerpt:\n\n* **He acknowledges the potential for bipartisan cooperation:**  He states, "To my Republican friends, if we could work together in the last Congress, there is no reason we can’t work together in this new Congress." This suggests he believes collaboration is possible.\n* **He\'s optimistic about the future:**  He declares "the State of the Union is strong" and attributes this to the strength of the American people. This implies a positive outlook on the nation\'s future, which could be influenced by Congress\'s actions.\n\nHowever, the excerpt doesn\'t provide any specific criticisms or praise of Congress\'s action

In [25]:
query2 = "The provided text are 4 years of speeches by the President. What does the President say about Congress during those 4 years?"
response2 = chat_engine.chat(query2)

In [26]:
print(response2.response)

You're right! I apologize for my previous response.  I was focusing too narrowly on the excerpt provided, not the entirety of the 4 years' worth of speeches. 

To accurately assess the President's views on Congress over four years, we would need to analyze a significant amount of text. Here's a framework for what we could look for:

* **Tone and Language:**  Does the President use positive or negative language when referring to Congress? Are there specific terms like "bipartisan," "gridlock," or "divided" that recur?
* **Specific Actions:** Does he praise or criticize specific actions taken by Congress, such as passing legislation, confirming appointments, or conducting investigations?
* **Calls to Action:**  Does he call on Congress to take specific actions or address particular issues?  
* **Legislative Successes:**  Does he highlight any legislation passed in collaboration with Congress? 
* **Themes and Priorities:**  Do his speeches reveal any overarching themes about how he views 

In [27]:
for node in response2.source_nodes:
    print(f"{node.get_score()} -> {node.text}")

0.6518715620040894 -> Throughout our history, Presidents have come to this chamber to speak to Congress, to the nation, and to the world to declare war, to celebrate peace, to announce new plans and possibilities.
0.7356963157653809 -> Mr. Speaker. Madam Vice President. Members of Congress. My Fellow Americans.
0.748851478099823 -> So on this night, in our 245th year as a nation, I have come to report on the State of the Union.
0.7538517713546753 -> Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.
0.7632147073745728 -> Because the soul of this nation is strong, because the backbone of this nation is strong, because the people of this nation are strong, the State of the Union is strong.
0.7771207690238953 -> So I have come here to fulfil my constitutional duty to report on the state of the union. And here is my report.
0.7832998037338257 -> And my report is this: the State o

In [30]:
# Start of RAGAS implementation

In [31]:
# Generate synthetic test data
# Note: When generating a synthetic test dataset, the columns generated are 
# 'question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', 'episode_done'
# We have to generate the answer separately with our RAG, which then (obviously) generates new context used.
# My best guess is to use the context that was used to generate the answer for the metrics calculation.
# Thus, in the below example, I keep the old contexts column, but for evaluating RAG, I use the new context to calculate the metrics.
# I can't find good documentation to confirm this; the following issues are close:
# https://github.com/explodinggradients/ragas/issues/1145
# https://github.com/explodinggradients/ragas/issues/1084

# Possible future edit:
# The best thing to do would be to generate the answer when creating the synthetic test dataset, but this is no longer done (perhaps done in RAGAS 1.0?)
# Ground truth is supposed to be the 'human' level answer vs the RAG answer

# Relevant warning from the same issue 1084:
# "Since you use the same LLM to generate your synthetic dataset ground_truth and your answer, 
# I think the results of this evaluation might be biased. I haven't realized a comparative study 
# but it might be an issue which could have an impact on your interpretation."

In [32]:
from langchain_community.document_loaders import DirectoryLoader

In [33]:
# Load the files under ./Speeches as LangChain documents for test-set generation.
# NOTE: this rebinds `documents`, which earlier in the notebook held the
# per-line llama_index Document objects. Re-running the (commented-out) index
# build after this cell would pick up these LangChain documents instead.
loader = DirectoryLoader("./Speeches")
documents = loader.load()

In [34]:
documents[3].metadata

{'source': 'Speeches/state_of_the_union_042921.txt'}

In [35]:
# Mirror each document's 'source' path into a 'filename' metadata key.
# NOTE(review): presumably needed because the ragas test-set generator looks
# for a 'filename' key on each document — confirm against the installed ragas version.
for document in documents:
    document.metadata['filename'] = document.metadata['source']

In [36]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

In [38]:
# generator with Gemini models
# Note: had to edit underlying RAGAS library (cloned locally, edited files, then pip -e installed locally) for this issue re: temperature:
# https://github.com/explodinggradients/ragas/pull/657/files

generator_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=10) #, temperature=0.7, timeout=180, transport="rest"
critic_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", timeout=60)
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") #transport="rest" #, request_options={"timeout": 10} #, request_options={"maxConcurrency": 5}

#LangchainLLMWrapper()
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

In [40]:
# RAGAS Gemini github issues
#https://github.com/explodinggradients/ragas/issues/678
#https://github.com/explodinggradients/ragas/pull/657/files

In [41]:
import typing as t

In [42]:
from ragas.run_config import RunConfig

# Throttle the ragas executor to stay under the Gemini rate limits:
# one worker, up to 10 retries, timeout=60 and max_wait=180.
# NOTE(review): per the ragas RunConfig docs, `timeout` is the per-call timeout
# in seconds and `max_wait` caps the wait between retries — so the timeout here
# is 60 s, not 180 s. Confirm for the installed ragas version.
run_config = RunConfig(timeout=60, max_wait=180, max_workers=1, max_retries=10)

In [29]:
# generate testset
def generate_testset_rate(docs, test_size=10, distributions=None):
    """
    Generate a synthetic RAGAS test set from LangChain documents.

    This function does no throttling of its own: request pacing comes from the
    module-level ``run_config`` (single worker, retries), which is passed
    through to the generator. Exceptions are re-raised rather than swallowed
    (``raise_exceptions=True``) and generation runs synchronously
    (``is_async=False``).

    Args:
        docs: LangChain documents to generate questions from.
        test_size: Number of test samples to generate. Defaults to 10.
        distributions: Mapping of ragas evolution type to the fraction of
            samples of that type. Defaults to 50% ``simple``, 25% ``reasoning``
            and 25% ``multi_context``.

    Returns:
        The test set object returned by
        ``generator.generate_with_langchain_docs``.
    """
    if distributions is None:
        # Built per call so the default dict is never shared between calls.
        distributions = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}
    return generator.generate_with_langchain_docs(
        docs,
        test_size=test_size,
        distributions=distributions,
        run_config=run_config,
        raise_exceptions=True,
        is_async=False,
    )

In [None]:
# The RAGAS internal RunConfig settings do a decent job at limiting the 429 resource exhausted warnings when max_workers=1
# Tried ratelimit and backoff libraries in Python... didn't affect it enough, got so many warnings that it wouldn't finish
testset = generate_testset_rate(documents) 

In [34]:
testset_pd = testset.to_pandas()

In [35]:
testset_pd

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What does the author argue is under attack reg...,[ workers they need and families don’t wait de...,The author argues that the constitutional righ...,simple,[{'source': 'Speeches/state_of_the_union_03012...,True
1,"In the speech from President Biden, why has ""t...",[ more than a million dollars a year and pay a...,"President Biden states that ""trickle-down econ...",simple,[{'source': 'Speeches/state_of_the_union_04292...,True
2,"What is the speaker's stance on the ""Buy Ameri...",[ right here in America where they belong!\n\n...,"The speaker believes in the ""Buy American"" pol...",simple,[{'source': 'Speeches/state_of_the_union_03072...,True
3,What measures does the speaker propose to lowe...,"[- a parent, a spouse, or child.\n\nAnd fourth...",The speaker proposes that Medicare should be g...,simple,[{'source': 'Speeches/state_of_the_union_04292...,True
4,What are President Biden's proposals for addre...,"[omes destroyed, neighborhoods in rubble, citi...",President Biden proposes a six-week ceasefire ...,simple,[{'source': 'Speeches/state_of_the_union_03072...,True
5,How does Biden compare a threat to American de...,"[’s argue over it, let’s debate it, but let’s ...",The context discusses the threat to American d...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True
6,What gun violence prevention actions did Presi...,[ job at another burger place to make a couple...,President Biden mentioned the need to ban assa...,reasoning,[{'source': 'Speeches/state_of_the_union_02072...,True
7,What economic policy did President Biden rejec...,"[- a parent, a spouse, or child.\n\nAnd fourth...",President Biden rejected trickle-down economic...,multi_context,[{'source': 'Speeches/state_of_the_union_04292...,True
8,How does the President's plan address societal...,[ workers they need and families don’t wait de...,The President's plan addresses societal issues...,multi_context,[{'source': 'Speeches/state_of_the_union_03012...,True
9,How will Biden address China's trade practices...,[ more than a million dollars a year and pay a...,Biden stated that he will defend America's int...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True


In [36]:
# testset_pd.to_csv('testset_flash_pro15.csv', index=False)

In [46]:
testset_pd = pd.read_csv("testset_flash_pro15.csv", index_col = None)

In [47]:
# use if imported testset_pd from csv
# this is a fix for 'contexts' column being saved as a string; needs to be a list
# Only string cells are parsed, so re-running this cell (or running it on a
# frame that was not loaded from CSV) is a no-op instead of a ValueError
# raised by literal_eval on an already-parsed list.
import ast
testset_pd['contexts'] = testset_pd['contexts'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [48]:
testset_pd

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What does the author argue is under attack reg...,[ workers they need and families don’t wait de...,The author argues that the constitutional righ...,simple,[{'source': 'Speeches/state_of_the_union_03012...,True
1,"In the speech from President Biden, why has ""t...",[ more than a million dollars a year and pay a...,"President Biden states that ""trickle-down econ...",simple,[{'source': 'Speeches/state_of_the_union_04292...,True
2,"What is the speaker's stance on the ""Buy Ameri...",[ right here in America where they belong!\n\n...,"The speaker believes in the ""Buy American"" pol...",simple,[{'source': 'Speeches/state_of_the_union_03072...,True
3,What measures does the speaker propose to lowe...,"[- a parent, a spouse, or child.\n\nAnd fourth...",The speaker proposes that Medicare should be g...,simple,[{'source': 'Speeches/state_of_the_union_04292...,True
4,What are President Biden's proposals for addre...,"[omes destroyed, neighborhoods in rubble, citi...",President Biden proposes a six-week ceasefire ...,simple,[{'source': 'Speeches/state_of_the_union_03072...,True
5,How does Biden compare a threat to American de...,"[’s argue over it, let’s debate it, but let’s ...",The context discusses the threat to American d...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True
6,What gun violence prevention actions did Presi...,[ job at another burger place to make a couple...,President Biden mentioned the need to ban assa...,reasoning,[{'source': 'Speeches/state_of_the_union_02072...,True
7,What economic policy did President Biden rejec...,"[- a parent, a spouse, or child.\n\nAnd fourth...",President Biden rejected trickle-down economic...,multi_context,[{'source': 'Speeches/state_of_the_union_04292...,True
8,How does the President's plan address societal...,[ workers they need and families don’t wait de...,The President's plan addresses societal issues...,multi_context,[{'source': 'Speeches/state_of_the_union_03012...,True
9,How will Biden address China's trade practices...,[ more than a million dollars a year and pay a...,Biden stated that he will defend America's int...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True


In [49]:
# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# Answer_relevancy - Measures how relevant the answer is to the question.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate
from datasets import Dataset

In [51]:
# per this ragas thread, need to convert pandas testset to Dataset format for evaluate to work
# https://github.com/explodinggradients/ragas/issues/803
testset_ds = Dataset.from_pandas(testset_pd)

In [52]:
testset_ds

Dataset({
    features: ['question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', 'episode_done'],
    num_rows: 10
})

In [54]:
# generate answer column, per these two issues
# https://github.com/explodinggradients/ragas/issues/1145
# https://github.com/explodinggradients/ragas/issues/1084#issuecomment-2248219601
query_engine = index.as_query_engine(similarity_top_k=10)
#query = "What has the President done related to healthcare?"
#response = query_engine.query(query)

In [None]:
answers = [query_engine.query(q) for q in testset_pd['question']]

In [79]:
# Split every query result into the two columns RAGAS needs:
# the generated answer text and the text of each retrieved source node.
answers_r = [result.response for result in answers]
context_n = [
    [hit.node.get_content() for hit in result.source_nodes]
    for result in answers
]

In [86]:
testset_pd = testset_pd.rename(columns={"contexts":"contexts_gt"})

In [88]:
testset_pd['contexts'] = context_n

In [89]:
testset_pd['answer'] = answers_r

In [129]:
#testset_pd.to_csv('testset_answer_newcontext_flash_pro15.csv', index=False)
testset_pd = pd.read_csv("testset_answer_newcontext_flash_pro15.csv", index_col = None)

In [144]:
# use if imported testset_pd from csv
# this is a fix for the 'contexts' and 'contexts_gt' columns being saved as strings; they need to be lists
# Only string cells are parsed, so re-running this cell after the columns have
# already been converted is a no-op instead of a ValueError from literal_eval.
import ast
for list_col in ('contexts', 'contexts_gt'):
    testset_pd[list_col] = testset_pd[list_col].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

In [153]:
testset_pd

Unnamed: 0,question,contexts_gt,ground_truth,evolution_type,metadata,episode_done,contexts,answer
0,What does the author argue is under attack reg...,[ workers they need and families don’t wait de...,The author argues that the constitutional righ...,simple,[{'source': 'Speeches/state_of_the_union_03012...,True,[The constitutional right affirmed in Roe v. W...,The author argues that the constitutional righ...
1,"In the speech from President Biden, why has ""t...",[ more than a million dollars a year and pay a...,"President Biden states that ""trickle-down econ...",simple,[{'source': 'Speeches/state_of_the_union_04292...,True,"[My fellow Americans, trickle-down — trickle-d...","The speaker believes that ""trickle-down econom..."
2,"What is the speaker's stance on the ""Buy Ameri...",[ right here in America where they belong!\n\n...,"The speaker believes in the ""Buy American"" pol...",simple,[{'source': 'Speeches/state_of_the_union_03072...,True,[Past administrations including my predecessor...,"The speaker strongly advocates for the ""Buy Am..."
3,What measures does the speaker propose to lowe...,"[- a parent, a spouse, or child.\n\nAnd fourth...",The speaker proposes that Medicare should be g...,simple,[{'source': 'Speeches/state_of_the_union_04292...,True,[Let’s do what we’ve always talked about for a...,The speaker proposes giving Medicare the power...
4,What are President Biden's proposals for addre...,"[omes destroyed, neighborhoods in rubble, citi...",President Biden proposes a six-week ceasefire ...,simple,[{'source': 'Speeches/state_of_the_union_03072...,True,"[As we look to the future, the only real solut...",The President proposes a two-state solution as...
5,How does Biden compare a threat to American de...,"[’s argue over it, let’s debate it, but let’s ...",The context discusses the threat to American d...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True,[January 6th and the lies about the 2020 elect...,Empty Response
6,What gun violence prevention actions did Presi...,[ job at another burger place to make a couple...,President Biden mentioned the need to ban assa...,reasoning,[{'source': 'Speeches/state_of_the_union_02072...,True,[And I will do everything in my power to prote...,President Biden called for Congress to pass un...
7,What economic policy did President Biden rejec...,"[- a parent, a spouse, or child.\n\nAnd fourth...",President Biden rejected trickle-down economic...,multi_context,[{'source': 'Speeches/state_of_the_union_04292...,True,"[My fellow Americans, trickle-down — trickle-d...",The speaker rejected trickle-down economics an...
8,How does the President's plan address societal...,[ workers they need and families don’t wait de...,The President's plan addresses societal issues...,multi_context,[{'source': 'Speeches/state_of_the_union_03012...,True,[My administration is providing assistance wit...,The President's plan addresses societal issues...
9,How will Biden address China's trade practices...,[ more than a million dollars a year and pay a...,Biden stated that he will defend America's int...,reasoning,[{'source': 'Speeches/state_of_the_union_04292...,True,[I also told President Xi that we’ll maintain ...,The speaker will address unfair trade practice...


In [154]:
testset_ds = Dataset.from_pandas(testset_pd.drop("contexts_gt", axis=1))

In [155]:
testset_ds

Dataset({
    features: ['question', 'ground_truth', 'evolution_type', 'metadata', 'episode_done', 'contexts', 'answer'],
    num_rows: 10
})

In [150]:
# Note: I'm using the normal LLM, not the RAG context-loaded LLM
# It appears that evaluate (below) re-runs the query and gives new answers and contexts anyway...
# Watch this issue: https://github.com/explodinggradients/ragas/issues/1211

ragas_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=10)
#embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [135]:
# Executor settings for the evaluation run (same values as for test-set generation):
# one worker, up to 10 retries, timeout=60 and max_wait=180.
# NOTE(review): per the ragas RunConfig docs, `timeout` is the per-call timeout
# in seconds and `max_wait` caps the wait between retries — so the timeout here
# is 60 s, not 180 s. Confirm for the installed ragas version.
run_config = RunConfig(timeout=60, max_wait=180, max_workers=1, max_retries=10)

In [None]:
# Score the answers in testset_ds on the four RAGAS metrics, passing ragas_llm
# and embeddings to ragas and throttling with run_config.
# Optional parameter:
# in_ci: bool, Whether the evaluation is running in CI or not.
# If set to True then some metrics will be run to increase the reproducibility of the evaluations.
# This will increase the runtime and cost of evaluations. Default is False.
evalresult = evaluate(
    testset_ds,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm = ragas_llm,
    embeddings=embeddings, 
    run_config=run_config
)

In [None]:
# Note: received warning for the answer where there was no response from the llm, definitely reduced faithfulness score

# Results:
# Using contexts generated when produced answers from LLM:
# new testset_answer_newcontext_flash_pro15.csv result, with the ground-truth contexts column (contexts_gt, formerly contexts_old) removed
# {'context_precision': 0.4171, 'faithfulness': 0.9167, 'answer_relevancy': 0.6509, 'context_recall': 0.8000}
# reran
# {'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}

# Compared to using contexts generated for ground truth (probably not correct):
# new testset_answer_newcontext_flash_pro15.csv result, using old contexts
# {'context_precision': 0.7500, 'faithfulness': 0.7392, 'answer_relevancy': 0.6041, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.8500, 'faithfulness': 0.7123, 'answer_relevancy': 0.5934, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.7500, 'faithfulness': 0.6556, 'answer_relevancy': 0.5638, 'context_recall': 1.0000}

In [None]:
# Evaluation results on metrics:

# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics
# I don't have example ranges to compare anything to, so below is my best guess.

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# 0.9167 - 1.0000 indicates that the LLM is staying true to the facts provided in the context for answering the question.
# There is another Faithfulness metric: from ragas.metrics import FaithulnesswithHHEM
# This uses a huggingface model to help detect hallucination : https://huggingface.co/vectara/hallucination_evaluation_model
# See below for code : {'faithfulness_with_hhem': 0.6319} 
# This doesn't really agree with the RAGAS faithfulness score... may need to dive in further another time.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# At 0.4171 - 0.4676, suggests that the context isn't particularly relevant to the question.
# Answer_relevancy - Measures how relevant the answer is to the question.
# 0.6509 - 0.6515 seems moderately low, just going off of the number.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.
# 0.8 indicates that the llm context is decently good and can typically answer the question or most of it. 

In [120]:
# Example result:
# {'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}
evalresult

{'context_precision': 0.4676, 'faithfulness': 1.0000, 'answer_relevancy': 0.6515, 'context_recall': 0.8000}

In [105]:
# Test:
# Compare against the contexts produced during synthetic test-set generation.
# In testset_pd that column is named 'contexts_gt'. The previous name used here,
# 'contexts_old', does not exist in the frame, so the rename was a silent no-op
# and the resulting dataset had no 'contexts' column at all.
# errors="raise" makes any future column-name mismatch fail loudly here.
testset_ds_oldcontext = Dataset.from_pandas(
    testset_pd
    .drop(columns=["contexts"])
    .rename(columns={"contexts_gt": "contexts"}, errors="raise")
)

In [112]:
ragas_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", timeout=10)

In [None]:
evalresult_old2 = evaluate(
    testset_ds_oldcontext,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm = ragas_llm,
    embeddings=embeddings, 
    run_config=run_config
)

In [None]:
# new testset_answer_newcontext_flash_pro15.csv result, using old contexts
# {'context_precision': 0.7500, 'faithfulness': 0.7392, 'answer_relevancy': 0.6041, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.8500, 'faithfulness': 0.7123, 'answer_relevancy': 0.5934, 'context_recall': 1.0000}
# reran:
# {'context_precision': 0.7500, 'faithfulness': 0.6556, 'answer_relevancy': 0.5638, 'context_recall': 1.0000}
evalresult_old2

In [156]:
from ragas.metrics import FaithulnesswithHHEM
faithfulness_with_hhem = FaithulnesswithHHEM()

You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


In [157]:
# Note: There's a message on HuggingFace about the "token indices sequence length" warning being normal and an artifact; thus, ignoring that warning
# https://huggingface.co/vectara/hallucination_evaluation_model
# NOTE(review): the run below also logs "IndexError(list index out of range)"
# for Job[5]. That is a separate failure from the token-length warning. It may
# correspond to the sample whose answer is "Empty Response" — TODO confirm, and
# check how the failed sample is treated in the averaged score.
result_faithfulness_hhem = evaluate(
    testset_ds,
    metrics=[faithfulness_with_hhem],
    llm = ragas_llm,
    embeddings=embeddings, 
    run_config=run_config
)

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[5]: IndexError(list index out of range)
Exception raised in Job[5]: IndexError(list index out of range)


Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


In [158]:
# with context from answer generation:
# {'faithfulness_with_hhem': 0.6319}
# testing: with context from ground truth/synthetic testset generation
# {'faithfulness_with_hhem': 0.5241}
# For the ground-truth contexts this points the same way as the RAGAS
# faithfulness scores recorded earlier (0.66-0.74): answers seem to be
# partially made up.
# For the answer-generation contexts it does NOT agree: RAGAS faithfulness was
# 0.92-1.00 there, versus 0.63 from HHEM.
result_faithfulness_hhem

{'faithfulness_with_hhem': 0.6319}

In [None]:
# RAGAS metrics guide: https://docs.ragas.io/en/latest/concepts/metrics/index.html#ragas-metrics

# Faithfulness - Measures the factual consistency of the answer to the context based on the question.
# Context_precision - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
# Answer_relevancy - Measures how relevant the answer is to the question.
# Context_recall - Measures the retriever’s ability to retrieve all necessary information required to answer the question.