In [None]:
import os
import logging
import sys
from IPython.display import display, Markdown
import pandas as pd
from typing import List
from pydantic import BaseModel, ConfigDict
import instructor
from datasets import Dataset

import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss

import deepeval
from deepeval.models import DeepEvalBaseLLM, DeepEvalBaseEmbeddingModel
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.synthesizer import Synthesizer
from deepeval import evaluate
from deepeval.evaluate import TestResult, print_test_result
from deepeval.metrics import (
    AnswerRelevancyMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric
)
from deepeval.metrics.ragas import (
    RagasMetric,
    RAGASAnswerRelevancyMetric,
    RAGASFaithfulnessMetric, 
    RAGASContextualRecallMetric,
    RAGASContextualPrecisionMetric,
    RAGASContextualRelevancyMetric
) 


In [2]:
# Environment variable to opt out of DeepEval telemetry tracking.
# NOTE(review): this assignment runs after `import deepeval` in the imports cell;
# if DeepEval reads the variable at import time it would have to be set before
# that import — confirm.
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"

In [3]:
# Check the opt-out status reported by DeepEval; the recorded output is True.
deepeval.telemetry_opt_out()

True

In [4]:
def to_markdown(text):
  """Render text as a Markdown block quote for notebook display.

  Bullet characters ('•') are replaced with an indented Markdown list marker
  and every line, including blank ones, is prefixed with '> '.

  Args:
    text: The plain-text string to format.

  Returns:
    An IPython.display.Markdown object wrapping the quoted text.
  """
  # Imported locally because the notebook's import cell never imports
  # textwrap; without this the function raised NameError on its first call.
  import textwrap

  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [5]:
# Configure the google.generativeai client with the API key read from the
# GOOGLE_API_KEY environment variable (a missing variable raises KeyError here).
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [6]:
# LangChain chat wrapper for Gemini 1.5 Flash; registered as Settings.llm further down.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [7]:
# Create the document database.
# Uses 4 State of the Union speeches; all text comes from the whitehouse.gov
# briefing-room speeches posted online, including a title with the date of the speech.
# Example from 2024:
# https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/03/07/remarks-of-president-joe-biden-state-of-the-union-address-as-prepared-for-delivery-2/
sotu = []
newfiles = [
    "./Speeches/titleedits/state_of_the_union_042921.txt",
    "./Speeches/titleedits/state_of_the_union_030122.txt",
    "./Speeches/titleedits/state_of_the_union_020723.txt",
    "./Speeches/titleedits/state_of_the_union_030724.txt",
]
for speech_path in newfiles:
    # The transcripts contain non-ASCII punctuation (e.g. curly apostrophes), so
    # decode explicitly as UTF-8 instead of relying on the platform default.
    with open(speech_path, encoding="utf-8") as speech_file:
        # Keep one entry per non-blank line, with trailing whitespace removed.
        sotu.extend(stripped for stripped in map(str.rstrip, speech_file) if stripped)

In [8]:
# Wrap every non-blank speech line in its own llama_index Document.
documents = [Document(text=line) for line in sotu]

In [9]:
# Inspect the first Document; in the recorded output it holds the title line of the first speech.
documents[0]

Document(id_='febd8912-b07e-4827-bea9-b0360604083f', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='State of the Union Address given by President Biden on April 29, 2021', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [7]:
# Set up the faiss index
# d is the dimensionality of the vectors stored in the index; it has to match the
# length of the embeddings that get added to it.
# NOTE(review): 768 is assumed to be the output size of the Google embedding model
# configured in the next cell — confirm.
d = 768
faiss_index = faiss.IndexFlatL2(d)
print(faiss_index.is_trained)

True


In [8]:
# Set up the embeddings and register both models as the llama_index global defaults.
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") # optional: task_type="RETRIEVAL_DOCUMENT"
Settings.embed_model = doc_embeddings
Settings.llm = llm

In [9]:
# Send INFO-level log records to stdout so they show up in the notebook output.
# basicConfig already attaches a stdout handler to the root logger; attaching a
# second StreamHandler on top of it made every record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [16]:
## Uncomment the lines below when the documents need to be re-embedded and re-indexed;
## otherwise the persisted index is loaded from disk further down.
#vector_store = FaissVectorStore(faiss_index=faiss_index)
#storage_context = StorageContext.from_defaults(vector_store=vector_store)
#index = VectorStoreIndex.from_documents(
#    documents, storage_context=storage_context, show_progress=True
#)


Parsing nodes:   0%|          | 0/1464 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1464 [00:00<?, ?it/s]

In [17]:
#index.index_id

'3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5'

In [18]:
## save index to disk
#index.storage_context.persist(persist_dir="./storage")
#index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7f128c3bdb10>

In [19]:
#index.index_id

'3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5'

In [None]:
# choose index for answer generation:
# index id '3d3c99c5-aa1c-42d7-a9ce-c4bb12fbc6d5' in persist_dir "./storage" uses all 4 of the speeches with a title that includes the date it was given
# index id 'c9cfc851-156f-41a3-96a9-ff1516a65a8e' in persist_dir "./storage2021" just uses the 2021 speech with a title that includes the date it was given
# index id '354f0f60-3eb3-46c1-a31a-b854a7f4536c' in persist_dir "./storage2022" just uses the 2022 speech with a title that includes the date it was given
# index id '48050557-05c2-4e05-a610-75aa414348a7' in persist_dir "./storage2023" just uses the 2023 speech with a title that includes the date it was given
# index id 'cab605dd-ff55-4514-9571-c682fc1fd4b2' in persist_dir "./storage2024" just uses the 2024 speech with a title that includes the date it was given

In [10]:
# Load a previously persisted index from disk.
# indexid and storagedir must refer to the same persisted index
# (see the id / directory list in the comment cell above).
indexid = 'cab605dd-ff55-4514-9571-c682fc1fd4b2'
storagedir = "./storage2024"
# Restore the FAISS vector store first, then build a storage context on the same directory.
vector_store = FaissVectorStore.from_persist_dir(storagedir)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir=storagedir
)

index = load_index_from_storage(storage_context=storage_context, index_id=indexid)

INFO:root:Loading llama_index.vector_stores.faiss.base from ./storage2024/default__vector_store.json.
Loading llama_index.vector_stores.faiss.base from ./storage2024/default__vector_store.json.
INFO:llama_index.core.indices.loading:Loading indices with ids: ['cab605dd-ff55-4514-9571-c682fc1fd4b2']
Loading indices with ids: ['cab605dd-ff55-4514-9571-c682fc1fd4b2']


In [11]:
# Set up query and chat engines; both are created with similarity_top_k=10.
query_engine = index.as_query_engine(similarity_top_k=10)
chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [12]:
# DeepEval requires a JSON response. In practice the LLM has still returned
# malformed JSON, even with a schema as simple as this one.
# (Documented with comments rather than a docstring: a docstring on a pydantic
# model is included in its generated JSON schema.)
class Response(BaseModel):
    # The single string field of the schema.
    response: str

In [13]:
# Non-OpenAI models require a custom LLM class for use with DeepEval
class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEval LLM wrapper around Gemini 1.5 Flash with schema-typed responses.

    Requests go through `instructor` in Gemini JSON mode, with the schema that
    is passed to `generate` used as the response model.
    """

    def __init__(self):
        # The original also assigned `model_config = ConfigDict(...)` here; as a
        # local variable it had no effect, so it has been dropped.
        self.model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

    def load_model(self):
        """Return the `genai.GenerativeModel` created in `__init__`."""
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send `prompt` as a single user message and return the parsed response.

        Args:
            prompt: Text sent as the content of the user message.
            schema: Pydantic schema forwarded to instructor as `response_model`.

        Returns:
            Whatever `instructor_client.messages.create` returns for that schema.
        """
        instructor_client = instructor.from_gemini(
            client=self.load_model(),
            mode=instructor.Mode.GEMINI_JSON,
        )
        return instructor_client.messages.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            response_model=schema,
        )

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous `generate`.

        The call is made directly (nothing is awaited), so it does not yield to
        the event loop while the request is in flight.
        """
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name of this model."""
        return "Gemini 1.5 Flash"

In [14]:
# Similarly, a custom embedding model class is required for non-OpenAI embeddings
class CustomGeminiEmbeddingModel(DeepEvalBaseEmbeddingModel):
    """DeepEval embedding wrapper around Google's `text-embedding-004` model."""

    def __init__(self):
        # Deliberately a no-op: as in the original, the base-class initializer is
        # not invoked. The original's only statement assigned an unused local
        # `model_config`, which had no effect and has been dropped.
        pass

    def load_model(self):
        """Create and return a new `GoogleGenerativeAIEmbeddings` client."""
        return GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004"
        )

    def embed_text(self, text: str) -> List[float]:
        """Embed a single string via the client's `embed_query`."""
        embedding_model = self.load_model()
        return embedding_model.embed_query(text)

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of strings via the client's `embed_documents`."""
        embedding_model = self.load_model()
        return embedding_model.embed_documents(texts)

    async def a_embed_text(self, text: str) -> List[float]:
        """Async counterpart of `embed_text`, awaiting `aembed_query`."""
        embedding_model = self.load_model()
        return await embedding_model.aembed_query(text)

    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Async counterpart of `embed_texts`, awaiting `aembed_documents`."""
        embedding_model = self.load_model()
        return await embedding_model.aembed_documents(texts)

    def get_model_name(self):
        """Return the display name of this embedding model."""
        # The original had the bare string without `return`, so callers got None.
        return "Custom Gemini Embeddings"

In [15]:
# Instantiate the custom DeepEval wrappers used by the synthesizer below.
custom_geminiflash = CustomGeminiFlash()
custom_geminiembeddings = CustomGeminiEmbeddingModel()

In [16]:
import time

0
1


In [20]:
# Generate a synthetic dataset of "Goldens" (aka a dataset with 'input', 'context', 'source_file' columns -- not 'Retrieval_Context') with DeepEval
# To build goldens for a different speech, swap the entry passed as document_paths for one of:
#   ['Speeches/titleedits/state_of_the_union_042921.txt']
#   ['Speeches/titleedits/state_of_the_union_030122.txt']
#   ['Speeches/titleedits/state_of_the_union_020723.txt']
for i in range(2):
    dataset = EvaluationDataset()
    synthesizer = Synthesizer(
        model=custom_geminiflash,
        embedder=custom_geminiembeddings,
    )
    dataset.generate_goldens_from_docs(
        document_paths=['Speeches/titleedits/state_of_the_union_030724.txt'],
        synthesizer=synthesizer,
        include_expected_output=True,
        max_goldens_per_document=5,
    )

    # Every pass saves its own CSV into the working directory.
    dataset.save_as(directory=".", file_type="csv")
    print(i)
    # Wait a little over a minute before the next pass
    # (presumably to stay within an API rate limit — confirm).
    time.sleep(61)

Output()

Evaluation dataset saved at ./20241108_164303.csv!
0


Output()

Evaluation dataset saved at ./20241108_164413.csv!
1


In [22]:
# after dataset is generated, need to generate the answer column

In [21]:
# The code below loads a generated test set that does not yet contain answers,
# so that answers can be generated for it.

testset_pd = pd.read_csv("./20241108_164413.csv", index_col = None)

In [None]:
# Preview the test set as loaded from the CSV.
testset_pd 

In [23]:
# Rename the DeepEval column names to the names used in the rest of this notebook.
testset_pd = testset_pd.rename(columns={"input": "Query", "actual_output": "Answer", "expected_output": "Expected_Output", "context": "Contexts", "source_file": "Source_File"})

In [24]:
# Preview the test set after the column rename; 'Answer' is still empty in the recorded output.
testset_pd

Unnamed: 0,Query,Answer,Expected_Output,Contexts,Source_File
0,Analyze the rhetorical strategies employed by ...,,The speaker employs several rhetorical strateg...,or raise the retirement age I will stop them!...,Speeches/titleedits/state_of_the_union_030724.txt
1,"How might these policies, if implemented, impa...",,"These policies, if implemented, could have a s...",or raise the retirement age I will stop them!...,Speeches/titleedits/state_of_the_union_030724.txt
2,How does the speaker balance a positive view o...,,The speaker balances a positive view of econom...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt
3,Considering the speaker's claim of a sharp dec...,,The speaker emphasizes a decrease in the overa...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt
4,Given the President's stated aim of ending the...,,The President aims to reduce prescription drug...,and biggest corporations no longer get all th...,Speeches/titleedits/state_of_the_union_030724.txt
...,...,...,...,...,...
195,Explain how the president's call for aid to Ga...,,The president's call for aid to Gaza is direct...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt
196,"If a ceasefire were to fail, how would the Pre...",,The President's proposal for a temporary pier ...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt
197,What steps is the US President taking to achie...,,The US President is working to achieve a cease...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt
198,Identify the central theme of the President's ...,,The President's opening remarks regarding repr...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt


In [None]:
# generate answer column, per these two issues
# https://github.com/explodinggradients/ragas/issues/1145
# https://github.com/explodinggradients/ragas/issues/1084#issuecomment-2248219601
#
# The queries are sent in small batches with a pause between batches.
# This cell is self-contained: it creates the accumulators itself and starts at
# row 0. Previously it appended to lists that were only created in a later cell
# and skipped the first ten queries.
# NOTE(review): the cells further down that re-create answers_r/context_n and
# append `answers` again pre-date this change; running them after this cell
# would discard or duplicate results.
BATCH_SIZE = 10
BATCH_PAUSE_SECONDS = 61  # presumably to stay under a per-minute API quota — confirm

# Built once: the engine does not change between batches.
query_engine = index.as_query_engine(similarity_top_k=10)

answers_r = []  # response text, one entry per query
context_n = []  # retrieved node contents, one list per query

queries = testset_pd['Query']
for j in range(0, len(queries), BATCH_SIZE):
    answers = [query_engine.query(q) for q in queries.iloc[j:j + BATCH_SIZE]]

    for answer in answers:
        answers_r.append(answer.response)
        context_n.append([c.node.get_content() for c in answer.source_nodes])

    print(j)
    # No need to wait once the final batch has been processed.
    if j + BATCH_SIZE < len(queries):
        time.sleep(BATCH_PAUSE_SECONDS)


In [29]:
# parse out new 'answer' and 'contexts' columns
# Accumulators for the generated answer texts and their retrieved contexts.
# NOTE(review): the batch loop in the cell above also writes to these lists;
# re-running this cell after that loop empties both of them.
answers_r = []
context_n = []

In [33]:
# Sanity check: number of collected answers (200 in the recorded output).
len(answers_r)

200

In [30]:

# Append the response text and the retrieved node contents for every result in `answers`.
# NOTE(review): `answers` is whichever batch was queried last, and this repeats
# the inner loop of the batch cell above — running both for the same batch adds
# duplicate entries. Confirm the intended execution order.
for i in answers:
    answers_r.append(i.response)
    context_n.append([c.node.get_content() for c in i.source_nodes])
  
#testset_pd = testset_pd.rename(columns={"Contexts":"Contexts_QueryGen"})


In [34]:
# Sanity check: number of collected context lists (200 in the recorded output).
len(context_n)

200

In [None]:
# Preview the test set before the answer and context columns are attached.
testset_pd

In [35]:
# Attach the retrieved contexts and the generated answers as columns.
# Both lists need exactly one entry per row of testset_pd.
testset_pd['Contexts_2024'] = context_n
testset_pd['Answer'] = answers_r

In [36]:
# Preview the test set with the 'Answer' and 'Contexts_2024' columns filled in.
testset_pd

Unnamed: 0,Query,Answer,Expected_Output,Contexts,Source_File,Contexts_2024
0,Analyze the rhetorical strategies employed by ...,The speaker uses powerful rhetoric to galvaniz...,The speaker employs several rhetorical strateg...,or raise the retirement age I will stop them!...,Speeches/titleedits/state_of_the_union_030724.txt,[My fellow Americans the issue facing our nati...
1,"How might these policies, if implemented, impa...",The policies described aim to shift the distri...,"These policies, if implemented, could have a s...",or raise the retirement age I will stop them!...,Speeches/titleedits/state_of_the_union_030724.txt,[America’s comeback is building a future of Am...
2,How does the speaker balance a positive view o...,The speaker acknowledges the significant econo...,The speaker balances a positive view of econom...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt,[Tonight I come to the same chamber to address...
3,Considering the speaker's claim of a sharp dec...,The speaker highlights a decrease in the overa...,The speaker emphasizes a decrease in the overa...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt,"[Last year, the murder rate saw the sharpest d..."
4,Given the President's stated aim of ending the...,The President's actions aim to reduce the cost...,The President aims to reduce prescription drug...,and biggest corporations no longer get all th...,Speeches/titleedits/state_of_the_union_030724.txt,[For years people have talked about it but I f...
...,...,...,...,...,...,...
195,Explain how the president's call for aid to Ga...,The president emphasizes the importance of inc...,The president's call for aid to Gaza is direct...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
196,"If a ceasefire were to fail, how would the Pre...",The President's proposed humanitarian efforts ...,The President's proposal for a temporary pier ...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
197,What steps is the US President taking to achie...,The US President is directing the military to ...,The US President is working to achieve a cease...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"[Tonight, I’m directing the U.S. military to l..."
198,Identify the central theme of the President's ...,The President strongly advocates for reproduct...,The President's opening remarks regarding repr...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt,"[Like most Americans, I believe Roe v. Wade go..."


In [39]:
# Persist the answered test set as a tab-separated file, without the index column.
testset_pd.to_csv('datasets/unlabeled_dataset/24_answers.tsv', index=False, sep="\t")

In [10]:
# combine unlabeled datasets
data21 = pd.read_csv('datasets/unlabeled_dataset/21_answers.csv', index_col=None)
data22_1 = pd.read_csv('datasets/unlabeled_dataset/22_answers.csv', index_col=None)
data22_2 = pd.read_csv('datasets/unlabeled_dataset/22_answers_last.csv', index_col=None)
data23 = pd.read_csv('datasets/unlabeled_dataset/23_answers.csv', index_col=None)
# The 2024 answers are saved earlier in this notebook as the tab-separated file
# '24_answers.tsv'; read that file with the matching separator instead of a
# '24_answers.csv' that no cell in this notebook writes.
data24 = pd.read_csv('datasets/unlabeled_dataset/24_answers.tsv', sep="\t", index_col=None)

In [13]:
# The 2022 answers are split across two files; stack them into one frame with a fresh index.
data22 = pd.concat([data22_1, data22_2], ignore_index=True)

In [17]:
# Give each year's retrieved-context column the same name so the frames can be stacked.
data21_new, data22_new, data23_new, data24_new = (
    frame.rename(columns={year_column: "Contexts_SourceFile"})
    for year_column, frame in (
        ("Contexts_2021", data21),
        ("Contexts_2022", data22),
        ("Contexts_2023", data23),
        ("Contexts_2024", data24),
    )
)

In [21]:
# Stack the four per-year frames into one dataset with a fresh 0..n-1 index.
unlabeled_data = pd.concat([data21_new, data22_new, data23_new, data24_new], ignore_index=True)

In [22]:
# Preview the combined dataset.
unlabeled_data

Unnamed: 0,Query,Answer,Expected_Output,Contexts,Source_File,Contexts_SourceFile
0,Identify specific examples of government inves...,The transcontinental railroad and the intersta...,The speech highlights several examples: the tr...,; discovering vaccines; gave us the Internet a...,Speeches/titleedits/state_of_the_union_042921.txt,"['Throughout our history, if you think about i..."
1,"Does the American Jobs Plan, a large-scale inv...",The plan seeks to create jobs by modernizing i...,The American Jobs Plan aims to create jobs by ...,; discovering vaccines; gave us the Internet a...,Speeches/titleedits/state_of_the_union_042921.txt,['The American Jobs Plan creates jobs replacin...
2,Considering the significant impact of cancer o...,Investing in cancer research is a priority bec...,Investing in cancer research is a priority bec...,"But so many of us have deceased sons, daughter...",Speeches/titleedits/state_of_the_union_042921.txt,"['But so many of us have deceased sons, daught..."
3,How does the President's viewpoint on infrastr...,The President believes that infrastructure inv...,The President emphasizes that infrastructure i...,"But so many of us have deceased sons, daughter...",Speeches/titleedits/state_of_the_union_042921.txt,"['Investments in jobs and infrastructure, like..."
4,Analyze the potential economic consequences of...,"A progressive tax structure, where higher earn...",The President advocates for raising taxes on c...,you should be able to become a billionaire an...,Speeches/titleedits/state_of_the_union_042921.txt,['When you hear someone say that they don’t wa...
...,...,...,...,...,...,...
795,Explain how the president's call for aid to Ga...,The president emphasizes the importance of inc...,The president's call for aid to Gaza is direct...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"['Tonight, I’m directing the U.S. military to ..."
796,"If a ceasefire were to fail, how would the Pre...",The President's proposed humanitarian efforts ...,The President's proposal for a temporary pier ...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"['Tonight, I’m directing the U.S. military to ..."
797,What steps is the US President taking to achie...,The US President is directing the military to ...,The US President is working to achieve a cease...,I say we must stop it. \n\nI’m proud we beat ...,Speeches/titleedits/state_of_the_union_030724.txt,"['Tonight, I’m directing the U.S. military to ..."
798,Identify the central theme of the President's ...,The President strongly advocates for reproduct...,The President's opening remarks regarding repr...,no place in America! \n\nHistory is watching....,Speeches/titleedits/state_of_the_union_030724.txt,"['Like most Americans, I believe Roe v. Wade g..."


In [24]:
# Write the combined dataset to CSV without the index column.
unlabeled_data.to_csv('datasets/unlabeled_dataset/unlabeled_dataset.csv', index=False)