In [1]:
import os
import sys
import argparse
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Make the parent of the current working directory importable; the
# `helper_functions` / `evaluation` imports in the next cell presumably live there.
# The membership check keeps this cell idempotent: re-running it no longer
# appends the same directory to sys.path again and again.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from helper_functions import *
from evaluation.rag_evaluation import *


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from helper_functions import *


In [4]:
# os.environ only accepts str values, so assigning os.getenv(...) directly
# raises an opaque "str expected, not NoneType" TypeError when the key is
# missing. Fail early with an actionable message instead.
if os.getenv('OPENAI_API_KEY') is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a .env "
        "file before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

In [5]:
class HyDERetriever:
    """Retriever that searches with an LLM-written hypothetical document (HyDE).

    The question is first turned into a hypothetical answer document by the
    LLM chain; that text, not the raw question, is then passed to the
    store's ``similarity_search``.
    """

    def __init__(self, files_path, chunk_size=500, chunk_overlap=100):
        """Create the LLM, the embeddings object, the store and the prompt chain.

        Args:
            files_path: Path forwarded to ``encode_pdf``.
            chunk_size: Forwarded to ``encode_pdf`` and also injected into the
                prompt as the requested length of the hypothetical document.
            chunk_overlap: Forwarded to ``encode_pdf``.
        """
        self.llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini", max_tokens=4000)
        self.embeddings = OpenAIEmbeddings()
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap
        # Whatever encode_pdf returns must expose similarity_search(text, k=...).
        self.vectorstore = encode_pdf(
            files_path,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        # The second template line keeps its leading spaces: they are part of
        # the prompt text sent to the model.
        hyde_template = """Given the question '{query}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
            The document size has to be exactly {chunk_size} characters."""
        self.hyde_prompt = PromptTemplate(
            input_variables=["query", "chunk_size"],
            template=hyde_template,
        )
        self.hyde_chain = self.hyde_prompt | self.llm

    def generate_hypothetical_document(self, query):
        """Invoke the prompt chain for ``query`` and return the response's ``content``."""
        response = self.hyde_chain.invoke({"query": query, "chunk_size": self.chunk_size})
        return response.content

    def retrieve(self, query, k=3):
        """Search the store using a hypothetical document generated for ``query``.

        Args:
            query: The user question.
            k: Number of results requested from ``similarity_search``.

        Returns:
            A ``(similar_docs, hypothetical_doc)`` tuple.
        """
        hyde_text = self.generate_hypothetical_document(query)
        matches = self.vectorstore.similarity_search(hyde_text, k=k)
        return matches, hyde_text


In [6]:
class ClimateChangeRAG:
    """Binds a single query to a ``HyDERetriever`` and prints what it retrieves."""

    def __init__(self, path, query):
        """Build a ``HyDERetriever`` for ``path`` (default chunking) and store ``query``."""
        self.retriever = HyDERetriever(path)
        self.query = query

    def run(self):
        """Run the retrieval for the stored query and display the outcome.

        Prints the hypothetical document (wrapped with ``text_wrap``) and then
        hands the retrieved page contents to ``show_context``. Returns nothing.
        """
        retrieved, hyde_text = self.retriever.retrieve(self.query)

        # Only the text of each retrieved document is displayed.
        contexts = [item.page_content for item in retrieved]

        print("Hypothetical document:\n")
        wrapped = text_wrap(hyde_text)
        print(wrapped + "\n")
        show_context(contexts)

In [9]:
def run_and_return_hyde_results(rag_obj):
    """
    Execute HyDE retrieval for ``rag_obj.query`` via ``rag_obj.retriever``,
    print the results, and hand them back to the caller.

    Returns a tuple of:
    - the hypothetical document text
    - the list of retrieved chunk texts (``page_content`` of each result)
    """
    question = rag_obj.query
    print(f"\n Query: {question}")

    retrieved, hyde_text = rag_obj.retriever.retrieve(question)
    chunk_texts = [item.page_content for item in retrieved]

    print("\n Hypothetical Document:")
    print(text_wrap(hyde_text))

    print("\n Retrieved Chunks:")
    # Chunks are numbered from 1 in the printed output.
    for number, text in enumerate(chunk_texts, start=1):
        print(f"\n--- Chunk {number} ---\n{text_wrap(text)}")

    return hyde_text, chunk_texts


In [10]:
def evaluate_hyde_retriever(pdf_path, query_answer_pairs, chunk_size=500, chunk_overlap=100, k=3):
    """
    Evaluates HyDE retriever on a list of (query, expected snippet) pairs.

    A query counts as a hit when the expected snippet occurs, case-insensitively,
    as a substring of the space-joined ``page_content`` of the ``k`` retrieved
    results.

    Args:
        pdf_path: Path forwarded to ``HyDERetriever``.
        query_answer_pairs: Iterable of ``(query, expected_snippet)`` string pairs.
        chunk_size: Chunk size forwarded to ``HyDERetriever``.
        chunk_overlap: Chunk overlap forwarded to ``HyDERetriever``.
        k: Number of results requested per query.

    Returns:
        list[bool]: one hit/miss flag per query, in input order. An empty
        input yields an empty list.
    """
    # Materialise once so that iterators/generators work and len() is valid.
    pairs = list(query_answer_pairs)
    if not pairs:
        # Nothing to score: skip building the retriever and avoid the
        # division by zero in the accuracy computation below.
        print("\n No queries provided; nothing to evaluate.")
        return []

    retriever = HyDERetriever(pdf_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    print(f"\n Running evaluation on {len(pairs)} queries...\n")

    hits = []
    for i, (query, expected_snippet) in enumerate(pairs, 1):
        print(f"\n Query {i}: {query}")
        # Reuse the retriever's own HyDE pipeline instead of re-implementing it here.
        results, _hypothetical_doc = retriever.retrieve(query, k=k)
        retrieved_text = " ".join(doc.page_content for doc in results).lower()

        match = expected_snippet.lower() in retrieved_text
        hits.append(match)

        print(f" Match found: {match}")
        print(f"Expected snippet: \"{expected_snippet}\"\n")

    accuracy = sum(hits) / len(hits)
    print(f"\n Final Accuracy: {accuracy * 100:.2f}%")

    return hits

In [13]:
query = "What is the main cause of climate change?"
# Resolve the sample PDF relative to the parent of the working directory (the
# same convention used for sys.path earlier in this notebook) instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes the notebook is run from a folder that sits next to `data/`.
pdf_path = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'data', 'Understanding_Climate_Change.pdf')
)
rag_runner = ClimateChangeRAG(pdf_path, query)
hypo_doc, retrieved_chunks = run_and_return_hyde_results(rag_runner)


 Query: What is the main cause of climate change?

 Hypothetical Document:
**The Main Cause of Climate Change**  Climate change primarily results from human activities, particularly the burning
of fossil fuels such as coal, oil, and natural gas. This process releases significant amounts of carbon dioxide (CO2)
and other greenhouse gases into the atmosphere, enhancing the greenhouse effect. Deforestation further exacerbates the
issue by reducing the number of trees that can absorb CO2. Additionally, industrial processes, agriculture, and waste
management contribute to emissions. Collectively, these factors disrupt the Earth's climate systems, leading to global
warming and associated environmental impacts.

 Retrieved Chunks:

--- Chunk 1 ---
predict future trends. The evidence overwhelmingly shows that recent changes are primarily  driven by human activities,
particularly the emission of greenhouse gases.  Chapter 2: Causes of Climate Change  Greenhouse Gases  The primary cause
of rece

In [15]:
# Each pair is (question, snippet expected to appear in the retrieved chunks).
query_answer_pairs = [
    ("What is the main cause of climate change?", "burning of fossil fuels"),
    ("What are the effects of climate change?", "rising sea levels"),
    ("What are greenhouse gases?", "carbon dioxide and methane"),
]

# Resolve the sample PDF relative to the parent of the working directory
# instead of a machine-specific absolute path, so the notebook is portable.
# NOTE(review): assumes the notebook is run from a folder that sits next to `data/`.
pdf_path = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'data', 'Understanding_Climate_Change.pdf')
)

hits = evaluate_hyde_retriever(
    pdf_path=pdf_path,
    query_answer_pairs=query_answer_pairs,
    chunk_size=500,
    chunk_overlap=100,
    k=3
)


 Running evaluation on 3 queries...


 Query 1: What is the main cause of climate change?
 Match found: True
Expected snippet: "burning of fossil fuels"


 Query 2: What are the effects of climate change?
 Match found: False
Expected snippet: "rising sea levels"


 Query 3: What are greenhouse gases?
 Match found: False
Expected snippet: "carbon dioxide and methane"


 Final Accuracy: 33.33%
