### Hypothetical Prompt Embeddings (HyPE)

##### Overview
This code implements a Retrieval-Augmented Generation (RAG) system enhanced by Hypothetical Prompt Embeddings (HyPE). Unlike traditional RAG pipelines that struggle with query-document style mismatch, HyPE precomputes hypothetical questions during the indexing phase. This transforms retrieval into a question-question matching problem, eliminating the need for expensive runtime query expansion techniques.

In [12]:
import os
import sys
from dotenv import load_dotenv
load_dotenv()
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from utility import encode_pdf, show_context, retrieve_context_per_question
from langchain_core.output_parsers import StrOutputParser
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_community.docstore.in_memory import InMemoryDocstore
from tqdm import tqdm
from langchain.vectorstores import Chroma, FAISS
import faiss
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from utility import replace_t_with_space

In [3]:
# PDF document that will be chunked and indexed by the retriever.
path = "data/Understanding_Climate_Change.pdf"
# Groq API key taken from the process environment; None when the variable is unset.
groq_api_key = os.environ.get("GROQ_API_KEY")

In [23]:
class HyPERetriever:
    """Build a FAISS index keyed by hypothetical questions (HyPE).

    For every chunk of a PDF, an LLM generates questions the chunk answers.
    Each question is embedded and stored in the index with the chunk's text
    as its document, so a user query is matched against questions rather
    than against the raw chunk text.
    """

    def __init__(self,file_path,chunk_size=1000,chunk_overlap=200):
        """
        Set up the LLM, the embedding model and the question-generation chain.

        Parameters:
        file_path (str): Default PDF path used by `encode_pdf` when it is
            called without an explicit path.
        chunk_size (int): Maximum chunk length passed to the text splitter.
        chunk_overlap (int): Overlap between consecutive chunks passed to
            the text splitter.
        """
        self.file_path = file_path
        # NOTE: relies on the module-level `groq_api_key` being defined
        # before the class is instantiated.
        self.llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.1-8b-instant")
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        
        self.question_gen_prompt = PromptTemplate(
            template = """Analyze the input text and generate essential questions that, when answered, 
        capture the main points of the text. Each question should be one line, 
        without numbering or prefixes.\n\n 
        Text:\n{chunk_text}\n\nQuestions:\n""",
            input_variables=['chunk_text']
        )

        self.question_chain = self.question_gen_prompt | self.llm | StrOutputParser()
    
    def generate_questions(self,chunk_text):
        """
        Uses the LLM to generate multiple hypothetical questions for a single chunk.
        These questions will be used as 'proxies' for the chunk during retrieval.

        Parameters:
        chunk_text (str | Document): The chunk, either as plain text or as an
            object exposing a `page_content` attribute.

        Returns:
        chunk_text: The chunk exactly as it was passed in. Returned alongside
            the vectors so results can be paired up when run from a thread pool.
        hypothetical prompt embeddings (List[List[float]]): One embedding
            vector per generated question; empty when the LLM produced no
            usable question.
        """
        # Feed the LLM the chunk's text, not the string form of a Document
        # object (which would also include its metadata).
        text = getattr(chunk_text, "page_content", chunk_text)
        raw_response = self.question_chain.invoke({'chunk_text': text})

        # One question per line; drop blank / whitespace-only lines so they are
        # not embedded and indexed as meaningless proxy vectors.
        questions = [line.strip() for line in raw_response.splitlines() if line.strip()]
        if not questions:
            return chunk_text, []
        return chunk_text, self.embeddings.embed_documents(questions)
    
    def create_vectorstore(self,chunks):
        """
        Creates and populates a FAISS vector store from a list of text chunks.

        This function processes the chunks in parallel, generating
        hypothetical prompt embeddings for each chunk.
        The embeddings are stored in a FAISS index for efficient similarity search.

        Parameters:
        chunks (List[Document] | List[str]): The chunks to be embedded and
            stored. Items may be plain strings or objects exposing
            `page_content`.

        Returns:
        FAISS | None: A FAISS vector store containing one entry per generated
            question, or None when no chunk produced any embedding.
        """
        vector_store = None

        with ThreadPoolExecutor() as pool:
            # Use threading to speed up the generation of prompt embeddings
            futures = [pool.submit(self.generate_questions, c) for c in chunks]

            # Process embeddings as they complete; one future per chunk, so the
            # progress bar total is the number of futures.
            for future in tqdm(as_completed(futures), total=len(futures)):
                chunk, vectors = future.result()

                # A chunk without questions contributes nothing and cannot be
                # used to size the index.
                if not vectors:
                    continue

                # Initialize the FAISS store lazily: the index dimension is
                # only known once the first embedding vector is available.
                if vector_store is None:
                    vector_store = FAISS(
                        embedding_function=self.embeddings,  # embedding model used for queries
                        index=faiss.IndexFlatL2(len(vectors[0])),  # L2 index for similarity search
                        docstore=InMemoryDocstore(),  # in-memory document storage
                        index_to_docstore_id={},  # index -> document id mapping
                    )

                # Pair the chunk's content with each generated embedding vector.
                # Each chunk is inserted multiple times, once per question vector.
                content = getattr(chunk, "page_content", chunk)
                vector_store.add_embeddings([(content, vec) for vec in vectors])
        return vector_store

    def encode_pdf(self,path=None,max_chunks=20):
        """
        Encodes a PDF into a HyPE vector store.

        The PDF is loaded, split with the instance's `chunk_size` and
        `chunk_overlap`, cleaned with `replace_t_with_space`, and indexed
        via `create_vectorstore`.

        Args:
            path: The path to the PDF file. Defaults to the `file_path`
                given to the constructor.
            max_chunks: Only the first `max_chunks` chunks are indexed
                (default 20, the limit previously hard-coded here). Pass
                None to index every chunk.

        Returns:
            A FAISS vector store containing the encoded content, or None if
            nothing could be indexed.
        """
        if path is None:
            path = self.file_path

        # Load the PDF file
        loader = PyPDFLoader(path)
        docs = loader.load()

        # Split the documents into chunks
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len
        )

        texts = splitter.split_documents(docs)
        # Cap the number of chunks sent to the LLM; a None limit keeps them all.
        cleaned_texts = replace_t_with_space(texts[:max_chunks])

        # Create vector store
        return self.create_vectorstore(cleaned_texts)

    def retriever(self,vectorstore):
        """
        Wrap a vector store in a retriever returning the top 3 matches.

        Parameters:
        vectorstore: A vector store exposing `as_retriever`.

        Returns:
        The retriever produced by `vectorstore.as_retriever` with k=3.
        """
        return vectorstore.as_retriever(search_kwargs={'k':3})


#### Test Retriever

In [25]:
#ret_object = HyPERetriever(path)
#vector_store = ret_object.encode_pdf(path)
#ret = ret_object.retriever(vector_store)
#test_query = "What is the main cause of climate change?"
#context = retrieve_context_per_question(test_query,ret)
#show_context(context)