# **Project: Research Paper Answer Bot**

## Installing all the dependencies

In [None]:
!pip install langchain==0.2.0
!pip install langchain-openai==0.1.7
!pip install langchain-community==0.2.0
!pip install langgraph==0.1.1

# takes 2 - 5 mins to install on Colab
!pip install "unstructured[all-docs]==0.14.0"

!pip install jq==1.7.0
!pip install pypdf==4.2.0
!pip install pymupdf==1.24.4

!pip install langchain-text-splitters==0.2.0
!pip install tiktoken==0.7.0
!pip install spacy
!pip install sentence-transformers==2.7.0

!pip install langchain-huggingface==0.0.1

!pip install langchain-chroma

## Set Up API Keys and Environment Variables

In [None]:
from getpass import getpass

OPENAI_KEY = getpass('Enter Open AI API Key: ')

In [None]:
HUGGINGFACEHUB_API_TOKEN = getpass('Enter HuggingFace Auth Token Key: ')

In [None]:
TAVILY_API_KEY = getpass('Enter Tavily Search API Key: ')

In [None]:
import os

os.environ['OPENAI_API_KEY'] = OPENAI_KEY
os.environ['HUGGINGFACEHUB_API_TOKEN'] = HUGGINGFACEHUB_API_TOKEN
os.environ['TAVILY_API_KEY'] = TAVILY_API_KEY

## Load Connection to LLM

In [None]:
from langchain_openai import ChatOpenAI

chatgpt = ChatOpenAI(model_name='gpt-4o', temperature=0)

## **Implementing Compulsory Goals**

### Load the files and set up the vector database

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader

# Define a dictionary to map file extensions to their respective loaders
loaders = {
    '.pdf': (PyMuPDFLoader, {}),
    # '.docx': (UnstructuredWordDocumentLoader, {'strategy': 'fast',
    #                                           'chunking_strategy' : 'by_title',
    #                                           'max_characters' : 3000, # max limit of a document chunk
    #                                           'new_after_n_chars' : 2500, # preferred document chunk size
    #                                           'mode' : 'elements'
    #                                           })
}

In [None]:
from langchain_community.document_loaders import DirectoryLoader

# Define a function to create a DirectoryLoader for a specific file type
def create_directory_loader(file_type, directory_path):
    """Build a DirectoryLoader for every file under a directory with a given suffix.

    Args:
        file_type: Key into the module-level `loaders` mapping (e.g. '.pdf').
            It is also used as the suffix of the recursive glob pattern.
        directory_path: Root directory passed to the loader as `path`.

    Returns:
        A DirectoryLoader configured with the loader class and loader kwargs
        registered for `file_type`, with progress display enabled.

    Raises:
        KeyError: If `file_type` has no entry in `loaders`.
    """
    # Each registry entry is indexed as (loader class, loader kwargs)
    registry_entry = loaders[file_type]
    loader_options = {
        "path": directory_path,
        "glob": f"**/*{file_type}",
        "loader_cls": registry_entry[0],
        "loader_kwargs": registry_entry[1],
        "show_progress": True,
    }
    return DirectoryLoader(**loader_options)

# Create DirectoryLoader instances for each file type
pdf_loader = create_directory_loader('.pdf', '/content/drive/MyDrive/Agents/Capstone Project/pinnacle_capstone_data')
# docx_loader = create_directory_loader('.docx', './')

# Load the files
pdf_documents = pdf_loader.load()
# docx_documents = docx_loader.load()

In [None]:
len(pdf_documents)

In [None]:
pdf_documents[18]

In [None]:
type(pdf_documents)

In [None]:
docs = pdf_documents

**Create LangChain Documents**

In [None]:
from langchain.docstore.document import Document

docs = [Document(page_content=doc.page_content,
                 metadata=doc.metadata) for doc in docs]

In [None]:
docs[:3]

In [None]:
len(docs)

**Split larger documents into smaller chunks**

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)
chunked_docs = splitter.split_documents(docs)

In [None]:
chunked_docs[:3]

In [None]:
len(chunked_docs)

**Experiment with different embedding models**

**OpenAI embeddings**

In [None]:
from langchain_openai import OpenAIEmbeddings

# details here: https://openai.com/blog/new-embedding-models-and-api-updates
openai_embed_model = OpenAIEmbeddings(model='text-embedding-3-small')

In [None]:
# Pull the raw text out of every chunk; the loop variable is named differently
# from the list so it cannot be confused with `chunked_docs` itself
chunked_docs_texts = [chunk.page_content for chunk in chunked_docs]

# Embed all chunk texts with the OpenAI embedding model in a single call
embeddings = openai_embed_model.embed_documents(chunked_docs_texts)

In [None]:
len(embeddings)

In [None]:
len(embeddings[10])

In [None]:
print(embeddings[10])

**Open Source Embedding Models on HuggingFace**


In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# check out model details here: https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
model_name = "mixedbread-ai/mxbai-embed-large-v1"

hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
)

In [None]:
embeddings = hf_embeddings.embed_documents(chunked_docs_texts)

In [None]:
len(embeddings)

In [None]:
len(embeddings[0])

In [None]:
print(embeddings[0])

**Create a Vector DB and persist on disk**


In [None]:
!rm -rf "/content/research_papers_db" #replace path to db

In [None]:
from langchain_chroma import Chroma

# create vector DB of docs and embeddings - takes < 30s on Colab
chroma_db = Chroma.from_documents(documents=chunked_docs,
                                  collection_name='research_papers_chroma_db',
                                  embedding=openai_embed_model,
                                  # need to set the distance function to cosine else it uses euclidean by default
                                  # check https://docs.trychroma.com/guides#changing-the-distance-function
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="/content/research_papers_db")

**Load Vector DB from disk**



In [None]:
# load from disk
research_papers_chroma_db = Chroma(persist_directory="/content/research_papers_db",
                   collection_name='research_papers_chroma_db',
                   embedding_function=openai_embed_model)

In [None]:
research_papers_chroma_db

### Experiment with different retrieval strategies

**ContextualCompressionRetriever**

In [None]:
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers import ContextualCompressionRetriever

In [None]:
# simple cosine distance based retriever
similarity_retriever = research_papers_chroma_db.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 3})

#  decides which of the initially retrieved documents to filter out and which ones to return
_filter = LLMChainFilter.from_llm(llm=chatgpt)

# retrieves the documents similar to query and then applies the filter
compression_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=similarity_retriever
)

In [None]:
query = "What attention mechanisms were used in the Attention paper?"
docs = compression_retriever.invoke(query)
docs

**MultiQueryRetriever**

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever
# Set logging for the queries
import logging

similarity_retriever = research_papers_chroma_db.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 3})

mq_retriever = MultiQueryRetriever.from_llm(
    retriever=similarity_retriever, llm=chatgpt
)

logging.basicConfig()
# so we can see what queries are generated by the LLM
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
query = "What attention mechanisms were used in the Attention paper?"
docs = mq_retriever.invoke(query)
docs

**Chained Retrieval Pipeline**

In [None]:
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker

# Retriever 1 - similarity search over the Chroma collection; returns the top 5 candidate chunks
similarity_retriever = research_papers_chroma_db.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 5})

# LLM-based filter: decides which of the initially retrieved documents to drop and which to return
# (LLMChainFilter and ContextualCompressionRetriever are imported in an earlier cell)
_filter = LLMChainFilter.from_llm(llm=chatgpt)
# Retriever 2 - runs Retriever 1 for the query and then applies the LLM filter to its results
compressor_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=similarity_retriever
)

# load an open-source cross-encoder reranker model - BAAI/bge-reranker-large
reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-large")
# keep only the 3 highest-ranked documents after reranking
reranker_compressor = CrossEncoderReranker(model=reranker, top_n=3)
# Retriever 3 - reranks the output of Retriever 2; this is the retriever used by the rest of the notebook
final_retriever = ContextualCompressionRetriever(
    base_compressor=reranker_compressor, base_retriever=compressor_retriever
)

In [None]:
query = "What attention mechanisms were used in the Attention paper?"
docs = final_retriever.invoke(query)
docs

### QA RAG System


In [None]:
from langchain_core.prompts import ChatPromptTemplate

prompt = """You are an assistant for question-answering tasks.
            Use the following pieces of retrieved context to answer the question.
            If no context is present or if you don't know the answer, just say that you don't know.
            Do not make up the answer unless it is there in the provided context.
            Give a detailed answer with regard to the question.

            Question:
            {question}

            Context:
            {context}

            Answer:
         """

prompt_template = ChatPromptTemplate.from_template(prompt)

In [None]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the `page_content` of each document into one string, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        str: The concatenated text; an empty string when `docs` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

qa_rag_chain = (
    {
        "context": (final_retriever
                      |
                    format_docs),
        "question": RunnablePassthrough()
    }
      |
    prompt_template
      |
    chatgpt
)

In [None]:
from IPython.display import Markdown, display

# Helper function to display answer and sources
def display_answer_with_sources(query):
    """Answer `query` with the RAG pipeline and render the answer and its sources.

    The retriever is invoked exactly once. The same documents are used as the
    LLM context and as the displayed sources, so the sources shown are the ones
    the answer was generated from, and the retrieval pipeline (LLM filter plus
    reranker) is not paid for twice.

    Args:
        query: The user question as a string.

    Returns:
        None. The answer and sources are rendered with IPython `display`.
    """
    # Retrieve once and keep at most the top 3 context documents (the sources)
    top_docs = final_retriever.invoke(query)[:3]  # Limit to top 3
    sources = format_docs(top_docs)

    # Generate the answer from exactly the documents that will be shown below
    result = (prompt_template | chatgpt).invoke(
        {"context": sources, "question": query}
    )

    # Display the generated answer
    display(Markdown(f"### Answer:\n{result.content}"))

    # Display the sources
    display(Markdown("### Sources (Top 3 Retrieved Documents):"))
    display(Markdown(sources))

# Test query: attention mechanisms in the Attention paper
query = "What attention mechanisms were used in the Attention paper?"
display_answer_with_sources(query)

In [None]:
query = "What novel approaches did the Gemini paper introduce in LLM training?"
display_answer_with_sources(query)

In [None]:
query = "What datasets were used for training in the GPT-4 paper?"
display_answer_with_sources(query)

## **Implementing Stretch Goal: Advanced Option 3**

### Create a Query Retrieval Grader

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI


# Data model for LLM output format
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""
    # NOTE(review): this schema is passed to `llm.with_structured_output(...)` below, so the
    # docstring above and the Field description are presumably forwarded to the model as
    # schema text - treat both as prompt text and edit with care (confirm against the
    # langchain-openai docs).
    # The field is a plain `str`: the description asks for 'yes' or 'no', but nothing in
    # this model enforces those two values or their casing.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# LLM for grading
llm = chatgpt
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt template for grading
SYS_PROMPT = """You are an expert grader assessing relevance of a retrieved document to a user question.
                Follow these instructions for grading:
                  - If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant.
                  - Your grade should be either 'yes' or 'no' to indicate whether the document is relevant to the question or not.
             """
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYS_PROMPT),
        ("human", """Retrieved document:
                     {document}

                     User question:
                     {question}
                  """),
    ]
)

# Build grader chain
doc_grader = (grade_prompt
                  |
              structured_llm_grader)

In [None]:
query = "What attention mechanisms were used in the Attention paper?"
top3_docs = final_retriever.invoke(query)
for doc in top3_docs:
    print(doc.page_content)
    print('GRADE:', doc_grader.invoke({"question": query, "document": doc.page_content}))
    print()

### Build a QA RAG Chain

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

prompt = """You are an assistant for question-answering tasks.
            Use the following pieces of retrieved context to answer the question.
            If no context is present or if you don't know the answer, just say that you don't know the answer.
            Do not make up the answer unless it is there in the provided context.
            However, if there are any web search results, always consider them in your response.
            Give a detailed answer and to the point answer with regard to the question.

            Question:
            {question}

            Context:
            {context}

            Answer:
         """
prompt_template = ChatPromptTemplate.from_template(prompt)

def format_docs(docs):
    """Concatenate document texts into a single context string.

    This redefines the helper of the same name from the earlier QA RAG section
    with the same behavior.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        str: The `page_content` values joined by a blank line ("" for no docs).
    """
    separator = "\n\n"
    return separator.join([doc.page_content for doc in docs])

qa_rag_chain = (
    {
        "context": (itemgetter('context')
                        |
                    RunnableLambda(format_docs)),
        "question": itemgetter('question')
    }
      |
    prompt_template
      |
    chatgpt
      |
    StrOutputParser()
)

In [None]:
query = "What attention mechanisms were used in the Attention paper?"
top3_docs = final_retriever.invoke(query)
result = qa_rag_chain.invoke(
    {"context": top3_docs, "question": query}
)
print(result)

### Create a Query Rephraser

In [None]:
# Prompt template for rewriting
SYS_PROMPT = """Act as a question re-writer and perform the following task:
                 - Convert the following input question to a better version that is optimized for web search.
                 - When re-writing, look at the input question and try to reason about the underlying semantic intent / meaning.
             """
re_write_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYS_PROMPT),
        ("human", """Here is the initial question:
                     {question}

                     Formulate an improved question.
                  """,
        ),
    ]
)

question_rewriter = (re_write_prompt
                        |
                       chatgpt
                        |
                     StrOutputParser())

In [None]:
query = "What attention mechanisms were used in the Attention paper?"
question_rewriter.invoke({"question": query})

### Load Web Search Tool

In [None]:
from langchain_community.tools.tavily_search import TavilySearchResults

tv_search = TavilySearchResults(max_results=3, search_depth='advanced',
                                max_tokens=10000)

### Build Agentic RAG components

Here we will build the key components of our Agentic Corrective RAG System as per the workflow below:

![](https://i.imgur.com/uhybMhT.png)



### Graph State


In [None]:
from typing import List
from typing_extensions import TypedDict

class GraphState(TypedDict):
    """
    State dictionary passed between the nodes of the agent graph.

    Attributes:
        question: the user question; replaced by the re-written question when the rewrite_query node runs
        generation: LLM response generation, set by the generate_answer node
        web_search_needed: flag of whether to add web search - "Yes" or "No", set by the grade_documents node
        documents: list of context documents
    """

    question: str
    generation: str
    web_search_needed: str
    # NOTE(review): annotated as List[str], but the nodes in this notebook store objects
    # exposing `.page_content` here (retriever results and Document instances built from
    # web search hits) - consider tightening the annotation.
    documents: List[str]

### Retrieve function for retrieval from Vector DB

In [None]:
def retrieve(state):
    """
    Fetch context documents for the current question from the vector DB retriever.

    Args:
        state (dict): The current graph state; must contain "question"

    Returns:
        dict: "documents" holding the result of final_retriever.invoke(question),
              plus the unchanged "question"
    """
    print("---RETRIEVAL FROM VECTOR DB---")
    user_question = state["question"]

    # Delegate to the chained retriever defined earlier in the notebook
    retrieved = final_retriever.invoke(user_question)
    return dict(documents=retrieved, question=user_question)

### Grade documents

In [None]:
def grade_documents(state):
    """
    Grades each retrieved document for relevance to the question with the LLM
    grader and keeps only the documents graded relevant.

    Routing flag produced by this node:
      - web_search_needed is "No" when at least one document is graded relevant
      - web_search_needed is "Yes" when no document is graded relevant or when
        no documents were retrieved

    NOTE(review): the previous description of this node said a web search is
    needed if *any* document is irrelevant, while the code only requests it when
    *no* document is relevant. The routing behaviour is left unchanged here -
    confirm which rule is intended.

    Args:
        state (dict): The current graph state; must contain "question" and "documents"

    Returns:
        state (dict): "documents" with only the relevant documents, the unchanged
                      "question", and the "web_search_needed" flag ("Yes" / "No")
    """

    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["documents"]

    # Score each doc
    filtered_docs = []
    if documents:
        for d in documents:
            score = doc_grader.invoke(
                {"question": question, "document": d.page_content}
            )
            # The grade is free-form text from the LLM, so normalise case and
            # surrounding whitespace before comparing ("Yes", "yes " -> "yes")
            grade = score.binary_score.strip().lower()
            if grade == "yes":
                print("---GRADE: DOCUMENT RELEVANT---")
                filtered_docs.append(d)
            else:
                print("---GRADE: DOCUMENT NOT RELEVANT---")
    else:
        print("---NO DOCUMENTS RETRIEVED---")

    # Web search is skipped only if at least one relevant document survived
    web_search_needed = "No" if filtered_docs else "Yes"

    return {"documents": filtered_docs, "question": question, "web_search_needed": web_search_needed}

### Rewrite query

In [None]:
def rewrite_query(state):
    """
    Replace the question in the state with a re-written version from the
    question re-writer chain.

    Args:
        state (dict): The current graph state; must contain "question" and "documents"

    Returns:
        state (dict): The unchanged "documents" and the re-written "question"
    """

    print("---REWRITE QUERY---")
    original_question = state["question"]
    current_docs = state["documents"]

    # Ask the re-writer chain for an improved question
    rewritten = question_rewriter.invoke({"question": original_question})
    return dict(documents=current_docs, question=rewritten)

### Web Search

In [None]:
from langchain.schema import Document

def web_search(state):
    """
    Web search based on the re-written question.

    Each search hit is wrapped in a Document and appended after the documents
    already in the state. The incoming list is copied first, so the list object
    held by the caller's state is never mutated in place.

    Args:
        state (dict): The current graph state; must contain "question" and may
                      contain "documents"

    Returns:
        state (dict): "documents" with the web results appended, and the
                      unchanged "question"
    """

    print("---WEB SEARCH---")
    question = state["question"]
    # Work on a copy: the node returns its update instead of mutating shared state
    documents = list(state.get("documents") or [])

    # Web search
    results = tv_search.invoke(question)
    print("---WEB SEARCH RESULTS---")
    if not isinstance(results, list):
        # NOTE(review): the search tool is expected to return a list of dicts with a
        # "content" key; on failure it may hand back something else (e.g. an error
        # message string) - confirm against the tool's docs. Treat that as "no hits"
        # instead of crashing on results[...]["content"].
        print(f"---WEB SEARCH RETURNED NO USABLE RESULTS: {results!r}---")
        results = []

    for hit in results:
        content = hit["content"]
        print(content)
        documents.append(Document(page_content=content))

    return {"documents": documents, "question": question}

### Generate Answer

In [None]:
def generate_answer(state):
    """
    Produce the final answer for the question in the state.

    When context documents are present they are passed, together with the
    question, to the QA RAG chain; otherwise a fixed "don't know" message is
    returned without calling the chain.

    Args:
        state (dict): The current graph state; must contain "question" and "documents"

    Returns:
        state (dict): The unchanged "documents" and "question", plus "generation"
                      holding the answer text
    """
    print("---GENERATE ANSWER---")
    user_question = state["question"]
    context_docs = state["documents"]

    if not context_docs:
        # Nothing to ground an answer on: skip the LLM call entirely
        answer = "I don't know the answer. The context provided does not contain information to answer the question."
    else:
        chain_input = {"context": context_docs, "question": user_question}
        answer = qa_rag_chain.invoke(chain_input)

    return {"documents": context_docs, "question": user_question, "generation": answer}

### Decide to Generate

In [None]:
def decide_to_generate(state):
    """
    Pick the next node based on the web_search_needed flag in the state.

    Args:
        state (dict): The current graph state; must contain "web_search_needed"

    Returns:
        str: "rewrite_query" when the flag equals "Yes", otherwise "generate_answer"
    """

    print("---ASSESS GRADED DOCUMENTS---")

    if state["web_search_needed"] != "Yes":
        # No web search requested, so answer from the documents already in the state
        print("---DECISION: GENERATE RESPONSE---")
        return "generate_answer"

    # Web search requested: rewrite the query first
    print("---DECISION: SOME or ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, REWRITE QUERY---")
    return "rewrite_query"

### Build the Agent Graph

In [None]:
from langgraph.graph import END, StateGraph

# Graph builder: nodes and edges are registered on this object before compilation.
# It has its own name so that `agentic_rag` only ever refers to the compiled graph.
graph_builder = StateGraph(GraphState)

# Define the nodes
graph_builder.add_node("retrieve", retrieve)  # retrieve
graph_builder.add_node("grade_documents", grade_documents)  # grade documents
graph_builder.add_node("rewrite_query", rewrite_query)  # transform_query
graph_builder.add_node("web_search", web_search)  # web search
graph_builder.add_node("generate_answer", generate_answer)  # generate answer

# Build graph: retrieve -> grade_documents -> (generate_answer | rewrite_query -> web_search -> generate_answer)
graph_builder.set_entry_point("retrieve")
graph_builder.add_edge("retrieve", "grade_documents")
# decide_to_generate returns the name of the next node; the mapping routes it
graph_builder.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {"rewrite_query": "rewrite_query", "generate_answer": "generate_answer"},
)
graph_builder.add_edge("rewrite_query", "web_search")
graph_builder.add_edge("web_search", "generate_answer")
graph_builder.add_edge("generate_answer", END)

# Compile into the runnable graph used by the cells below
agentic_rag = graph_builder.compile()

In [None]:
from IPython.display import Image, display, Markdown

display(Image(agentic_rag.get_graph().draw_mermaid_png()))

### Test the Agentic CRAG System

In [None]:
query = "What attention mechanisms were used in the Attention paper?"
response = agentic_rag.invoke({"question": query})

In [None]:
display(Markdown(response['generation']))

In [None]:
response

In [None]:
query = "What novel approaches did the Gemini paper introduce in LLM training?"
response = agentic_rag.invoke({"question": query})

In [None]:
display(Markdown(response['generation']))

In [None]:
query = "What datasets were used for training in the GPT-4 paper?"
response = agentic_rag.invoke({"question": query})

In [None]:
display(Markdown(response['generation']))