In [None]:
# 1. Merge PDFs (already in your notebook)
from PyPDF2 import PdfMerger
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import AzureChatOpenAI
import os
import getpass
from langchain_openai import AzureChatOpenAI

pdf_folder = "/home/tulasiram/ubuntu_server/RAG_ChatApp_LangChain/Input_Data/health-plan"
output_pdf_path = "/home/tulasiram/ubuntu_server/RAG_ChatApp_LangChain/Output_Data/merged_output.pdf"

pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith(".pdf")]
merger = PdfMerger()
for pdf in pdf_files:
    merger.append(pdf)
merger.write(output_pdf_path)
merger.close()

# 2. Load merged PDF
loader = PyPDFLoader(output_pdf_path)
documents = loader.load()
print(f"Loaded {len(documents)} pages from merged file.")

# 3. Split into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=850,
    chunk_overlap=100,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(documents)
print(f"Splitted the merged document into {len(all_splits)} sub-documents.")

# 4. Embed and store with Azure OpenAI
## insert the LLM and embedding models here


vectorstore = FAISS.from_documents(all_splits, embedding_model)
vectorstore.save_local("/home/tulasiram/ubuntu_server/RAG_ChatApp_LangChain/vector_index")

Loaded 221 pages from merged file.
Splitted the merged document into 779 sub-documents.


In [2]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")

example_messages = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()

assert len(example_messages) == 1
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: (question goes here) 
Context: (context goes here) 
Answer:


In [3]:
from langchain_core.prompts import PromptTemplate

template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

In [4]:
total_documents = len(all_splits)
third = total_documents // 3

for i, document in enumerate(all_splits):
    if i < third:
        document.metadata["section"] = "beginning"
    elif i < 2 * third:
        document.metadata["section"] = "middle"
    else:
        document.metadata["section"] = "end"


all_splits[2].metadata

{'producer': 'PyPDF2',
 'creator': 'PyPDF',
 'creationdate': '',
 'source': '/home/tulasiram/ubuntu_server/RAG_ChatApp_LangChain/Output_Data/merged_output.pdf',
 'total_pages': 221,
 'page': 2,
 'page_label': '3',
 'start_index': 0,
 'section': 'beginning'}

In [5]:
from langchain_core.vectorstores import InMemoryVectorStore

vectorstore = InMemoryVectorStore(embedding_model)
_ = vectorstore.add_documents(all_splits)

In [6]:
from langgraph.graph import MessagesState, StateGraph

graph_builder = StateGraph(MessagesState)

In [7]:
from langchain_core.tools import tool
from typing_extensions import Annotated, List, TypedDict
from typing import Literal
from typing_extensions import Annotated


class Search(TypedDict):
    """Search query."""

    query: Annotated[str, ..., "Search query to run."]
    section: Annotated[
        Literal["beginning", "middle", "end"],
        ...,
        "Section to query.",
    ]

class State(TypedDict):
    question: str
    query: Search
    context: List[Document]
    answer: str


@tool(response_format="content_and_artifact")
def retrieve(state: State, vectorstore):
    """Retrieve relevant documents based on the query and section."""
    query = state["query"]
    retrieved_docs = vectorstore.similarity_search(
        query["query"],
        filter=lambda doc: doc.metadata.get("section") == query["section"],
    )
    return {"context": retrieved_docs}

NameError: name 'Document' is not defined

In [None]:
from langchain_core.messages import SystemMessage
from langgraph.prebuilt import ToolNode


# Step 1: Generate an AIMessage that may include a tool-call to be sent.
def query_or_respond(state: MessagesState):
    """Generate tool call for retrieval or respond."""
    llm_with_tools = llm.bind_tools([retrieve])
    response = llm_with_tools.invoke(state["messages"])
    # MessagesState appends messages to state instead of overwriting
    return {"messages": [response]}


# Step 2: Execute the retrieval.
tools = ToolNode([retrieve])


# Step 3: Generate a response using the retrieved content.
def generate(state: MessagesState):
    """Generate answer."""
    # Get generated ToolMessages
    recent_tool_messages = []
    for message in reversed(state["messages"]):
        if message.type == "tool":
            recent_tool_messages.append(message)
        else:
            break
    tool_messages = recent_tool_messages[::-1]

    # Format into prompt
    docs_content = "\n\n".join(doc.content for doc in tool_messages)
    system_message_content = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        f"{docs_content}"
    )
    conversation_messages = [
        message
        for message in state["messages"]
        if message.type in ("human", "system")
        or (message.type == "ai" and not message.tool_calls)
    ]
    prompt = [SystemMessage(system_message_content)] + conversation_messages

    # Run
    response = llm.invoke(prompt)
    return {"messages": [response]}

In [None]:
from langgraph.graph import END
from langgraph.prebuilt import ToolNode, tools_condition

graph_builder.add_node(query_or_respond)
graph_builder.add_node(tools)
graph_builder.add_node(generate)

graph_builder.set_entry_point("query_or_respond")
graph_builder.add_conditional_edges(
    "query_or_respond",
    tools_condition,
    {END: END, "tools": "tools"},
)
graph_builder.add_edge("tools", "generate")
graph_builder.add_edge("generate", END)

graph = graph_builder.compile()

In [None]:
input_message = "What is the mission of Contoso Electronics?"

for step in graph.stream(
    {"messages": [{"role": "user", "content": input_message}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


What is the mission of Contoso Electronics?
Tool Calls:
  retrieve (call_eW8UNS14sCcJWqyIrP5VMBDo)
 Call ID: call_eW8UNS14sCcJWqyIrP5VMBDo
  Args:
    query: mission of Contoso Electronics
Name: retrieve

Source: {'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'source': '/home/tulasiram/ubuntu_server/RAG_ChatApp_LangChain/Output_Data/merged_output.pdf', 'total_pages': 221, 'page': 217, 'page_label': '218', 'start_index': 0, 'section': 'end'}
Content: Contoso Electronics 
Plan and Benefit Packages

Source: {'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'source': '/home/tulasiram/ubuntu_server/RAG_ChatApp_LangChain/Output_Data/merged_output.pdf', 'total_pages': 221, 'page': 220, 'page_label': '221', 'start_index': 774, 'section': 'end'}
Content: Health Plus and Northwind Standard. We 
are confident that you will find the right plan for you and 
your family. Thank you for choosing Contoso Electronics!

I don't know.
