In [None]:
!pip install langchain langchain-text-splitters langchain-community bs4

## Setup

In [None]:
import os
from dotenv import load_dotenv

# Read variables from a local .env file; override=True lets .env values
# replace variables that are already set in the process environment.
load_dotenv(override=True)
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with a clear message instead of a late error from the OpenAI client.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [None]:
!pip install -U "langchain[openai]"

In [None]:
from langchain.chat_models import init_chat_model
# Chat model that the agent is built around further down.
model = init_chat_model(model="gpt-4.1")

In [None]:
!pip install -U "langchain-openai"

In [None]:
from langchain_openai import OpenAIEmbeddings
# Embedding model handed to the vector store to embed chunks and queries.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
!pip install -U "langchain-core"

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
# In-memory store: contents live only as long as this kernel session.
vector_store = InMemoryVectorStore(embedding=embeddings)

## Document Loading

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Resolve <parent of the current working directory>/docs/AEM1.pdf.
# The result depends on where the notebook kernel was started.
parent_dir = os.path.dirname(os.getcwd())
pdf_path = os.path.join(parent_dir, "docs", "AEM1.pdf")

# Read the PDF; `pages` holds the documents returned by the loader.
loader = PyPDFLoader(pdf_path)
pages = loader.load()

In [None]:
# Sanity check: number of page documents loaded (the source PDF has 43 pages).
len(pages)

In [None]:
# Inspect the third page (index 2, zero-based) -- the Editor's Letter.
page3 = pages[2]
print(page3.page_content[:500])  # preview only the first 500 characters

In [None]:
# Show the metadata attached to the third page's document.
page3.metadata

## Document Splitting

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings kept together so they are easy to tune.
# add_start_index asks the splitter to record where each chunk starts in its source text.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded pages into overlapping chunks for embedding.
docs = text_splitter.split_documents(pages)

In [None]:
# Number of chunk documents produced by the splitter.
len(docs)

## Embedding and Storing

In [None]:
# Embed every chunk and add it to the vector store.
# Deterministic, position-based ids keep this cell idempotent: re-running it
# overwrites the existing entries instead of appending the same chunks again
# under fresh random ids (which would let similarity_search return duplicates).
chunk_ids = [f"chunk-{index}" for index in range(len(docs))]
doc_ids = vector_store.add_documents(documents=docs, ids=chunk_ids)
print(len(doc_ids))

In [None]:
# Peek at the id the vector store returned for the first chunk.
print(doc_ids[0])

## RAG Agent

In [None]:
from langchain.tools import tool

In [None]:
# Retrieval tool exposed to the agent.
# With response_format="content_and_artifact" the function returns a pair:
# the text shown to the model and the raw retrieved documents as the artifact.
# NOTE: the @tool decorator uses the docstring as the tool description the
# model sees, so it is runtime text -- keep it unchanged.
@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve information to help answer a query."""
    # Fetch the two chunks most similar to the query.
    hits = vector_store.similarity_search(query, k=2)

    # Render each hit as a "Source / Content" section.
    sections = []
    for hit in hits:
        sections.append(f"Source: {hit.metadata}\nContent: {hit.page_content}")

    return "\n\n".join(sections), hits

In [None]:
from langchain.agents import create_agent

In [None]:
# Instruction that tells the model a retrieval tool is available.
system_prompt = (
    "You have access to a tool that retrieves context from a PDF document. "
    "Use it to better answer user queries."
)

# The only tool the agent may call is the retriever defined above.
tools = [retrieve]

agent = create_agent(model=model, tools=tools, system_prompt=system_prompt)

## Generation

In [None]:
# Three-part user question, with the parts separated by blank lines.
# This text is sent to the agent verbatim as the user message.
query = (
    "Who are Frieda, Borg, Kev, Ike and Bree to one another?\n\n"
    "How old are they?\n\n"
    "What is the reason why Kev and Ike initially didn't go with their folks for a hike."
)

In [None]:
# Send the question as a single user message and stream the agent's progress.
# With stream_mode="values", each step carries a "messages" list; print the newest message.
request = {"messages": [{"role": "user", "content": query}]}

for step in agent.stream(request, stream_mode="values"):
    latest_message = step["messages"][-1]
    latest_message.pretty_print()