In [1]:
!pip install langchain langchain-text-splitters langchain-community bs4

Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting bs4
  Using cached bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting langchain-core<0.2,>=0.1 (from langchain)
  Downloading langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
Collecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.92-py3-none-any.whl.metadata (9.9 kB)
INFO: pip is looking at multiple versions of langchain-text-splitters to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-text-splitters
  Using cached langchain_text_splitters-0.3.11-py3-none-any.whl.metadata (1.8 kB)
  Using cached langchain_text_splitters-0.3.10-py3-none-any.whl.metadata (1.9 kB)
  Using cached langchain_text_splitters-0.3.9-py3-none-any.whl.metadata (1.9 kB)
  Using cached langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
  Using cached langchain_text_splitt


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
# override=True is passed so that .env values win over variables that are already set.
load_dotenv(override=True)
# Look the key up in the environment mapping; the result is None when it is not defined.
api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
!pip install -U "langchain[openai]"




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from langchain.chat_models import init_chat_model
# Chat model that is later handed to create_agent() as the agent's LLM.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment variable loaded above — confirm.
model = init_chat_model("gpt-4.1")

In [5]:
!pip install -U "langchain-openai"




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from langchain_openai import OpenAIEmbeddings
# Embeddings client; it is passed to InMemoryVectorStore in a later cell.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [7]:
!pip install -U "langchain-core"




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
from langchain_core.vectorstores import InMemoryVectorStore
# Vector store built on `embeddings`; it is filled with add_documents() and queried with
# similarity_search() further down in this notebook.
vector_store = InMemoryVectorStore(embeddings)

## Document Loading

In [12]:
from langchain_community.document_loaders import PyPDFLoader

# Build the absolute path of the PDF: it is expected in a "docs" folder that sits
# next to (i.e. in the parent of) the current working directory.
pdf_path = os.path.join(
    os.path.dirname(os.getcwd()),  # parent of the working directory
    "docs",
    "AEM1.pdf",
)

# Load the file; the result is a list with one entry per PDF page (see len(pages) below).
loader = PyPDFLoader(pdf_path)
pages = loader.load()

In [13]:
# number of pages loaded (the PDF has 43 pages)
len(pages)

43

In [14]:
# Let's see the third page (Editor's Letter).
# `pages` is indexed from 0, so index 2 is the page whose label is "3".
page3 = pages[2]
print(page3.page_content[:500])  # print the first 500 characters

prosperoenglish.com www.youtube.com/c/ProsperoEnglish 
 
3 Your American English Magazine | 1/2022 
 
Editor’s Letter 
Hey there, I’m excited to deliver this very first 
issue of Your American English Magazine to you. I 
don’t know how this new magazine will be doing. 
Hopefully well, because if there is interest in it, I’ll 
be publishing more issues on a regular basis. 
So, what will you find in this issue? First, there’s a 
story, A Winter Hike. I’m planning to write a story 
like that in eac


In [15]:
# Let's see the metadata of the third page
page3.metadata

{'producer': 'Microsoft® Office Word 2007',
 'creator': 'Microsoft® Office Word 2007',
 'creationdate': '2022-02-17T14:01:22+01:00',
 'title': 'prosperoenglish.com',
 'author': 'Victor',
 'moddate': '2022-02-17T14:01:22+01:00',
 'source': 'd:\\Projects\\langchain_rag_chatbot\\docs\\AEM1.pdf',
 'total_pages': 43,
 'page': 2,
 'page_label': '3'}

## Document Splitting

In [16]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: chunks of up to 1000 with an overlap of 200 between neighbours
# (presumably measured in characters — confirm against the splitter's length function).
# add_start_index=True asks the splitter to keep track of where each chunk starts.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)

# Break the loaded pages into smaller documents for embedding.
docs = text_splitter.split_documents(pages)

In [17]:
# number of documents after splitting
len(docs)

98

## Embedding and Storing

In [18]:
# Add the chunks to the vector store; the call returns one id per stored document.
# NOTE(review): re-running this cell presumably inserts the same chunks again under new ids —
# confirm, and re-create `vector_store` first if duplicates would skew retrieval.
doc_ids = vector_store.add_documents(documents=docs)
print(len(doc_ids))

98


In [19]:
print(doc_ids[0])

4a9271c1-bbc9-4ea8-ba1a-041865f6b864


## RAG Agent

In [20]:
from langchain.tools import tool

In [22]:
# RAG agent
@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve information to help answer a query."""
    # NOTE(review): with @tool the docstring above is presumably used as the tool description
    # shown to the model — treat its wording as runtime text, not as a plain comment.

    # Fetch the two chunks most similar to the query.
    hits = vector_store.similarity_search(query, k=2)

    # Render every hit as a "Source / Content" section.
    sections = []
    for hit in hits:
        sections.append(f"Source: {hit.metadata}\nContent: {hit.page_content}")

    # Two-element return: the text form first, then the raw documents
    # (matches response_format="content_and_artifact").
    return "\n\n".join(sections), hits

In [23]:
from langchain.agents import create_agent

In [24]:
# The retrieval tool is the only tool offered to the agent.
tools = [retrieve]

# Instruction that tells the model what the tool is for.
system_prompt = "You have access to a tool that retrieves context from a PDF document. Use it to better answer user queries."

# Wire the chat model, the tool list and the system prompt together.
agent = create_agent(model, tools=tools, system_prompt=system_prompt)

## Generation

In [27]:
# Three questions sent as a single user message, separated by blank lines.
query = "\n\n".join(
    [
        "Who are Frieda, Borg, Kev, Ike and Bree to one another?",
        "How old are they?",
        "What is the reason why Kev and Ike initially didn't go with their folks for a hike.",
    ]
)

In [30]:
# Run the agent on the query and, for every streamed event, print the newest message it contains.
for event in agent.stream({"messages": [{"role": "user", "content": query}]}, stream_mode="values"):
    newest_message = event["messages"][-1]
    newest_message.pretty_print()


Who are Frieda, Borg, Kev, Ike and Bree to one another?

How old are they?

What is the reason why Kev and Ike initially didn't go with their folks for a hike.
Tool Calls:
  retrieve (call_oZ6YOT4k2DEWM5oVlEZeEkXc)
 Call ID: call_oZ6YOT4k2DEWM5oVlEZeEkXc
  Args:
    query: relationships between Frieda, Borg, Kev, Ike, and Bree
  retrieve (call_q5dIag4GxbAEc994T7arrIb5)
 Call ID: call_q5dIag4GxbAEc994T7arrIb5
  Args:
    query: ages of Frieda, Borg, Kev, Ike, and Bree
  retrieve (call_zjx6zpcHBDSpbVzbTee2xjIj)
 Call ID: call_zjx6zpcHBDSpbVzbTee2xjIj
  Args:
    query: why Kev and Ike did not initially go for a hike with their folks
Name: retrieve

Source: {'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2022-02-17T14:01:22+01:00', 'title': 'prosperoenglish.com', 'author': 'Victor', 'moddate': '2022-02-17T14:01:22+01:00', 'source': 'd:\\Projects\\langchain_rag_chatbot\\docs\\AEM1.pdf', 'total_pages': 43, 'page': 8, 'page_label': '9',