RAG Architecture Chatbot 


In [23]:
from dotenv import load_dotenv

load_dotenv(dotenv_path=".env", override=True)

True

In [24]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [25]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

loader = PyPDFDirectoryLoader('./data/pdf')

docs = loader.load()

docs[0].page_content

'HP  Notebook  PCs  -  Testing  and  calibrating  \nthe\n \nbattery\n \n(Windows)\n Calibrate  the  battery  manually  \nManually  calibrating  the  battery  requires  you  to  discharge  or  drain  the  battery,  and  then  \nto\n \nrecharge\n \nthe\n \nbattery.\n \nThese\n \ninstructions\n \nwork\n \nwith\n \nmost\n \nbattery\n \ntypes.\n \nRun  a  battery  test  after  calibrating  the  battery.  Calibrating  the  battery  improves  the  \naccuracy\n \nof\n \nthe\n \nbattery\n \ngauge\n \nsoftware\n \nso\n \nthat\n \nit\n \nmore\n \naccurately\n \ndisplays\n \nthe\n \ncurrent\n \nlevel\n \nof\n \ncharge.\n \nNote:  You  do  not  need  to  fully  charge  the  battery  before  you  calibrate  it.  Turn  on  the  \ncomputer\n \nand\n \nuse\n \nHP\n \nPC\n \nHardware\n \nDiagnostics\n \nUEFI\n \nto\n \ndischarge\n \nthe\n \nbattery.\n \nYou  do  need  to  update  HP  PC  Hardware  Diagnostics  UEFI  to  the  latest  version.  To  \ndownload\n \nthe\n \nlatest\n \nversion,\n \ngo\n \nto\

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re
from langchain_core.documents import Document

def clean_text(text: str) -> str:
    # Replace multiple newlines/tabs with a single space
    text = re.sub(r"\s*\n\s*", " ", text)
    return text.strip()


cleaned_docs = [
    Document(
        page_content=clean_text(doc.page_content),
        metadata=doc.metadata
    )
    for doc in docs
]


text_splitter = RecursiveCharacterTextSplitter(

    chunk_size=200,
    chunk_overlap=50,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(cleaned_docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 469 sub-documents.


In [53]:
all_splits

[Document(metadata={'producer': 'Skia/PDF m145 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'HP Notebook PCs - Testing and calibrating the battery (Windows)', 'source': 'data\\pdf\\battery.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'start_index': 0}, page_content='HP  Notebook  PCs  -  Testing  and  calibrating the battery (Windows) Calibrate  the  battery  manually Manually  calibrating  the  battery  requires  you  to  discharge  or  drain  the  battery,  and'),
 Document(metadata={'producer': 'Skia/PDF m145 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'HP Notebook PCs - Testing and calibrating the battery (Windows)', 'source': 'data\\pdf\\battery.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'start_index': 151}, page_content='you  to  discharge  or  drain  the  battery,  and  then to recharge the battery. These instructions work with most battery types. Run  a  battery  test  after  calibrating  the  battery.  

In [45]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [46]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

In [47]:

document_ids = vector_store.add_documents(documents=all_splits)

print(document_ids[:3])

['3c5e2352-f6d2-4f12-bb4d-510a9fe01327', '309de1e0-70bd-4992-8cea-e3a6e98ed4a6', 'b9bd3d1c-2326-4aad-91b5-8d992536b901']


In [37]:

retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [48]:
from langchain_ollama.chat_models import ChatOllama

model = ChatOllama(
    model="llama3.2:latest",
    temperature=0.5,
    streaming=True
)



In [49]:
from langchain.tools import tool

@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    retrieved_docs = vector_store.similarity_search(query, k=4)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\nContent: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

In [50]:
from langchain.agents import create_agent
from langsmith import traceable

tools = [retrieve_context]
# If desired, specify custom instructions
prompt = (
    "You are a customer support agent for a laptop manufacturing company and you specialize in providing personalized answers."
    "Use the tool to help answer user queries."
)


@traceable
def run_agent(query: str):
    agent = create_agent(model, tools, system_prompt=prompt)

    for event in agent.stream(
        {"messages": [{"role": "user", "content": query}]},
        stream_mode="values",
    ):
        event["messages"][-1].pretty_print()

In [51]:
query = (
    "How to fix no power issue?"
    "Once you get the answer, look up common extensions of that method."
)


run_agent(query)


How to fix no power issue?Once you get the answer, look up common extensions of that method.
Tool Calls:
  retrieve_context (9ae7a41f-af11-459e-b58d-d7fafb210149)
 Call ID: 9ae7a41f-af11-459e-b58d-d7fafb210149
  Args:
    query: how to fix no power issue on laptop
Name: retrieve_context

Source: {'producer': 'Skia/PDF m145 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'No power Issue', 'source': 'data\\pdf\\power.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'start_index': 1203}
Content: any corrosion or damage. Clean them gently with a dry cloth if needed. Try powering the laptop with just the AC adapter connected and no battery installed. If it turns on, your battery may need

Source: {'producer': 'Skia/PDF m145 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'HP Notebook PCs - Testing and calibrating the battery (Windows)', 'source': 'data\\pdf\\battery.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'start_index': 105