In [15]:
!pip install -qU langchain langchain-community langchain-chroma
!pip install -qU sentence-transformers
!pip install -qU tiktoken

# For free LLM model
!pip install -qU transformers accelerate sentencepiece



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from langchain_community.document_loaders import WebBaseLoader

# Source page for the knowledge base: LangChain's RAG tutorial.
url = "https://python.langchain.com/docs/tutorials/rag/"
loader = WebBaseLoader(url)

# Fetch the page; the recorded run produced a single document for this single URL.
raw_docs = loader.load()

# Sanity check: document count plus the first 500 characters of the first document.
# NOTE(review): the recorded sample is mostly site navigation text rather than
# tutorial body — consider restricting which part of the page the loader parses.
print("Number of documents:", len(raw_docs))
print("Sample content:\n", raw_docs[0].page_content[:500])


Number of documents: 1
Sample content:
 Build a RAG agent with LangChain - Docs by LangChainSkip to main content🚀 Share how you're building agents for a chance to win LangChain swag!Docs by LangChain home pageLangChain + LangGraphSearch...⌘KAsk AIGitHubTry LangSmithTry LangSmithSearch...NavigationLangChainBuild a RAG agent with LangChainLangChainLangGraphDeep AgentsIntegrationsLearnReferenceContributePythonLearnTutorialsLangChainSemantic searchRAG agentSQL agentSupervisor agentLangGraphConceptual overviewsComponent architectureMemoryC


In [18]:
def clean_text(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs, carriage returns and other Unicode whitespace (for example
    non-breaking spaces, which are common in scraped HTML) are all treated the
    same way, and leading/trailing whitespace is removed.

    Args:
        text: Raw text, e.g. the page content of a scraped document.

    Returns:
        The text with words separated by exactly one space; an empty string
        if ``text`` is empty or contains only whitespace.
    """
    # str.split() with no separator splits on any whitespace run and drops
    # empty leading/trailing fields, so one pass replaces the former
    # "replace newlines, strip, then loop until no double spaces" sequence.
    return " ".join(text.split())

# apply cleaning to every document
# This overwrites page_content in place, so raw_docs holds the cleaned text
# from this cell onward (later cells read raw_docs, not a copy).
# NOTE(review): clean_text removes newlines, which also removes the line and
# paragraph boundaries a downstream splitter could break on — confirm intended.
for doc in raw_docs:
    doc.page_content = clean_text(doc.page_content)

# Preview the first 300 characters of the first cleaned document.
print("After cleaning:\n")
print(raw_docs[0].page_content[:300])


After cleaning:

Build a RAG agent with LangChain - Docs by LangChainSkip to main content🚀 Share how you're building agents for a chance to win LangChain swag!Docs by LangChain home pageLangChain + LangGraphSearch...⌘KAsk AIGitHubTry LangSmithTry LangSmithSearch...NavigationLangChainBuild a RAG agent with LangChainL


In [19]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the cleaned documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,      # upper bound on chunk length (presumably characters, the library default — confirm)
    chunk_overlap=200,   # amount shared by consecutive chunks so content is not cut off at a boundary
)

chunks = text_splitter.split_documents(raw_docs)

# Report how many chunks were produced and preview the first 400 characters
# of the first one (36 chunks in the recorded run).
print("Total chunks:", len(chunks))
print("\nSample chunk:\n", chunks[0].page_content[:400])


Total chunks: 36

Sample chunk:
 Build a RAG agent with LangChain - Docs by LangChainSkip to main content🚀 Share how you're building agents for a chance to win LangChain swag!Docs by LangChain home pageLangChain + LangGraphSearch...⌘KAsk AIGitHubTry LangSmithTry LangSmithSearch...NavigationLangChainBuild a RAG agent with LangChainLangChainLangGraphDeep AgentsIntegrationsLearnReferenceContributePythonLearnTutorialsLangChainSemanti


In [21]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# free embedding model
# NOTE(review): the langchain_community HuggingFaceEmbeddings class is reported
# as deprecated in favour of the langchain-huggingface package — confirm against
# the installed version before relying on it long term.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Quick test
# Embed one query and inspect the vector; the recorded run shows 384 dimensions.
test_vector = embedding_model.embed_query("What is Retrieval-Augmented Generation?")
print("Embedding length:", len(test_vector))
print("First 5 numbers:", test_vector[:5])


Embedding length: 384
First 5 numbers: [-0.11101916432380676, -0.026313427835702896, -0.05789431557059288, 0.05978523567318916, -0.020831575617194176]


In [22]:
from langchain_chroma import Chroma

# create vector store from your chunks
# Stable, position-based ids make this cell safe to re-run in the same kernel:
# without them every run adds the same chunks again under fresh random ids to
# the same named collection, and similarity search starts returning duplicates.
# With explicit ids a re-run overwrites the existing entries instead.
# NOTE(review): if a later run yields fewer chunks, entries with higher ids from
# an earlier run remain — restart the kernel (or drop the collection) in that case.
chunk_ids = [f"chunk-{i}" for i in range(len(chunks))]

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    ids=chunk_ids,
    collection_name="langchain_rag_docs"
)

print("✅ Vector store created!")

# --- test retrieval ---
# Ask for the 3 nearest chunks to a sample question and preview the best match.
query = "What are the main steps to build a RAG pipeline?"
docs_retrieved = vectorstore.similarity_search(query, k=3)

print("\nRetrieved", len(docs_retrieved), "chunks.")
print("\nSample retrieved chunk:\n")
print(docs_retrieved[0].page_content[:500])


✅ Vector store created!

Retrieved 3 chunks.

Sample retrieved chunk:

Build a RAG agent with LangChain - Docs by LangChainSkip to main content🚀 Share how you're building agents for a chance to win LangChain swag!Docs by LangChain home pageLangChain + LangGraphSearch...⌘KAsk AIGitHubTry LangSmithTry LangSmithSearch...NavigationLangChainBuild a RAG agent with LangChainLangChainLangGraphDeep AgentsIntegrationsLearnReferenceContributePythonLearnTutorialsLangChainSemantic searchRAG agentSQL agentSupervisor agentLangGraphConceptual overviewsComponent architectureMemoryC


In [24]:
# turn vector store into a retriever
# The retriever is the object rag_answer() queries, so k here controls how many
# chunks end up in the prompt context.
retriever = vectorstore.as_retriever(
    search_type="similarity",      # basic similarity search
    search_kwargs={"k": 3}         # return top 3 chunks
)

# test retriever
# Run one sample question through the retriever and preview the first hit.
test_query = "How does chunking work in a RAG pipeline?"
retrieved_docs = retriever.invoke(test_query)

print("Retriever returned", len(retrieved_docs), "docs.\n")
print("First doc sample:\n")
print(retrieved_docs[0].page_content[:500])


Retriever returned 3 docs.

First doc sample:

'9000bf8e-1993-446f-8d4d-f4e507ba4b8f', 'ba3b5d14-bed9-4f5f-88be-44c88aedc2e6'] Go deeper Embeddings: Wrapper around a text embedding model, used for converting text to embeddings. Integrations: 30+ integrations to choose from. Interface: API reference for the base interface. VectorStore: Wrapper around a vector database, used for storing and querying embeddings. Integrations: 40+ integrations to choose from. Interface: API reference for the base interface. This completes the Indexing portion of


In [25]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Checkpoint used as the generator in the RAG pipeline.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading model (this one is heavier than previous steps)...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"   # uses GPU if available, else CPU
)

# Text-generation wrapper that rag_answer() calls with a plain prompt string.
# max_new_tokens caps the length of each completion; the recorded run
# reported that the pipeline was placed on the CPU.
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256
)

# The check-mark was mis-encoded in the status message; restored here.
print("✅ TinyLlama loaded and ready!")


Loading tokenizer...
Loading model (this one is heavier than previous steps)...


Device set to use cpu


✅ TinyLlama loaded and ready!


In [26]:
def format_docs(docs):
    """Merge the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents in their original order, with a horizontal-rule
        style divider placed between neighbouring documents.
    """
    divider = "\n\n---\n\n"
    contents = [doc.page_content for doc in docs]
    return divider.join(contents)

# Instruction text placed at the top of every prompt: it scopes the assistant
# to the tutorial and tells it to admit uncertainty rather than guess.
system_instruction = "".join(
    (
        "You are an assistant that answers questions about LangChain's RAG tutorial. ",
        "Use ONLY the context below. If you are not sure from the context, say you are not sure.",
    )
)

def rag_answer(question: str) -> str:
    """Answer ``question`` using retrieved chunks as context for the LLM.

    Relies on the notebook-level ``retriever``, ``format_docs``,
    ``system_instruction`` and ``llm_pipeline`` objects.

    Args:
        question: The user's question; used both for retrieval and in the prompt.

    Returns:
        The model's completion with surrounding whitespace removed.
    """
    # 1. Retrieve relevant chunks
    docs = retriever.invoke(question)
    context = format_docs(docs)

    # 2. Build prompt for TinyLlama
    prompt = (
        f"{system_instruction}\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n"
        f"Answer:"
    )

    # 3. Call LLM, asking only for the newly generated text.
    # By default the pipeline returns prompt + completion, and searching that
    # for the first "Answer:" breaks whenever a retrieved chunk itself contains
    # "Answer:" — the split then lands inside the context, not at the reply.
    generated = llm_pipeline(prompt, return_full_text=False)[0]["generated_text"]

    # 4. Defensive cleanup: if the prompt is echoed anyway, drop it by position
    # rather than by searching for a marker string.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]

    return generated.strip()

# Confirmation that the definitions in this cell ran; the check-mark in the
# message was mis-encoded and is restored here.
print("✅ RAG function defined.")


✅ RAG function defined.


In [27]:
def format_docs(docs):
    """Build the prompt context from retrieved documents.

    Collects each document's ``page_content`` in order and joins the pieces
    with a ``---`` divider surrounded by blank lines.
    """
    pieces = []
    for doc in docs:
        pieces.append(doc.page_content)
    return "\n\n---\n\n".join(pieces)

# Instruction text placed at the top of every prompt. Compared with the earlier
# definition it also bounds the answer length and forbids follow-up Q&A pairs.
# The sentence range uses an en dash; it was mis-encoded in the prompt text and
# is restored here so the model is not fed garbage characters.
system_instruction = (
    "You are an assistant that answers questions about LangChain's RAG tutorial. "
    "Use ONLY the context below. "
    "Give a short, clear answer in 3–6 sentences. "
    "Do not ask new questions. Do not start new Q&A pairs."
)

def rag_answer(question: str) -> str:
    """Answer ``question`` using retrieved chunks as context for the LLM.

    Relies on the notebook-level ``retriever``, ``format_docs``,
    ``system_instruction`` and ``llm_pipeline`` objects.

    Args:
        question: The user's question; used both for retrieval and in the prompt.

    Returns:
        The model's completion, trimmed, and truncated before any follow-up
        "Question:" the model starts generating on its own.
    """
    # 1. Retrieve relevant chunks
    docs = retriever.invoke(question)
    context = format_docs(docs)

    # 2. Build prompt
    prompt = (
        f"{system_instruction}\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n"
        f"Answer:"
    )

    # 3. Call TinyLlama, asking only for the newly generated text.
    # By default the pipeline returns prompt + completion, and splitting that on
    # the first "Answer:" breaks whenever a retrieved chunk itself contains
    # "Answer:" — the cut then lands inside the context, not at the reply.
    generated = llm_pipeline(prompt, return_full_text=False)[0]["generated_text"]

    # 4. Defensive: if the prompt is echoed anyway, drop it by position
    # rather than by searching for a marker string.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]

    text = generated.strip()

    # 5. If it starts adding another "Question:", cut it off
    if "Question:" in text:
        text = text.split("Question:", 1)[0].strip()

    return text


In [28]:
# Demo: run one question through the full RAG pipeline.
# The question lives in a single variable so the printed label always matches
# what is actually sent to rag_answer (previously the label read "langchain?"
# while the query was "langchain").
# A full-sentence question such as "What is Retrieval-Augmented Generation (RAG)?"
# gives the retriever more to match on than a one-word query.
question = "langchain"
print("Q:", question)
print("A:", rag_answer(question))


Q: langchain?
A: Add a key to the state to store the retrieved documents and add a new node via a pre-model hook to populate that key (as well as inject the context).
