<a href="https://colab.research.google.com/github/solomontessema/building-ai-agents/blob/main/notebooks/5.2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU langchain==1.1.0  langchain-community==0.4.1 "unstructured[all-docs]"


In [None]:
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os

# Create the directory that holds the sample corpus (no-op if it already exists)
os.makedirs("docs", exist_ok=True)

# Sample text to write to disk. Note that it contains non-ASCII characters
# (non-breaking hyphens, arrows, a curly apostrophe).
content = """
LangGraph is a library for building stateful, multi-agent applications with LLMs.
It extends the LangChain Expression Language (LCEL) by allowing for cycles and
persistence in your agent workflows. Unlike linear chains, LangGraph uses
nodes and edges to create complex, iterative processes.
RecursiveCharacterTextSplitter is a text‑splitting utility in LangChain that breaks
large documents into smaller, semantically coherent chunks by recursively trying
larger separators first (paragraphs → sentences → words → characters).
It’s the most commonly recommended splitter for RAG pipelines because it preserves
meaning better than naive fixed‑size splits.
"""

# Pin the encoding to UTF-8: without it, open() falls back to the platform
# default (e.g. cp1252 on Windows), which cannot encode characters such as
# "→" and would raise UnicodeEncodeError.
with open("docs/langgraph_intro.txt", "w", encoding="utf-8") as f:
    f.write(content)

print("File created successfully!")


# Collect every .txt file matched by the recursive glob under docs/
loader = DirectoryLoader("docs/", glob="**/*.txt")
documents = loader.load()

# Rebuild each document with leading/trailing whitespace trimmed from its
# text; the original metadata object is passed through as-is
cleaned_docs = [
    Document(metadata=raw.metadata, page_content=raw.page_content.strip())
    for raw in documents
]

# Break the cleaned documents into chunks (chunk_size=500, chunk_overlap=100)
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)
chunks = splitter.split_documents(cleaned_docs)

# Stamp a "corpus" metadata entry on every chunk
for chunk in chunks:
    chunk.metadata.update(corpus="internal_knowledge_base")

# Report how many documents were loaded and how many chunks were produced
print(f"Loaded {len(documents)} documents, split into {len(chunks)} chunks.")
