In [7]:
"""
Advanced Document Processing: Loading, Chunking, and Embedding
This shows how to process real documents for RAG systems
"""

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import os

# ==============================================================================
# STEP 1: Understanding Text Splitters
# ==============================================================================
"""
Why chunk documents?
1. LLMs have token limits - can't process entire books
2. Smaller chunks = more precise retrieval
3. Better semantic focus per chunk
"""

# Sample long document.
# Written as one literal per line so the trailing space that follows each of
# the first two sentences is visible and cannot be lost to an editor that
# strips trailing whitespace.
long_text = (
    "\n"
    "Artificial Intelligence (AI) has transformed numerous industries. \n"
    "In healthcare, AI assists in diagnosis and treatment planning. \n"
    "Machine learning algorithms analyze medical images to detect diseases.\n"
    "\n"
    "In finance, AI powers fraud detection systems and algorithmic trading.\n"
    "Natural language processing enables chatbots for customer service.\n"
    "Computer vision is used in autonomous vehicles to perceive the environment.\n"
    "\n"
    "Deep learning, a subset of machine learning, uses neural networks.\n"
    "These networks consist of layers that process information hierarchically.\n"
    "Training requires large datasets and significant computational resources.\n"
    "\n"
    "The future of AI includes quantum computing integration.\n"
    "Ethical considerations around AI bias and privacy are crucial.\n"
    "Responsible AI development ensures fairness and transparency.\n"
)

# RecursiveCharacterTextSplitter is given its separators in priority order:
# paragraph breaks, line breaks, periods, spaces, and finally the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],  # Tried in the order listed
    length_function=len,   # Chunk size is measured with len(), i.e. characters
    chunk_size=200,        # Target size of each chunk
    chunk_overlap=50,      # Overlap between chunks, so context is not lost
)

chunks = text_splitter.split_text(long_text)

# Summary: size of the input and how many chunks came out of it.
print(
    "=" * 80,
    f"Original text length: {len(long_text)} characters",
    f"Number of chunks created: {len(chunks)}",
    "=" * 80,
    sep="\n",
)

# One section per chunk: numbered heading (1-based), ruler, then the text.
for i, chunk in enumerate(chunks):
    print(f"\nChunk {i+1} ({len(chunk)} chars):", "-" * 80, chunk, sep="\n")

# ==============================================================================
# STEP 2: Working with LangChain Documents
# ==============================================================================
"""
LangChain Documents have:
- page_content: The actual text
- metadata: Information about the document (source, page number, etc.)
"""

from langchain_core.documents import Document

# Create documents manually from (text, source file, topic) rows.
# Every example document is tagged as page 1.
documents = [
    Document(
        page_content=text,
        metadata={"source": source, "page": 1, "topic": topic},
    )
    for text, source, topic in (
        (
            "Python is a versatile programming language.",
            "python_guide.txt",
            "programming",
        ),
        (
            "Machine learning models require training data.",
            "ml_basics.txt",
            "ML",
        ),
        (
            "Vector databases enable semantic search.",
            "databases.txt",
            "databases",
        ),
    )
]

# Split the Document objects (rather than raw strings) so that each resulting
# piece still carries metadata.
doc_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=100)
split_docs = doc_splitter.split_documents(documents)

print("\n" + "=" * 80, "Split Documents with Metadata:", "=" * 80, sep="\n")

# Show the content and metadata of every piece, numbered from 1.
for i, doc in enumerate(split_docs):
    print(
        f"\nDocument {i+1}:",
        f"Content: {doc.page_content}",
        f"Metadata: {doc.metadata}",
        sep="\n",
    )

# ==============================================================================
# STEP 3: Loading Documents from Files
# ==============================================================================
"""
LangChain provides loaders for various file types:
- TextLoader: Plain text files
- PyPDFLoader: PDF files
- CSVLoader: CSV files
- UnstructuredLoader: HTML, Word docs, etc.
"""

# Example: Create a sample text file.
# The encoding is stated explicitly here and again when loading, so the
# write/read round trip does not depend on the platform's locale default.
sample_file = "sample_doc.txt"
with open(sample_file, "w", encoding="utf-8") as f:
    f.write("""
Introduction to Vector Databases

Vector databases store data as high-dimensional vectors.
These vectors capture semantic meaning of text.
They enable similarity search based on meaning, not keywords.

Applications include:
- Semantic search engines
- Recommendation systems
- Question answering systems
- Document retrieval for RAG

Popular vector databases include ChromaDB, Pinecone, and Weaviate.
""")

# Load the file back, using the same encoding it was written with.
loader = TextLoader(sample_file, encoding="utf-8")
loaded_docs = loader.load()

print("\n" + "=" * 80)
print("Loaded Document:")
print("=" * 80)
print(f"Number of documents: {len(loaded_docs)}")
# Only the first 200 characters of the first document are previewed.
print(f"Content preview: {loaded_docs[0].page_content[:200]}...")
print(f"Metadata: {loaded_docs[0].metadata}")

# Split the loaded document with the splitter configured in Step 2
# (doc_splitter: chunk_size=100, chunk_overlap=20).
split_loaded_docs = doc_splitter.split_documents(loaded_docs)
print(f"\nAfter splitting: {len(split_loaded_docs)} chunks")

# ==============================================================================
# STEP 4: Advanced Chunking Strategies
# ==============================================================================

# Strategy 1: character-based splitting (the approach used up to this point)
char_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_overlap=30,
    chunk_size=150,
)

# Strategy 2: token-based splitting (better for LLM context windows)
from langchain_text_splitters import TokenTextSplitter

token_splitter = TokenTextSplitter(
    chunk_overlap=10,  # Overlap, counted in tokens
    chunk_size=50,     # Chunk size, counted in tokens
)

print("\n" + "=" * 80, "Comparison of Splitting Strategies:", "=" * 80, sep="\n")

# The same sentence repeated 20 times gives both splitters identical input.
test_text = 20 * "AI is transforming industries. "

char_chunks = char_splitter.split_text(test_text)
token_chunks = token_splitter.split_text(test_text)

# Report how many chunks each strategy produced.
print(
    f"Character-based splitter: {len(char_chunks)} chunks",
    f"Token-based splitter: {len(token_chunks)} chunks",
    sep="\n",
)

# ==============================================================================
# STEP 5: Custom Metadata and Chunk IDs
# ==============================================================================
"""
Adding rich metadata helps with:
- Filtering results
- Source attribution
- Debugging
- Access control
"""

def create_documents_with_rich_metadata(
    texts, source_name, *, created_at="2024-01-15", doc_type="tutorial"
):
    """Wrap each text in a Document carrying descriptive metadata.

    Args:
        texts: Iterable of strings; each one becomes the page_content of
            one Document.
        source_name: Value stored under the "source" metadata key.
        created_at: Value stored under the "created_at" metadata key.
            Defaults to "2024-01-15", the value that used to be hard-coded,
            so existing calls produce the same metadata as before.
        doc_type: Value stored under the "doc_type" metadata key.
            Defaults to "tutorial", the previously hard-coded value.

    Returns:
        A list of Documents in the same order as ``texts``. Each has
        "chunk_id" set to its 0-based position in ``texts`` and
        "chunk_size" set to ``len(text)``.
    """
    return [
        Document(
            page_content=text,
            metadata={
                "source": source_name,
                "chunk_id": chunk_id,
                "chunk_size": len(text),
                "created_at": created_at,
                "doc_type": doc_type,
            },
        )
        for chunk_id, text in enumerate(texts)
    ]

# Attach metadata to the chunks produced in Step 1.
rich_docs = create_documents_with_rich_metadata(chunks, "ai_overview.txt")

print("\n" + "=" * 80, "Documents with Rich Metadata:", "=" * 80, sep="\n")
for doc in rich_docs[:2]:  # Only the first two documents are shown
    # Content is previewed up to 80 characters; metadata is printed in full.
    print(
        f"\nContent: {doc.page_content[:80]}...",
        f"Metadata: {doc.metadata}",
        sep="\n",
    )

# Cleanup: delete the sample file written during the file-loading step.
os.remove(sample_file)

print("\n" + "=" * 80, "Chunking tutorial complete!", "=" * 80, sep="\n")

Original text length: 817 characters
Number of chunks created: 7

Chunk 1 (129 chars):
--------------------------------------------------------------------------------
Artificial Intelligence (AI) has transformed numerous industries. 
In healthcare, AI assists in diagnosis and treatment planning.

Chunk 2 (70 chars):
--------------------------------------------------------------------------------
Machine learning algorithms analyze medical images to detect diseases.

Chunk 3 (137 chars):
--------------------------------------------------------------------------------
In finance, AI powers fraud detection systems and algorithmic trading.
Natural language processing enables chatbots for customer service.

Chunk 4 (75 chars):
--------------------------------------------------------------------------------
Computer vision is used in autonomous vehicles to perceive the environment.

Chunk 5 (140 chars):
--------------------------------------------------------------------------------
Deep le