# Workout: Document Processing

## Setup
```bash
uv add pymupdf tiktoken beautifulsoup4
```

---
## Drill 1: PDF Text Extraction 🟢
**Task:** Extract text from a PDF

In [None]:
import fitz

def extract_pdf_text(path: str) -> str:
    """Return all of the text contained in the PDF file at ``path``.

    Exercise stub: the body is intentionally left for you to write, so the
    function currently returns ``None``. This cell imports PyMuPDF as
    ``fitz`` for use in the implementation.

    Args:
        path: Location of the PDF file to read.

    Returns:
        The document's text as a single string (once implemented).
    """
    ...  # TODO: your implementation goes here

# Test (you'll need a sample PDF)
# text = extract_pdf_text("sample.pdf")
# print(text[:500])

---
## Drill 2: Fixed-Size Chunking 🟢
**Task:** Implement basic chunking

In [None]:
def chunk_by_chars(text: str, size: int = 500, overlap: int = 100) -> list[str]:
    """Split ``text`` into character-counted chunks that overlap.

    Exercise stub: not implemented yet, so the function currently returns
    ``None``.

    Args:
        text: The string to split.
        size: Character count used for each chunk.
        overlap: Number of characters consecutive chunks should share.

    Returns:
        The chunks in document order (once implemented).
    """
    ...  # TODO: your implementation goes here

# Sample input: 2,500 characters (1,000 "A", then 1,000 "B", then 500 "C").
text = "A" * 1000 + "B" * 1000 + "C" * 500
chunks = chunk_by_chars(text, size=500, overlap=100)
# NOTE: while chunk_by_chars is still an unimplemented stub it returns None,
# so the lines below raise TypeError until the drill is completed.
print(f"Chunks: {len(chunks)}")
for i, c in enumerate(chunks):
    print(f"Chunk {i}: {len(c)} chars, starts with {c[:5]}")

---
## Drill 3: Recursive Splitting 🟡
**Task:** Split on natural boundaries

In [None]:
def recursive_split(
    text: str,
    chunk_size: int = 500,
    # Quoted so the "| None" union is not evaluated at definition time,
    # which keeps the signature importable on Python 3.9.
    separators: "list[str] | None" = None,
) -> list[str]:
    """Split ``text`` on natural boundaries, coarsest separator first.

    Exercise stub: only the default-separator handling is in place; the
    splitting itself is left for you to write, so the function currently
    returns ``None``.

    Args:
        text: The string to split.
        chunk_size: Target size, in characters, for each chunk.
        separators: Boundaries to try, in order of preference. When omitted
            (``None``) the defaults are a blank line, a newline, a sentence
            end (". ") and a single space.

    Returns:
        The chunks in document order (once implemented).
    """
    if separators is None:
        # Build a fresh list on every call, ordered coarsest to finest.
        separators = ["\n\n", "\n", ". ", " "]
    pass  # TODO: your implementation goes here

# Sample input: three paragraphs separated by blank lines; the first two
# contain two sentences each.
text = """First paragraph with multiple sentences. This is another sentence.

Second paragraph is here. It also has sentences.

Third paragraph is the last one."""

chunks = recursive_split(text, chunk_size=100)
# NOTE: while recursive_split is still an unimplemented stub it returns None,
# so this loop raises TypeError until the drill is completed.
for i, c in enumerate(chunks):
    print(f"Chunk {i}: {c[:50]}...")

---
## Drill 4: Token-Based Chunking 🟡
**Task:** Chunk by token count

In [None]:
import tiktoken

def chunk_by_tokens(
    text: str,
    max_tokens: int = 100,
    overlap_tokens: int = 20,
) -> list[str]:
    """Split ``text`` into chunks measured in tokens rather than characters.

    Exercise stub: not implemented yet, so the function currently returns
    ``None``. This cell imports ``tiktoken`` for use in the implementation.

    Args:
        text: The string to split.
        max_tokens: Upper bound on the number of tokens in a chunk.
        overlap_tokens: Number of tokens consecutive chunks should share.

    Returns:
        The chunks, as strings, in document order (once implemented).
    """
    ...  # TODO: your implementation goes here

# Sample input: the same short sentence repeated 100 times.
text = "This is a test. " * 100
chunks = chunk_by_tokens(text, max_tokens=50, overlap_tokens=10)
# NOTE: while chunk_by_tokens is still an unimplemented stub it returns None,
# so len() below raises TypeError until the drill is completed.
print(f"Chunks: {len(chunks)}")

---
## Drill 5: Metadata Extraction 🟢
**Task:** Extract file metadata

In [None]:
from pathlib import Path
from datetime import datetime

def extract_file_metadata(path: str) -> dict:
    """Collect basic metadata about the file at ``path``.

    Exercise stub: not implemented yet, so the function currently returns
    ``None``. This cell imports ``Path`` and ``datetime`` for use in the
    implementation.

    Args:
        path: Location of the file to inspect.

    Returns:
        A dict that should include the filename, extension, size and
        modified date (once implemented).
    """
    ...  # TODO: your implementation goes here

# Test
# meta = extract_file_metadata("any_file.txt")
# print(meta)
# Should include: filename, extension, size, modified date

---
## Drill 6: HTML to Text 🟡
**Task:** Clean HTML content

In [None]:
from bs4 import BeautifulSoup

def html_to_text(html: str) -> str:
    """Turn an HTML document into clean, readable plain text.

    Exercise stub: not implemented yet, so the function currently returns
    ``None``. This cell imports ``BeautifulSoup`` for use in the
    implementation.

    For the sample page in this cell the expected result is the heading
    "Hello" followed on the next line by "This is a paragraph." -- the
    script contents and the page title are not part of the output.

    Args:
        html: Markup to convert.

    Returns:
        The visible text of the document (once implemented).
    """
    ...  # TODO: your implementation goes here

# Sample page: a title, a heading, a paragraph with inline bold markup, and
# a script element whose contents must not appear in the output.
html = """
<html>
<head><title>Test</title></head>
<body>
<h1>Hello</h1>
<p>This is a <b>paragraph</b>.</p>
<script>alert('removed')</script>
</body>
</html>
"""

print(html_to_text(html))
# Should print: Hello\nThis is a paragraph.
# (Prints "None" while html_to_text is still an unimplemented stub.)

---
## Drill 7: Chunk with Metadata 🟡
**Task:** Add metadata to chunks

In [None]:
def chunk_with_metadata(
    content: str,
    source: str,
    chunk_size: int = 500,
) -> list[dict]:
    """Split ``content`` into chunks and describe each one with metadata.

    Exercise stub: not implemented yet, so the function currently returns
    ``None``.

    Args:
        content: The text to split.
        source: Identifier of where the text came from (e.g. a file name).
        chunk_size: Size, in characters, used for each chunk.

    Returns:
        One dict per chunk (once implemented). Each dict should have the
        keys ``content``, ``source``, ``chunk_index``, ``total_chunks``,
        ``char_start`` and ``char_end``.
    """
    ...  # TODO: your implementation goes here

# Each chunk should have:
# - content
# - source
# - chunk_index
# - total_chunks
# - char_start
# - char_end

# Sample run: 1,000 characters of content split with chunk_size=300.
# NOTE: while chunk_with_metadata is still an unimplemented stub it returns
# None, so this loop raises TypeError until the drill is completed.
chunks = chunk_with_metadata("A" * 1000, "test.txt", 300)
for c in chunks:
    print(c)

---
## Drill 8: Clean Text 🟢
**Task:** Clean text for embedding

In [None]:
import re

def clean_for_embedding(text: str) -> str:
    """Normalize ``text`` so it is ready to be embedded.

    Exercise stub: not implemented yet, so the function currently returns
    ``None``. This cell imports ``re`` for use in the implementation.

    The sample in this cell expects leading/trailing whitespace to be
    stripped and every run of whitespace (including newlines) to collapse
    to one space, while ordinary punctuation such as "!!!" is kept:
    ``'Hello World!!! Test'``.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text (once implemented).
    """
    ...  # TODO: your implementation goes here

# Sample input: padded with spaces, with repeated spaces and blank lines
# between the words.
dirty = "  Hello    World!!!   \n\n\n  Test  "
clean = clean_for_embedding(dirty)
print(f"'{clean}'")
# Should print: 'Hello World!!! Test'
# (Prints 'None' while clean_for_embedding is still an unimplemented stub.)

---
## Drill 9: Document Processor Class 🔴
**Task:** Build complete processing pipeline

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Chunk:
    """One piece of a processed document together with its metadata.

    This is the element type of the list that ``DocumentProcessor.process``
    is annotated to return.
    """

    # Text of this chunk.
    content: str
    # Information describing the chunk; the keys are not fixed by this file.
    metadata: dict

class DocumentProcessor:
    """Exercise skeleton for a load -> clean -> chunk -> metadata pipeline.

    Only the constructor is implemented. ``load`` and ``process`` are stubs
    left for you to write and currently return ``None``.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 100):
        """Remember the chunking settings.

        Args:
            chunk_size: Size used for each chunk.
            overlap: Amount consecutive chunks should share.
        """
        self.chunk_size, self.overlap = chunk_size, overlap

    def load(self, path: str) -> str:
        """Read the document at ``path`` and return its text.

        Should handle TXT, MD and PDF files (once implemented).
        """
        ...  # TODO: your implementation goes here

    def process(self, path: str) -> list[Chunk]:
        """Run the full pipeline on the document at ``path``.

        Should load the document, clean it, chunk it and attach metadata,
        returning the resulting ``Chunk`` objects (once implemented).
        """
        ...  # TODO: your implementation goes here

# Test
# processor = DocumentProcessor(chunk_size=200)
# chunks = processor.process("sample.txt")

---
## Drill 10: Batch Directory Processing 🔴
**Task:** Process all documents in a directory

In [None]:
from pathlib import Path
from typing import Iterator

def process_directory(
    dir_path: str,
    # None instead of a list literal: a list default would be created once
    # and shared (and mutable) across every call. Quoted so the "| None"
    # union is not evaluated at definition time on Python 3.9.
    extensions: "list[str] | None" = None,
    chunk_size: int = 500,
) -> Iterator[dict]:
    """Yield chunks from every matching document in a directory.

    Exercise stub: only the default-extension handling is in place; the
    directory walk and chunking are left for you to write, so the function
    currently returns ``None``.

    Args:
        dir_path: Directory containing the documents.
        extensions: File suffixes to include. When omitted (``None``) the
            defaults are ".txt", ".md" and ".pdf".
        chunk_size: Size used for each chunk.

    Returns:
        An iterator of chunk dicts (once implemented). The usage example
        in this cell reads the ``source`` and ``content`` keys of each one.
    """
    if extensions is None:
        # Build a fresh list on every call.
        extensions = [".txt", ".md", ".pdf"]
    pass  # TODO: your implementation goes here

# Usage
# for chunk in process_directory("./docs"):
#     print(f"{chunk['source']}: {chunk['content'][:50]}...")

---
## Self-Check

- [ ] Can extract text from PDF, HTML, Markdown
- [ ] Understand different chunking strategies
- [ ] Can implement chunking with overlap
- [ ] Can extract and attach metadata
- [ ] Can build end-to-end processing pipelines