# Phase 2: Document Processing Pipeline Test

This notebook tests:
1. PDFParser - extract text from PDFs
2. DocumentChunker - split text into token-aligned chunks
3. DocumentValidator - validate PDFs against constraints

**Prerequisites**: Run `00_setup.ipynb` first to install dependencies.

In [None]:
# Import dependencies
import sys
import os
from pathlib import Path

# Add src to path (if not already cloned)
if 'ttt-playground' not in os.getcwd():
    # Assume we're in Colab, clone repo
    !git clone https://github.com/sivaratrisrinivas/ttt-playground.git
    %cd ttt-playground

sys.path.insert(0, str(Path.cwd()))

from src.document.pdf_parser import PDFParser, PDFExtractionError
from src.document.chunker import DocumentChunker
from src.document.validator import DocumentValidator
from src.config import DocumentConstraints, DocumentChunk
from transformers import AutoTokenizer
import fitz  # PyMuPDF

## Step 1: Generate Test PDFs

Generate the fixture files used by the tests below:
- `test_short.pdf` (3 pages, ~1,550 characters of text per page)
- `test_medium.pdf` (20 pages, ~5,000 characters of text per page)
- `test_corrupt.pdf` (not a valid PDF; used for error-handling tests)

In [None]:
# Helper that writes a simple multi-page PDF fixture to disk
def create_test_pdf(filename: str, num_pages: int, text_per_page: str) -> None:
    """Create a test PDF with the given number of pages and the same text on each.

    Every page gets a "Page N" label at (50, 50) and `text_per_page` at (50, 100).

    Args:
        filename: Path the PDF is saved to.
        num_pages: Number of pages to add to the document.
        text_per_page: Text inserted on every page, passed to `insert_text` as-is
            (no wrapping is done here).

    Raises:
        Whatever PyMuPDF raises while building or saving the document; the
        document handle is closed before the exception propagates.
    """
    doc = fitz.open()
    try:
        for page_index in range(num_pages):
            page = doc.new_page()
            # Page label first, then the body text below it
            page.insert_text((50, 50), f"Page {page_index + 1}")
            page.insert_text((50, 100), text_per_page)
        doc.save(filename)
    finally:
        # Always release the document, even when page creation or saving fails
        doc.close()
    print(f"Created {filename} ({num_pages} pages)")

# Build the fixture files that the later cells read from disk

# Short fixture: 3 pages, sentence repeated 50 times per page (~1500 chars)
text_short = 50 * "This is a short test document. "
create_test_pdf(filename="test_short.pdf", num_pages=3, text_per_page=text_short)

# Medium fixture: 20 pages, sentence repeated 100 times per page (~5000 chars per page)
text_medium = 100 * "This is a medium test document with more content. "
create_test_pdf(filename="test_medium.pdf", num_pages=20, text_per_page=text_medium)

# Corrupt fixture: a few arbitrary bytes that are not a PDF
Path("test_corrupt.pdf").write_bytes(b"not a valid pdf file")

print("\n✓ Test PDFs created")

## Step 2: Test PDFParser (Steps 2.2-2.3)

In [None]:
# Exercise PDFParser.parse() on a well-formed file and on a broken one
parser = PDFParser()

# Happy path: the generated short PDF must yield pages and text
pdf_bytes = Path("test_short.pdf").read_bytes()
text, page_count = parser.parse(pdf_bytes)

print("✓ Parsed test_short.pdf:")
print(f"  - Pages: {page_count}")
print(f"  - Text length: {len(text)} chars")
print(f"  - Text preview: {text[:100]}...")
assert page_count > 0, "Page count should be > 0"
assert len(text) > 0, "Text length should be > 0"

# Failure path: bytes that are not a PDF must raise PDFExtractionError
corrupt_bytes = Path("test_corrupt.pdf").read_bytes()
try:
    parser.parse(corrupt_bytes)
except PDFExtractionError as err:
    print(f"\n✓ Error handling works: {type(err).__name__}")
    print(f"  Error message: {str(err)[:100]}...")
else:
    # Reached only when parse() returned normally for the corrupt input
    assert False, "Should have raised PDFExtractionError"

## Step 3: Test DocumentChunker (Steps 2.4-2.6)

In [None]:
# Chunk size under test, in tokens; shared by the chunker and every assertion below
CHUNK_SIZE = 2048

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
chunker = DocumentChunker(tokenizer, chunk_size=CHUNK_SIZE)

print(f"✓ Chunker initialized with chunk_size={chunker.chunk_size}")

# Test chunking short text (single chunk)
short_text = "This is a short text that should fit in one chunk. " * 10
chunks_short = chunker.chunk(short_text)
assert len(chunks_short) == 1, (
    f"Short text should produce exactly one chunk, got {len(chunks_short)}"
)
print("\n✓ Short text chunking:")
print(f"  - Chunks: {len(chunks_short)}")
print(f"  - First chunk tokens: {chunks_short[0].token_count}")

# Test chunking large text (multiple chunks)
# Create ~5000 token text
large_text = "word " * 5000
chunks_large = chunker.chunk(large_text)
assert len(chunks_large) > 1, (
    f"Large text should be split into several chunks, got {len(chunks_large)}"
)
print("\n✓ Large text chunking (~5000 tokens):")
print(f"  - Chunks: {len(chunks_large)}")
for i, chunk in enumerate(chunks_large):
    print(f"  - Chunk {i}: {chunk.token_count} tokens")
    assert chunk.token_count <= CHUNK_SIZE, f"Chunk {i} exceeds chunk_size"

# Test token preservation: concatenating the chunks' token ids in order must
# reproduce the tokenizer's encoding of the full text (without special tokens)
original_token_ids = tokenizer.encode(large_text, add_special_tokens=False)
reconstructed_token_ids = [
    token_id for chunk in chunks_large for token_id in chunk.token_ids
]

assert reconstructed_token_ids == original_token_ids, "Tokens must be preserved!"
print(f"\n✓ Token preservation verified: {len(original_token_ids)} tokens preserved")

## Step 4: Test DocumentValidator (Step 2.7)

In [None]:
# Exercise DocumentValidator.validate() on a valid PDF, a constraint violation,
# and a corrupt file. validate() is used here as returning (is_valid, error_msg).
validator = DocumentValidator()
default_constraints = DocumentConstraints()

# Test valid PDF
with open("test_short.pdf", "rb") as f:
    pdf_bytes = f.read()

is_valid, error_msg = validator.validate(pdf_bytes, default_constraints)
print("✓ test_short.pdf validation:")
print(f"  - Valid: {is_valid}")
if error_msg:
    print(f"  - Error: {error_msg}")
# Without this check the cell would report success even if the valid PDF was rejected.
# NOTE(review): assumes the default DocumentConstraints accept this 3-page fixture — confirm.
assert is_valid, f"test_short.pdf should pass default constraints, got: {error_msg}"

# Test constraint violation (exceeds max_pages)
constraints_strict = DocumentConstraints(max_pages=2)  # test_short.pdf has 3 pages
is_valid, error_msg = validator.validate(pdf_bytes, constraints_strict)
print("\n✓ Constraint violation test (max_pages=2):")
print(f"  - Valid: {is_valid}")
print(f"  - Error: {error_msg}")
assert not is_valid, "Should fail validation"
# Check for a message first so a missing one fails as an assertion, not an AttributeError
assert error_msg, "A failed validation should report an error message"
assert "page" in error_msg.lower() or "exceed" in error_msg.lower(), (
    f"Unexpected error message: {error_msg!r}"
)

# Test invalid PDF
with open("test_corrupt.pdf", "rb") as f:
    corrupt_bytes = f.read()

is_valid, error_msg = validator.validate(corrupt_bytes, default_constraints)
print("\n✓ Invalid PDF test:")
print(f"  - Valid: {is_valid}")
print(f"  - Error: {error_msg}")
assert not is_valid, "Should fail validation"
assert error_msg, "A failed validation should report an error message"

print("\n✓ All Phase 2 tests passed!")