# PDF Processing Pipeline

This notebook demonstrates the PDF processing workflow:
1. Extract text from PDFs
2. Chunk the text
3. Summarize chunks with Ollama
4. Save processed chunks


In [None]:
# Setup: Add Chicago directory to path
import sys
from pathlib import Path

# Resolve the project root as an absolute path. `Path().parent` is a purely
# lexical operation on "." and yields "." again, so it never moved up a level.
# Instead, start at the working directory and walk upwards until a folder
# containing "Chicago" is found; fall back to the working directory so a
# notebook launched from the project root behaves as before.
start_dir = Path.cwd()
project_root = next(
    (
        folder
        for folder in (start_dir, *start_dir.parents)
        if (folder / "Chicago").is_dir()
    ),
    start_dir,
)
chicago_dir = project_root / "Chicago"

# Re-running this cell must not stack duplicate entries on sys.path.
chicago_entry = str(chicago_dir)
if chicago_entry not in sys.path:
    sys.path.insert(0, chicago_entry)

print(f"Project root: {project_root}")
print(f"Chicago directory: {chicago_dir}")


In [None]:
# Import PDF pipeline functions
from pdf_pipeline import process_pdf, extract_pdf_text, chunk_text, summarize_with_ollama


In [None]:
# Locate the input PDF under Chicago/Data/Raw
pdf_path = project_root.joinpath(
    "Chicago", "Data", "Raw", "Chicago_Timeline_Honorary_Chicago.pdf"
)

if not pdf_path.exists():
    # Missing file: report it, then list whichever PDFs the raw folder holds
    # so the expected filename can be corrected.
    print(f"PDF not found: {pdf_path}")
    print("Available PDFs:")
    raw_dir = pdf_path.parent
    if raw_dir.exists():
        for candidate in raw_dir.glob("*.pdf"):
            print(f"  - {candidate.name}")
else:
    print(f"Found PDF: {pdf_path}")


In [None]:
# Run the pipeline on the PDF; nothing happens when the file is absent.
# NOTE(review): save_chunks=True presumably persists the chunks (step 4 of
# the workflow) — confirm against pdf_pipeline.process_pdf.
if pdf_path.exists():
    chunks = process_pdf(pdf_path, save_chunks=True)
    print()  # blank separator line before the summary count
    print(f"Processed {len(chunks)} chunks")


In [None]:
# Display the first processed chunk: its id, position, summary, and the
# first 200 characters of its text. Skipped when the processing cell has not
# produced a non-empty `chunks` value.
if 'chunks' in locals() and chunks:
    first_chunk = chunks[0]
    text_preview = first_chunk['text'][:200]
    print("Sample chunk:")
    print(f"ID: {first_chunk['id']}")
    print(f"Position: {first_chunk['chunk_position']}")
    print(f"\nSummary:\n{first_chunk['summary']}")
    print(f"\nText preview (first 200 chars):\n{text_preview}...")
