In [None]:
from pathlib import Path
from pypdf import PdfReader
import logging

# Configure root logging so that records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

In [None]:
# Folder to inspect for PDFs. The path is relative, so it is resolved against
# the kernel's current working directory when made absolute below.
scope_dir = Path("scope")
print(f"Exploring: {scope_dir.absolute()}", end="\n\n")

Exploring: /workspace/tasks/financial_ner/data/scope



In [None]:
# Collect the PDFs located directly inside scope_dir (non-recursive match on
# "*.pdf"), then order them by path so the listing is deterministic.
pdf_files = list(scope_dir.glob("*.pdf"))
pdf_files.sort()

print(f"Found {len(pdf_files)} PDF files:")
for pdf in pdf_files:
    print(f"  - {pdf.name}")
print()

Found 2 PDF files:
  - a10-k20189292018.pdf
  - msft-10k_20190630.pdf



In [None]:
# Size statistics for each PDF that could be read: one dict per file with the
# keys "filename", "pages", "characters" and "avg_chars_per_page".
pdf_stats = []

for pdf_file in pdf_files:
    print(f"Processing: {pdf_file.name}")
    try:
        reader = PdfReader(pdf_file)
        # Pages are joined with "\n", so the character count includes one
        # separator between consecutive pages. `or ""` substitutes an empty
        # string whenever extract_text() returns a falsy value.
        pdf_text = "\n".join(page.extract_text() or "" for page in reader.pages)

        char_count = len(pdf_text)
        page_count = len(reader.pages)
        # Compute the average once and reuse it for both the stored record and
        # the printed summary, so a zero-page PDF cannot trigger a
        # ZeroDivisionError after its stats have already been recorded.
        avg_chars_per_page = char_count // page_count if page_count > 0 else 0

        pdf_stats.append({
            "filename": pdf_file.name,
            "pages": page_count,
            "characters": char_count,
            "avg_chars_per_page": avg_chars_per_page,
        })

        print(f"  Pages: {page_count:,}")
        print(f"  Characters: {char_count:,}")
        print(f"  Avg chars/page: {avg_chars_per_page:,}")
        print()

    except Exception as e:
        # Best-effort exploration: report the failure and continue with the
        # next file instead of aborting the whole loop.
        print(f"  Error: {e}")
        print()

Processing: a10-k20189292018.pdf
  Pages: 232
  Characters: 563,630
  Avg chars/page: 2,429

Processing: msft-10k_20190630.pdf
  Pages: 146
  Characters: 475,232
  Avg chars/page: 3,255

