# PDF Ingestion & Processing Test

Test the PDF processing pipeline with sample research papers.

In [3]:
import logging
import os
import sys
from pathlib import Path

# Go to the project home directory.
# Walk upwards from the current directory until a folder containing ``src`` is
# found, so re-running this cell does not climb one level higher each time
# (a plain ``os.chdir(Path.cwd().parent)`` is not idempotent). If no such
# folder exists, fall back to the parent directory as before.
_start_dir = Path.cwd()
project_root = next(
    (p for p in (_start_dir, *_start_dir.parents) if (p / "src").is_dir()),
    _start_dir.parent,
)
os.chdir(project_root)

# Make ``src`` importable from the project root, without adding a duplicate
# sys.path entry on every re-run of the cell.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# These imports must come after the sys.path setup above.
from src.pdf_processing.pdf_processor import PDFProcessor
from src.pdf_processing.chunker import SmartChunker

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

ImportError: cannot import name 'open_filename' from 'pdfminer.utils' (c:\Users\sambi\anaconda3\envs\chat_env\Lib\site-packages\pdfminer\utils.py)

## Step 1: Download Sample Paper

For testing, we'll use a sample arXiv paper.

In [None]:
# Create the sample data directory (no error if it already exists)
from pathlib import Path  # repeated here so the cell can run on its own

sample_dir = Path("./data/sample_papers")
sample_dir.mkdir(parents=True, exist_ok=True)

# To set up a test PDF:
# 1. Download a paper from arXiv, e.g. https://arxiv.org/pdf/2303.08774.pdf (GPT-4)
# 2. Place the file in ./data/sample_papers/

print(f"Sample papers directory: {sample_dir}")
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("Place PDF files here for testing")

## Step 2: Process PDF

In [None]:
# Initialize processor (image extraction disabled, table extraction enabled)
processor = PDFProcessor(extract_images=False, extract_tables=True)

# Process a test PDF
test_pdf = "./data/sample_papers/sample_paper.pdf"  # Replace with actual PDF

try:
    # Only the call that can raise FileNotFoundError lives inside the try.
    elements, metadata = processor.process_pdf(test_pdf)
except FileNotFoundError as e:
    # Reset the results so that a re-run without the PDF does not leave stale
    # values from an earlier successful run in the notebook namespace.
    elements, metadata = [], None
    print(f"Please download a sample PDF first: {e}")
else:
    print(f"✓ Processed {len(elements)} elements")
    print(f"  Metadata: {metadata}")

    # Show the first few elements: class name plus the first 100 characters of text
    print("\nFirst 5 elements:")
    for i, elem in enumerate(elements[:5]):
        print(f"{i+1}. [{elem.__class__.__name__}] {elem.text[:100]}...")

## Step 3: Extract Structured Content

In [None]:
# Pull the structured view of the test PDF from the processor
structured = processor.extract_structured(test_pdf)

# Report how many entries each category holds, in a fixed display order
category_labels = (
    ("titles", "Titles"),
    ("headings", "Headings"),
    ("tables", "Tables"),
    ("narrative", "Narrative sections"),
)
for category, label in category_labels:
    print(f"{label}: {len(structured[category])}")

# Preview the first title and heading, falling back to "N/A" when there is none
first_title = structured["titles"][0] if structured["titles"] else "N/A"
print("\nFirst title:", first_title)
first_heading = structured["headings"][0] if structured["headings"] else "N/A"
print("First heading:", first_heading)

## Step 4: Chunk Content

In [None]:
# Initialize chunker
chunker = SmartChunker(chunk_size=512, overlap=128)

# Chunk the elements produced by the PDF processing step
chunks = chunker.chunk_elements(elements, "sample_paper")

num_chunks = len(chunks)
print(f"✓ Created {num_chunks} chunks\n")

if num_chunks:
    # Display chunk statistics (sizes measured in whitespace-separated words)
    chunk_sizes = [len(chunk.text.split()) for chunk in chunks]
    avg_size = sum(chunk_sizes) / len(chunk_sizes)
    print(f"Chunk sizes (words): min={min(chunk_sizes)}, max={max(chunk_sizes)}, avg={avg_size:.0f}")

    # Show first chunk
    first_chunk = chunks[0]
    print("\nFirst chunk:")
    print(f"  Text: {first_chunk.text[:200]}...")
    print(f"  Page: {first_chunk.page_number}")
    print(f"  Section: {first_chunk.section}")
    print(f"  Type: {first_chunk.element_type}")
else:
    # Without this guard, min()/max() raise ValueError, the average divides by
    # zero, and chunks[0] raises IndexError when nothing was chunked.
    print("No chunks to summarize; check that the PDF was processed successfully.")