In [1]:
import os
import sys
import json
import pathlib
from glob import glob
import numpy as np
import pandas as pd
from collections import defaultdict
import time

# Image processing and SSIM evaluation
from PIL import Image
import cv2
from skimage import io
from skimage.util import img_as_ubyte
from skimage.color import gray2rgb, rgb2gray, rgba2rgb
from skimage.transform import resize
from skimage.metrics import structural_similarity as ssim

# Visualization
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
from tqdm.notebook import tqdm
from tqdm import tqdm as tqdm_base

# Make the project-local PDFExtractor importable.
# The checkout location defaults to the original workstation path but can be
# overridden with the PDF_IMAGE_EXTRACTION_SRC environment variable, so the
# notebook can run on another machine without editing this cell.
PDF_IMAGE_EXTRACTION_SRC = os.environ.get(
    'PDF_IMAGE_EXTRACTION_SRC',
    '/media/jcardenuto/Windows/Users/phill/work/2025-elis-system/system_modules/pdf-image-extraction',
)
# Drop any earlier copy first so re-running the cell keeps a single entry,
# still at the front of sys.path.
if PDF_IMAGE_EXTRACTION_SRC in sys.path:
    sys.path.remove(PDF_IMAGE_EXTRACTION_SRC)
sys.path.insert(0, PDF_IMAGE_EXTRACTION_SRC)
from src.pdf_image_extraction.core.extractor import PDFExtractor

print("✓ All libraries imported successfully")

✓ All libraries imported successfully


In [2]:
# Configuration
# All dataset paths are relative to the notebook's working directory.
BASE_PATH = '../pdf-content-extraction-data/'
REFERENCE_PATH = os.path.join(BASE_PATH, 'pdf-content-extraction-data/figures/')
EXTRACTED_PATH = 'xfigs/'  # Output directory for new version
ANNOTATION_FILE = os.path.join(BASE_PATH, 'pdf-content-extraction-annotation.json')

# SSIM evaluation parameters
# NOTE(review): these constants are not used in the cells visible here; they are
# presumably consumed by the SSIM comparison further down — confirm there.
MATCH_THRESH = 0.55
USE_ORIGINAL_SIZE = False
RESIZE = (512, 512)

# Create output directories
os.makedirs(EXTRACTED_PATH, exist_ok=True)
os.makedirs(os.path.join(EXTRACTED_PATH, 'results'), exist_ok=True)

# Load annotations
# Read the JSON explicitly as UTF-8 instead of relying on the platform's
# default text encoding, which differs between operating systems.
with open(ANNOTATION_FILE, encoding='utf-8') as file:
    annotations = json.load(file)

print(f"✓ Loaded {len(annotations)} PDF annotations")
print(f"✓ Reference images path: {REFERENCE_PATH}")
print(f"✓ New extraction output path: {EXTRACTED_PATH}")

# Get list of PDFs
# Sorted so the PDFs are always processed in the same order.
base_path_full = os.path.join(BASE_PATH, 'pdf-content-extraction-data')
pdfs_dir = os.path.join(base_path_full, 'pdfs')
pdf_files = sorted(glob(os.path.join(pdfs_dir, '*.pdf')))
print(f"✓ Found {len(pdf_files)} PDF files to process")

✓ Loaded 285 PDF annotations
✓ Reference images path: ../pdf-content-extraction-data/pdf-content-extraction-data/figures/
✓ New extraction output path: xfigs/
✓ Found 285 PDF files to process


In [3]:
# Index the reference figures stored under REFERENCE_PATH.
# Each entry matching 'PDF-*' becomes one key (its base name); the value is the
# sorted list of paths inside it that match '*.*'.
reference_pdfs = sorted(glob(os.path.join(REFERENCE_PATH, 'PDF-*')))
reference_images = {}

for pdf_path in reference_pdfs:
    pdf_id = os.path.basename(pdf_path)
    figure_pattern = os.path.join(pdf_path, '*.*')
    reference_images[pdf_id] = sorted(glob(figure_pattern))

total_ref_images = sum(map(len, reference_images.values()))

print(f"✓ Loaded reference images from {len(reference_pdfs)} PDFs")
print(f"✓ Total reference images: {total_ref_images}")

✓ Loaded reference images from 285 PDFs
✓ Total reference images: 1876


In [10]:
def extract_pdf_images(pdf_path, output_dir, mode='normal'):
    """
    Extract images from a PDF using the new PDFExtractor.

    Images are written to ``<output_dir>/<pdf_id>/``, where ``pdf_id`` is the
    PDF file name without its extension. Extraction failures are not raised:
    they are recorded in the returned metadata and reported with a short
    warning on stdout.

    Args:
        pdf_path: Path to PDF file
        output_dir: Directory to save extracted images (a per-PDF
            sub-directory is created inside it)
        mode: Extraction mode ('safe', 'normal', 'unsafe'). Any other value
            is reported as an error instead of silently running normal mode.

    Returns:
        Dictionary with extraction metadata:
            pdf_id, pdf_path, mode: echo of the inputs
            images_extracted: number of ``*.png`` files found in the PDF's
                output directory after extraction (0 on failure)
            extraction_time: seconds spent constructing and running the
                extractor (0 on failure)
            error: None on success, otherwise the exception message
    """
    # Supported modes mapped to the PDFExtractor method that implements each.
    mode_methods = {
        'safe': 'safe_mode',
        'normal': 'normal_mode',
        'unsafe': 'unsafe_mode',
    }

    pdf_id = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_dir = os.path.join(output_dir, pdf_id)
    os.makedirs(pdf_output_dir, exist_ok=True)

    metadata = {
        'pdf_id': pdf_id,
        'pdf_path': pdf_path,
        'mode': mode,
        'images_extracted': 0,
        'extraction_time': 0,
        'error': None
    }

    try:
        # Reject typos such as 'Safe' up front: falling through to normal mode
        # would leave metadata['mode'] describing a mode that was never run.
        if mode not in mode_methods:
            raise ValueError(
                f"Unknown extraction mode {mode!r}; expected 'safe', 'normal' or 'unsafe'"
            )

        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        extractor = PDFExtractor(pdf_path)
        run_extraction = getattr(extractor, mode_methods[mode])
        run_extraction(pdf=pdf_path, dir_path=pdf_output_dir)
        metadata['extraction_time'] = time.perf_counter() - start_time

        # Count extracted images
        # NOTE(review): this counts every PNG present in the directory, including
        # files left by earlier runs, and ignores other formats — assumes the
        # extractor only writes PNG; confirm.
        extracted_files = glob(os.path.join(pdf_output_dir, '*.png'))
        metadata['images_extracted'] = len(extracted_files)

    except Exception as e:
        # Best effort: keep the batch going and hand the failure to the caller.
        metadata['error'] = str(e)
        print(f"  ⚠ Error extracting {pdf_id}: {str(e)[:100]}")

    return metadata

# Extract images from all PDFs using the new version
extraction_results = []

print("Extracting images using new PyMuPDF 1.26.6 version (normal mode)...")
print("=" * 70)

# Runs over every PDF in pdf_files (the full set, not a sample).
for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    result = extract_pdf_images(pdf_path, EXTRACTED_PATH, mode='normal')
    extraction_results.append(result)

    if result['error']:
        print(f"  ✗ {result['pdf_id']}: {result['error']}")
    else:
        print(f"  ✓ {result['pdf_id']}: {result['images_extracted']} images, {result['extraction_time']:.2f}s")

# Summary
successful = sum(1 for r in extraction_results if not r['error'])
total_extracted = sum(r['images_extracted'] for r in extraction_results)
# A run that fails before extraction finishes keeps extraction_time at 0,
# so the > 0 filter leaves those runs out of the average.
timed_runs = [r['extraction_time'] for r in extraction_results if r['extraction_time'] > 0]
# np.mean([]) emits a RuntimeWarning and returns nan; make the "nothing was
# timed" case explicit instead (no PDFs found, or every extraction failed).
avg_time = np.mean(timed_runs) if timed_runs else float('nan')
avg_time_text = f"{avg_time:.2f}s" if timed_runs else "n/a"

print("=" * 70)
print("✓ Extraction Summary:")
print(f"  - Successful: {successful}/{len(extraction_results)}")
print(f"  - Total images extracted: {total_extracted}")
print(f"  - Average extraction time: {avg_time_text}")

Extracting images using new PyMuPDF 1.26.6 version (normal mode)...


Processing PDFs:   0%|          | 0/285 [00:00<?, ?it/s]

  ✓ PDF-00001: 6 images, 0.84s
  ✓ PDF-00002: 5 images, 0.52s
  ✓ PDF-00003: 5 images, 0.66s
  ✓ PDF-00004: 11 images, 0.55s
  ✓ PDF-00005: 10 images, 0.18s
  ✓ PDF-00006: 10 images, 0.24s
  ✓ PDF-00007: 7 images, 0.26s
  ✓ PDF-00008: 2 images, 0.25s
  ✓ PDF-00009: 8 images, 0.26s
  ✓ PDF-00010: 5 images, 1.45s
  ✓ PDF-00011: 11 images, 0.39s
  ✓ PDF-00012: 12 images, 0.23s
  ✓ PDF-00013: 10 images, 0.26s
  ✓ PDF-00014: 66 images, 0.20s
  ✓ PDF-00015: 7 images, 0.18s
  ✓ PDF-00016: 70 images, 0.35s
  ✓ PDF-00017: 10 images, 1.56s
  ✓ PDF-00018: 11 images, 0.34s
  ✓ PDF-00019: 10 images, 0.45s
  ✓ PDF-00020: 25 images, 0.26s
  ✓ PDF-00021: 13 images, 0.50s
  ✓ PDF-00022: 11 images, 1.21s
  ✓ PDF-00023: 9 images, 1.42s
  ✓ PDF-00024: 7 images, 1.10s
  ✓ PDF-00025: 2 images, 0.45s
  ✓ PDF-00026: 4 images, 0.06s
  ✓ PDF-00027: 3 images, 0.19s
  ✓ PDF-00028: 4 images, 0.18s
  ✓ PDF-00029: 8 images, 0.42s
  ✓ PDF-00030: 9 images, 0.34s
  ✓ PDF-00031: 10 images, 0.66s
  ✓ PDF-00032: 5 images,