In [1]:
#!/usr/bin/env python3
import base64
import datetime
import io
import logging
import time
import xml.etree.ElementTree as ET
from pathlib import Path

# Import Docling modules
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash

# Scale factor for rendered page images. A scale of 1 corresponds to the
# standard 72 DPI, so the value 2.0 below corresponds to 144 DPI.
IMAGE_RESOLUTION_SCALE = 2.0

# Translation table that drops the C0 control characters XML 1.0 does not
# allow in character data (everything below U+0020 except tab, line feed and
# carriage return). Text extracted from PDFs can contain such characters
# (e.g. form feeds), and ElementTree would write them out verbatim, producing
# a file that XML parsers reject. Only the C0 range is handled here.
_XML_ILLEGAL_CONTROLS = {
    cp: None for cp in range(0x20) if cp not in (0x09, 0x0A, 0x0D)
}


def _xml_text(value):
    """Return *value* as XML-safe text, or None if *value* is None.

    Non-string values are converted with ``str()`` because ElementTree can
    only serialize string text. ``None`` is passed through so the element is
    written empty instead of containing the literal text "None".
    """
    if value is None:
        return None
    return str(value).translate(_XML_ILLEGAL_CONTROLS)


def _append_page_image(page_el, image):
    """Attach *image* to *page_el* as a base64-encoded PNG.

    Assumes *image* is a PIL-style image exposing ``width``, ``height`` and
    ``save()``. The image is stored as PNG rather than as the raw pixel
    buffer from ``tobytes()``: raw bytes carry no pixel-mode information, so
    a consumer cannot reliably rebuild the picture, and they are much larger.
    """
    image_el = ET.SubElement(page_el, "image")
    ET.SubElement(image_el, "width").text = str(image.width)
    ET.SubElement(image_el, "height").text = str(image.height)
    # Record the container format so consumers know how to decode <data>.
    ET.SubElement(image_el, "format").text = "png"

    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    # base64 output is pure ASCII, so it embeds safely in XML text.
    ET.SubElement(image_el, "data").text = base64.b64encode(
        buffer.getvalue()
    ).decode("ascii")


def _append_optional_items(page_el, page):
    """Add figures, tables and line numbers if *page* exposes them.

    These attributes are optional; each section is only emitted when the
    attribute exists on the page object and is non-empty.
    """
    figures = getattr(page, "figures", None)
    if figures:
        figures_el = ET.SubElement(page_el, "figures")
        for fig in figures:
            fig_el = ET.SubElement(figures_el, "figure")
            if hasattr(fig, "caption"):
                ET.SubElement(fig_el, "caption").text = _xml_text(fig.caption)
            if hasattr(fig, "bbox"):
                ET.SubElement(fig_el, "bbox").text = _xml_text(str(fig.bbox))

    tables = getattr(page, "tables", None)
    if tables:
        tables_el = ET.SubElement(page_el, "tables")
        for tbl in tables:
            tbl_el = ET.SubElement(tables_el, "table")
            if hasattr(tbl, "caption"):
                ET.SubElement(tbl_el, "caption").text = _xml_text(tbl.caption)
            if hasattr(tbl, "data"):
                ET.SubElement(tbl_el, "data").text = _xml_text(str(tbl.data))

    line_numbers = getattr(page, "line_numbers", None)
    if line_numbers:
        lines_el = ET.SubElement(page_el, "line_numbers")
        for ln in line_numbers:
            ET.SubElement(lines_el, "line").text = _xml_text(str(ln))


def build_xml_from_conversion(conv_res):
    """Build an XML tree from a Docling conversion result.

    The returned ``<document>`` element contains a ``<metadata>`` element
    (file name, document hash, conversion timestamp) and a ``<pages>``
    element with one ``<page>`` child per page yielded by
    ``generate_multimodal_pages``. Each page holds its size and DPI, the
    text, markdown and document-type exports, and, when present, cells,
    segments, the page image (base64-encoded PNG) and any figures, tables
    or line numbers exposed by the page object.

    Args:
        conv_res: Conversion result as returned by
            ``DocumentConverter.convert``.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the document tree.
    """
    root = ET.Element("document")

    # Document-level metadata
    metadata = ET.SubElement(root, "metadata")
    ET.SubElement(metadata, "file_name").text = _xml_text(conv_res.input.file.name)
    ET.SubElement(metadata, "document_hash").text = _xml_text(
        conv_res.input.document_hash
    )
    # Local time with an explicit UTC offset, so the timestamp is unambiguous.
    ET.SubElement(metadata, "converted_at").text = (
        datetime.datetime.now().astimezone().isoformat()
    )

    pages_el = ET.SubElement(root, "pages")

    # Iterate over pages using Docling's multimodal export helper.
    # Each item is: (plain text, markdown, document type, cells, segments, page object)
    for (
        content_text,
        content_md,
        content_dt,
        page_cells,
        page_segments,
        page,
    ) in generate_multimodal_pages(conv_res):
        # page.page_no is treated as zero-based here; emit 1-based numbers.
        page_el = ET.SubElement(
            pages_el, "page", attrib={"number": str(page.page_no + 1)}
        )

        # Page metadata
        page_meta = ET.SubElement(page_el, "metadata")
        ET.SubElement(page_meta, "width_in_points").text = str(page.size.width)
        ET.SubElement(page_meta, "height_in_points").text = str(page.size.height)
        # NOTE(review): relies on the private ``_default_image_scale``
        # attribute of the page object; a scale of 1 is taken to mean 72 DPI.
        dpi = page._default_image_scale * 72
        ET.SubElement(page_meta, "dpi").text = str(dpi)

        # Textual exports of the page
        ET.SubElement(page_el, "text").text = _xml_text(content_text)
        ET.SubElement(page_el, "markdown").text = _xml_text(content_md)
        ET.SubElement(page_el, "document_type").text = _xml_text(content_dt)

        # Cells, if available. Each cell is assumed to be a dictionary whose
        # keys are valid XML element names; adjust as needed.
        if page_cells:
            cells_el = ET.SubElement(page_el, "cells")
            for cell in page_cells:
                cell_el = ET.SubElement(cells_el, "cell")
                for key, value in cell.items():
                    ET.SubElement(cell_el, key).text = _xml_text(str(value))

        # Segments, if available, are stored via their string form.
        if page_segments:
            segments_el = ET.SubElement(page_el, "segments")
            for segment in page_segments:
                ET.SubElement(segments_el, "segment").text = _xml_text(
                    str(segment)
                )

        # Full-page image (if generated)
        image = getattr(page, "image", None)
        if image:
            _append_page_image(page_el, image)

        # Figures, tables and line numbers, depending on the Docling version.
        _append_optional_items(page_el, page)

    return root

def main(
    input_doc_path: Path = Path("applsci-3508831-peer-review-v1.pdf"),
    output_xml_path: "Path | None" = None,
) -> None:
    """Convert a PDF to XML with Docling and write the result to disk.

    Args:
        input_doc_path: PDF file to convert. Defaults to the sample document
            this script was written for.
        output_xml_path: Destination XML file. When omitted, the input path
            with its suffix replaced by ``.xml`` is used.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path(input_doc_path)
    if output_xml_path is None:
        # Same file name as the input, with an .xml suffix.
        output_xml_path = input_doc_path.with_suffix(".xml")
    else:
        output_xml_path = Path(output_xml_path)

    # Set up pipeline options for PDF conversion.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True  # Needed to preserve images (e.g. figures)

    # Create the DocumentConverter with PDF options.
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments during the conversion.
    start_time = time.perf_counter()
    logging.info("Starting PDF conversion...")

    # Convert the PDF document
    conv_res = doc_converter.convert(input_doc_path)

    # Build XML tree from conversion result and write it out
    xml_root = build_xml_from_conversion(conv_res)
    tree = ET.ElementTree(xml_root)
    tree.write(output_xml_path, encoding="utf-8", xml_declaration=True)

    elapsed = time.perf_counter() - start_time
    logging.info(
        "PDF converted to XML in %.2f seconds. Output saved to %s",
        elapsed,
        output_xml_path,
    )

# Run the conversion only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

  from pandas.core import (
  warn(f"Failed to load image Python extension: {e}")
INFO:root:Starting PDF conversion...
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document applsci-3508831-peer-review-v1.pdf
INFO:docling.document_converter:Finished converting document applsci-3508831-peer-review-v1.pdf in 46.81 sec.
INFO:root:PDF converted to XML in 47.47 seconds. Output saved to applsci-3508831-peer-review-v1.xml
