# In [10]:
#!/usr/bin/env python3
import logging
import time
import base64
import datetime
from pathlib import Path
import xml.etree.ElementTree as ET

from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)

# Code points that XML 1.0 does not allow in character data. ElementTree writes
# control characters out unescaped, which yields a file no XML parser accepts,
# so they are dropped before any text is stored in the tree.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    [c for c in range(0x20) if c not in (0x09, 0x0A, 0x0D)]
    + list(range(0xD800, 0xE000))
    + [0xFFFE, 0xFFFF]
)


def _xml_text(value):
    """Return *value* as a string that is safe to store as XML character data."""
    return str(value).translate(_XML_ILLEGAL_CHARS)


def _xml_tag_name(key):
    """Turn an arbitrary metadata key into a well-formed XML element name."""
    cleaned = "".join(
        ch if ch.isascii() and (ch.isalnum() or ch in "_-.") else "_"
        for ch in str(key)
    )
    # Element names must start with a letter or underscore.
    if not cleaned or not (cleaned[0].isalpha() or cleaned[0] == "_"):
        cleaned = "_" + cleaned
    return cleaned


def _append_page(pages_el, number, page):
    """Append one <page> element describing *page* to *pages_el*."""
    page_el = ET.SubElement(pages_el, "page", attrib={"number": str(number)})

    # Page-level metadata; a page whose size is missing or None is skipped
    # instead of raising AttributeError.
    size = getattr(page, "size", None)
    if size is not None:
        page_meta = ET.SubElement(page_el, "metadata")
        ET.SubElement(page_meta, "width_in_points").text = str(size.width)
        ET.SubElement(page_meta, "height_in_points").text = str(size.height)
        scale = getattr(page, "_default_image_scale", None)
        if scale is not None:
            # 72 points per inch: scale factor -> dots per inch.
            ET.SubElement(page_meta, "dpi").text = str(scale * 72)

    # Page text content
    text = getattr(page, "text", None)
    if text:
        ET.SubElement(page_el, "text").text = _xml_text(text)

    # Page image (if generated)
    image = getattr(page, "image", None)
    if image:
        # NOTE(review): docling-core appears to wrap page images in a reference
        # object exposing the PIL image as `pil_image` -- confirm against the
        # installed version. Plain image objects are used as-is.
        image = getattr(image, "pil_image", image)
    if image:
        image_el = ET.SubElement(page_el, "image")
        ET.SubElement(image_el, "width").text = str(image.width)
        ET.SubElement(image_el, "height").text = str(image.height)
        # For a PIL image, tobytes() is raw pixel data (not PNG/JPEG), so the
        # pixel mode is recorded when available to make the payload decodable.
        mode = getattr(image, "mode", None)
        if mode:
            ET.SubElement(image_el, "mode").text = str(mode)
        # Base64 keeps the binary payload safe inside the XML text node.
        encoded_image = base64.b64encode(image.tobytes()).decode("ascii")
        ET.SubElement(image_el, "data").text = encoded_image

    # Optional: tables attached to the page
    tables = getattr(page, "tables", None)
    if tables:
        tables_el = ET.SubElement(page_el, "tables")
        for table_id, table in enumerate(tables, start=1):
            table_el = ET.SubElement(tables_el, "table", attrib={"id": str(table_id)})
            caption = getattr(table, "caption", None)
            if caption:
                ET.SubElement(table_el, "caption").text = _xml_text(caption)
            data = getattr(table, "data", None)
            if data:
                ET.SubElement(table_el, "data").text = _xml_text(data)

    # Optional: layout segments or other markers attached to the page
    segments = getattr(page, "segments", None)
    if segments:
        segments_el = ET.SubElement(page_el, "segments")
        for segment in segments:
            ET.SubElement(segments_el, "segment").text = _xml_text(segment)

    return page_el


def build_xml_from_document(document):
    """
    Build an XML tree representing the parsed PDF document.

    The tree has the shape::

        <document>
          <metadata>   one child per metadata entry, plus <converted_at>
          <pages>      one <page number="N"> per page (1-based), each with
                       optional <metadata>, <text>, <image>, <tables>, <segments>
          <content>    overall document text, only if the document exposes
                       a ``content`` or ``text`` attribute

    Every attribute is optional and read defensively; whatever the document
    or page object does not provide is simply left out of the output.

    Metadata keys that are not valid XML element names are sanitized (invalid
    characters become ``_``) and the original key is kept in a ``name``
    attribute. Characters that XML 1.0 forbids are stripped from all text so
    the serialized file is always parseable.

    Args:
        document: Parsed document object. ``pages`` may be a sequence or a
            mapping (its values are used, in iteration order).

    Returns:
        xml.etree.ElementTree.Element: the ``<document>`` root element.
    """
    # Create the root element
    root = ET.Element("document")

    # Add metadata
    metadata_el = ET.SubElement(root, "metadata")
    metadata = getattr(document, "metadata", None)
    if metadata:
        for key, value in metadata.items():
            tag = _xml_tag_name(key)
            meta_item = ET.SubElement(metadata_el, tag)
            if tag != str(key):
                # Preserve the original key when it had to be altered.
                meta_item.set("name", _xml_text(key))
            meta_item.text = _xml_text(value)
    # Record conversion timestamp as local time with an explicit UTC offset so
    # the value is unambiguous when the file is read elsewhere.
    converted_at = datetime.datetime.now().astimezone().isoformat()
    ET.SubElement(metadata_el, "converted_at").text = converted_at

    # Handle pages (use the values when document.pages is a mapping)
    pages = getattr(document, "pages", None) or []
    if callable(getattr(pages, "values", None)):
        pages = pages.values()
    pages_el = ET.SubElement(root, "pages")
    for number, page in enumerate(pages, start=1):
        _append_page(pages_el, number, page)

    # Add overall document content if available
    if hasattr(document, "content") or hasattr(document, "text"):
        content = getattr(document, "content", None) or getattr(document, "text", None)
        content_el = ET.SubElement(root, "content")
        if content is not None:
            # str() conversion: a non-string value would otherwise only fail
            # later, at serialization time.
            content_el.text = _xml_text(content)

    return root

def main(input_pdf_path=None, output_xml_path=None):
    """
    Convert a PDF with Docling and write the result as an XML file.

    Args:
        input_pdf_path: Path of the PDF to convert. Defaults to
            ``./applsci-3508831-peer-review-v1.pdf``.
        output_xml_path: Path of the XML file to write. Defaults to the input
            file's name with an ``.xml`` suffix, in the current directory.

    Raises:
        FileNotFoundError: if the input PDF does not exist. This is checked
            before the converter is created so a bad path fails immediately.
    """
    logging.basicConfig(level=logging.INFO)

    # Resolve the input PDF and output XML file paths.
    if input_pdf_path is None:
        input_pdf_path = "./applsci-3508831-peer-review-v1.pdf"
    input_pdf_path = Path(input_pdf_path)
    if output_xml_path is None:
        output_xml_path = Path(input_pdf_path.name).with_suffix(".xml")
    else:
        output_xml_path = Path(output_xml_path)

    if not input_pdf_path.is_file():
        raise FileNotFoundError(f"Input PDF not found: {input_pdf_path}")

    # Create a DocumentConverter instance (ensure Docling is installed and configured)
    doc_converter = DocumentConverter()

    _log.info("Starting conversion of %s", input_pdf_path)
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Convert the PDF into a document object
    conv_res = doc_converter.convert(input_pdf_path)
    document = conv_res.document

    # Build the XML tree from the document
    xml_root = build_xml_from_document(document)
    tree = ET.ElementTree(xml_root)

    # Save the XML to file with XML declaration and UTF-8 encoding
    tree.write(output_xml_path, encoding="utf-8", xml_declaration=True)

    elapsed = time.perf_counter() - start_time
    _log.info("PDF conversion and XML export completed in %.2f seconds.", elapsed)
    _log.info("XML file saved at: %s", output_xml_path)


if __name__ == "__main__":
    main()

# INFO:__main__:Starting conversion of applsci-3508831-peer-review-v1.pdf
# INFO:docling.document_converter:Going to convert document batch...
# INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
# INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
# INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
# INFO:docling.pipeline.base_pipeline:Processing document applsci-3508831-peer-review-v1.pdf
# INFO:docling.document_converter:Finished converting document applsci-3508831-peer-review-v1.pdf in 46.89 sec.
# INFO:__main__:PDF conversion and XML export completed in 46.90 seconds.
# INFO:__main__:XML file saved at: applsci-3508831-peer-review-v1.xml
