### References
1. https://docling-project.github.io/docling/examples/minimal/
2. https://github.com/youssefHosni/To-Data-Beyond-/tree/main/LocalOCR_Application_SmolDocling

In [1]:
import logging
import time
from pathlib import Path

In [2]:
import pandas as pd

from docling.document_converter import DocumentConverter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Module-level logger named after this module; main() uses it to report
# each saved table file and the total run time.
_log = logging.getLogger(__name__)

In [6]:
def main():
    """Convert the sample PDF with Docling and export every detected table.

    Each table in the converted document is printed to stdout as Markdown
    and written to the ``scratch`` directory (created if missing) as both
    ``<pdf stem>-table-<n>.csv`` and ``<pdf stem>-table-<n>.html``.
    The elapsed time for conversion plus export is logged at INFO level.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("Data/2308.08859v2.pdf")
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()

    # perf_counter() is monotonic, so the measured duration cannot be
    # skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    doc_filename = conv_res.input.file.stem

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe()
        # NOTE: the printed heading uses the 0-based index, while the file
        # names below are numbered from 1.
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as csv
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
        _log.info("Saving CSV table to %s", element_csv_filename)
        table_df.to_csv(element_csv_filename)

        # Save the table as html
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
        _log.info("Saving HTML table to %s", element_html_filename)
        # Explicit UTF-8: extracted tables can contain non-ASCII text, and
        # the platform default encoding may not be able to represent it.
        with element_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html())

    elapsed = time.perf_counter() - start_time

    _log.info(
        "Document converted and tables exported in %.2f seconds.", elapsed
    )

In [7]:
# Run the conversion only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 3d2abd0e021741887551c73bd132b421
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:easyocr.easyocr:Download complete
INFO:easyocr.easyocr:Download complete.
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered picture descriptions: ['vlm', 'api']
INFO:docling.pipeline.base_pipeline:Processing document 2308.08859v2.pdf
INFO:docling.document_converter:Finished converting document 2308.08859v2.pdf in 94.67 sec.
INFO:__main__:Saving CSV 

## Table 0
|    | Element pair   | η [a - 2 0 ]                      |
|---:|:---------------|:----------------------------------|
|  0 | H-H            | 0.000, 0.006, 0.016, 0.038, 0.099 |
|  1 | H-O            | 0.000, 0.007, 0.019, 0.051, 0.166 |
|  2 | H-Li           | 0.000, 0.005, 0.012, 0.025, 0.052 |
|  3 | O-O            | 0.000, 0.004, 0.008, 0.015, 0.027 |
|  4 | O-Li           | 0.000, 0.005, 0.012, 0.024, 0.051 |
