In [1]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.accelerator_options import AcceleratorOptions, AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.datamodel.settings import settings
from docling_core.types.doc import DocItemLabel
from pathlib import Path
import time

def setup_debug_config(confidence_config):
    """Setup confidence filtering and debug output directory"""
    # Store original thresholds for restoration later
    original_thresholds = LayoutPostprocessor.CONFIDENCE_THRESHOLDS.copy()
    
    # Create custom thresholds - set impossible values for unwanted labels
    custom_thresholds = original_thresholds.copy()
    
    # Set impossible thresholds (>1.0) for labels not in your config
    for label in DocItemLabel:
        if label not in confidence_config:
            custom_thresholds[label] = 2.0  # Impossible threshold
        else:
            custom_thresholds[label] = confidence_config[label]
    
    return {
        'original_thresholds': original_thresholds,
        'custom_thresholds': custom_thresholds,
    }

def create_normal_converter():
    """Create standard converter with default settings"""
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options.lang = ["en"]
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=32, device=AcceleratorDevice.AUTO
    )
    
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

def create_custom_converter(debug_config):
    """Create converter with custom confidence thresholds"""
    # Apply custom thresholds
    LayoutPostprocessor.CONFIDENCE_THRESHOLDS = debug_config['custom_thresholds']
    
    # Create converter with same options as normal
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options.lang = ["en"]
    pipeline_options.layout_options.create_orphan_clusters = False
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=32, device=AcceleratorDevice.AUTO
    )
    
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

def restore_original_config(debug_config):
    """Restore original confidence thresholds"""
    LayoutPostprocessor.CONFIDENCE_THRESHOLDS = debug_config['original_thresholds']


In [3]:
LayoutPostprocessor.CONFIDENCE_THRESHOLDS

{<DocItemLabel.CAPTION: 'caption'>: 0.5,
 <DocItemLabel.FOOTNOTE: 'footnote'>: 0.5,
 <DocItemLabel.FORMULA: 'formula'>: 0.5,
 <DocItemLabel.LIST_ITEM: 'list_item'>: 0.5,
 <DocItemLabel.PAGE_FOOTER: 'page_footer'>: 0.5,
 <DocItemLabel.PAGE_HEADER: 'page_header'>: 0.5,
 <DocItemLabel.PICTURE: 'picture'>: 0.5,
 <DocItemLabel.SECTION_HEADER: 'section_header'>: 0.45,
 <DocItemLabel.TABLE: 'table'>: 0.5,
 <DocItemLabel.TEXT: 'text'>: 0.5,
 <DocItemLabel.TITLE: 'title'>: 0.45,
 <DocItemLabel.CODE: 'code'>: 0.45,
 <DocItemLabel.CHECKBOX_SELECTED: 'checkbox_selected'>: 0.45,
 <DocItemLabel.CHECKBOX_UNSELECTED: 'checkbox_unselected'>: 0.45,
 <DocItemLabel.FORM: 'form'>: 0.45,
 <DocItemLabel.KEY_VALUE_REGION: 'key_value_region'>: 0.45,
 <DocItemLabel.DOCUMENT_INDEX: 'document_index'>: 0.45}

In [5]:
input_doc_path = "testdata/06.pdf"
debug_dir = Path("testdata/debug/")

In [4]:
# import time
# normal_converter = create_normal_converter()

# start_time = time.time()
# normal_result = normal_converter.convert(input_doc_path)
# end_time = time.time() - start_time
# print(f"Document converted in {end_time:.2f} seconds.")

# doc_filename = normal_result.input.file.stem
# with (debug_dir / f"{doc_filename}_normal.md").open("w", encoding="utf-8") as fp:
#     fp.write(normal_result.document.export_to_markdown())

In [6]:
confidence_config = {
    DocItemLabel.SECTION_HEADER: 0.8,
    DocItemLabel.TEXT: 0.95,
    # Only include labels you want to keep
}
debug_config = setup_debug_config(confidence_config)
custom_converter = create_custom_converter(debug_config)

start_time = time.time()
custom_result = custom_converter.convert(input_doc_path)
end_time = time.time() - start_time
print(f"Document converted in {end_time:.2f} seconds.")

doc_filename = custom_result.input.file.stem
with (debug_dir / f"{doc_filename}_custom.md").open("w", encoding="utf-8") as fp:
    fp.write(custom_result.document.export_to_markdown())

2025-09-11 15:03:37,601 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-11 15:03:37,677 - INFO - Going to convert document batch...
2025-09-11 15:03:37,678 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 3c084c1449dda55ebb0219f601cf7b5a
2025-09-11 15:03:37,711 - INFO - Loading plugin 'docling_defaults'
2025-09-11 15:03:37,715 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-09-11 15:03:37,901 - INFO - Accelerator device: 'cuda:0'
2025-09-11 15:03:40,291 - INFO - Accelerator device: 'cuda:0'
2025-09-11 15:03:41,725 - INFO - Accelerator device: 'cuda:0'
2025-09-11 15:03:42,524 - INFO - Loading plugin 'docling_defaults'
2025-09-11 15:03:42,526 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-09-11 15:03:42,527 - INFO - Processing document 06.pdf
2025-09-11 15:03:52,050 - INFO - Finished converting document 06.pdf in 14.69 sec.


Document converted in 14.69 seconds.


In [25]:
item

(SectionHeaderItem(self_ref='#/texts/68', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.SECTION_HEADER: 'section_header'>, prov=[ProvenanceItem(page_no=15, bbox=BoundingBox(l=42.52, t=657.7869995117188, r=104.791, b=646.8509995117188, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 10))], orig='References', text='References', formatting=None, hyperlink=None, level=1),
 1)

In [14]:
confidence_config = {
    DocItemLabel.SECTION_HEADER: 0.8,
    DocItemLabel.TEXT: 0.95,
    # Only include labels you want to keep
}
filtered_result = filter_document_by_confidence(normal_result, confidence_config)

In [15]:
doc_filename = filtered_result.input.file.stem
with (debug_dir / f"{doc_filename}_post.md").open("w", encoding="utf-8") as fp:
    fp.write(filtered_result.document.export_to_markdown())

In [21]:
md = filtered_result.document.export_to_markdown()

In [23]:
filtered_result.document.main_text

AttributeError: 'DoclingDocument' object has no attribute 'main_text'

In [11]:
normal_result.document?

Object `normal_result.document` not found.
