# Early Testing with Docling

In [1]:
import json
import logging
import time
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

In [17]:
type(conv_result.document)

docling_core.types.doc.document.DoclingDocument

In [21]:
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem

In [3]:
_log = logging.getLogger(__name__)

In [4]:
# Docling Parse with EasyOCR
# ----------------------
# Build the PDF pipeline options used by the default converter: OCR and
# table-structure recognition are both enabled, table cells are matched, and
# the OCR language list is restricted to English.
# NOTE(review): no OCR engine is selected explicitly here, so the "EasyOCR"
# in the heading relies on the library default — confirm.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options.lang = ["en"]
# Thread count is hard-coded to 32; device choice is left to AUTO.
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=32, device=AcceleratorDevice.AUTO
)

# Converter that applies the options above to PDF inputs only; no other
# input format is given custom options here.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

In [80]:
pipeline_options.layout_options

LayoutOptions(create_orphan_clusters=True, keep_empty_clusters=False, model_spec=LayoutModelConfig(name='docling_layout_heron', repo_id='ds4sd/docling-layout-heron', revision='main', model_path='', supported_devices=[<AcceleratorDevice.CPU: 'cpu'>, <AcceleratorDevice.CUDA: 'cuda'>, <AcceleratorDevice.MPS: 'mps'>]), skip_cell_assignment=False)

In [5]:
# Collect every regular file that sits directly inside the test-data folder,
# in sorted order. Sub-directories are skipped (no recursion), and a missing
# folder simply yields an empty list instead of raising.
data_folder = Path("/workspace/extraction/testdata/")
doc_paths = (
    sorted(path for path in data_folder.iterdir() if path.is_file())
    if data_folder.is_dir()  # is_dir() is already False for a missing path
    else []
)

## Export results
# Directory (relative to the working directory) that receives exported files;
# created up front so later writes cannot fail on a missing folder.
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)

In [74]:
# Convert the first discovered document and log how long the call took.
# doc_paths[0] raises IndexError when no documents were found.
start_time = time.time()
conv_result = doc_converter.convert(doc_paths[0])
# Despite its name, end_time holds the elapsed duration in seconds.
end_time = time.time() - start_time
_log.info(f"Document converted in {end_time:.2f} seconds.")

2025-09-11 04:26:21,060 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-11 04:26:21,763 - INFO - Going to convert document batch...
2025-09-11 04:26:21,765 - INFO - Processing document 01.pdf
2025-09-11 04:26:54,215 - INFO - Finished converting document 01.pdf in 33.93 sec.
2025-09-11 04:26:54,225 - INFO - Document converted in 33.95 seconds.


In [23]:
# Collect the distinct labels carried by the text items of the converted
# document.
original_document = conv_result.document
filtered_texts = []  # never filled in this cell
labels = []
# The loop variable `element` stays bound to the last text item after the
# loop finishes; a later cell displays it, so keep this as a plain loop.
for element in original_document.texts:
    labels.append(element.label)
    
# Deduplicate: only the set of distinct labels is of interest.
labels = set(labels)
# filtered_document = DoclingDocument

In [56]:
labels

{<DocItemLabel.CAPTION: 'caption'>,
 <DocItemLabel.FOOTNOTE: 'footnote'>,
 <DocItemLabel.LIST_ITEM: 'list_item'>,
 <DocItemLabel.PAGE_FOOTER: 'page_footer'>,
 <DocItemLabel.PAGE_HEADER: 'page_header'>,
 <DocItemLabel.SECTION_HEADER: 'section_header'>,
 <DocItemLabel.TEXT: 'text'>}

In [75]:
element

TextItem(self_ref='#/texts/961', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, label=<DocItemLabel.PAGE_FOOTER: 'page_footer'>, prov=[ProvenanceItem(page_no=31, bbox=BoundingBox(l=318.161, t=25.647022949218808, r=543.278, b=18.290022949218724, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 60))], orig='Developmental Cell 56 , 1164-1181.e1-e12, April 19, 2021 e12', text='Developmental Cell 56 , 1164-1181.e1-e12, April 19, 2021 e12', formatting=None, hyperlink=None)

In [82]:
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling_core.types.doc import DocItemLabel

# Monkey patch the confidence thresholds
def create_custom_converter(confidence_config):
    """Build a PDF converter while overriding the layout confidence thresholds.

    The override is a monkey patch: ``LayoutPostprocessor.CONFIDENCE_THRESHOLDS``
    is replaced at class level, so it affects every user of that class in the
    process until the returned restore function is called.

    Every ``DocItemLabel`` present in ``confidence_config`` gets the threshold
    given there; every other label gets ``2.0``. That value is meant to be
    unreachable, on the assumption that confidences never exceed 1.0 —
    TODO confirm against the layout model.

    Args:
        confidence_config: Mapping from ``DocItemLabel`` to the minimum
            confidence to use for that label.

    Returns:
        A ``(converter, restore_thresholds)`` tuple. ``converter`` is a
        ``DocumentConverter`` configured for PDF input (OCR, table structure
        with cell matching, English OCR language, 32 threads, AUTO device).
        ``restore_thresholds`` takes no arguments and puts back the
        thresholds that were in place when this function was called.

    Raises:
        Whatever the pipeline-option or converter construction raises. In
        that case the thresholds are restored before the exception
        propagates, because the caller never receives the restore function.
    """
    # Snapshot the thresholds in effect right now so they can be put back.
    original_thresholds = LayoutPostprocessor.CONFIDENCE_THRESHOLDS.copy()

    def restore_thresholds():
        LayoutPostprocessor.CONFIDENCE_THRESHOLDS = original_thresholds

    # Start from the existing table so keys that are not DocItemLabel members
    # (if any) are preserved, then set an explicit value for every label.
    custom_thresholds = original_thresholds.copy()
    for label in DocItemLabel:
        custom_thresholds[label] = (
            confidence_config[label] if label in confidence_config else 2.0
        )

    # Apply the patch
    LayoutPostprocessor.CONFIDENCE_THRESHOLDS = custom_thresholds

    try:
        # Create converter with these settings
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.ocr_options.lang = ["en"]
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=32, device=AcceleratorDevice.AUTO
        )

        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )
    except BaseException:
        # Do not leave the class patched when no restore handle is returned.
        restore_thresholds()
        raise

    # Return both converter and restoration function
    return converter, restore_thresholds

In [88]:
# Usage
# Thresholds per label; labels left out of this mapping are given a threshold
# of 2.0 by create_custom_converter.
confidence_config = {
    DocItemLabel.SECTION_HEADER: 0.8,
    DocItemLabel.TEXT: 0.95,
    # Only include labels you want to keep
}

# Create custom converter
# From this call until restore_fn() runs, the class-level thresholds are
# patched.
custom_converter, restore_fn = create_custom_converter(confidence_config)

try:
    # Convert with custom confidence filtering
    # This rebinds conv_result, replacing the result of any earlier conversion.
    conv_result = custom_converter.convert(doc_paths[0])
    
    # Export to markdown (should work without hierarchy errors)
    markdown_content = conv_result.document.export_to_markdown()
    
finally:
    # Always restore original thresholds
    # Runs whether or not the conversion or export raised.
    restore_fn()

2025-09-11 06:10:06,468 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-11 06:10:06,675 - INFO - Going to convert document batch...
2025-09-11 06:10:06,676 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 0e1a47abb5af401bdb77d892261e0a3d
2025-09-11 06:10:06,677 - INFO - Accelerator device: 'cuda:0'
2025-09-11 06:10:09,809 - INFO - Accelerator device: 'cuda:0'
2025-09-11 06:10:12,304 - INFO - Accelerator device: 'cuda:0'
2025-09-11 06:10:13,578 - INFO - Processing document 01.pdf
2025-09-11 06:10:49,184 - INFO - Finished converting document 01.pdf in 42.72 sec.


In [87]:
# Write the Markdown export of the most recent conversion into the debug
# folder, named after the input file's stem.
debug_dir = Path("testdata/debug/")
# Create the folder first: opening a file inside a missing directory raises
# FileNotFoundError.
debug_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem
with (debug_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_markdown())

In [86]:
output_dir / f"{doc_filename}.md"

PosixPath('scratch/01.md')

In [8]:
# Convert each discovered document in turn (with a progress bar), log the
# time each conversion took, and write a JSON and a Markdown export per
# document into output_dir, named after the input file's stem.
for doc_path in tqdm(doc_paths):
    start_time = time.time()
    conv_result = doc_converter.convert(doc_path)
    # end_time is the elapsed duration in seconds, not a timestamp.
    end_time = time.time() - start_time
    _log.info(f"Document converted in {end_time:.2f} seconds.")

    doc_filename = conv_result.input.file.stem
    json_path = output_dir / f"{doc_filename}.json"
    md_path = output_dir / f"{doc_filename}.md"

    # Deep Search document JSON export.
    with json_path.open("w", encoding="utf-8") as fp:
        json.dump(conv_result.document.export_to_dict(), fp)

    # Markdown export.
    with md_path.open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_markdown())

  0%|          | 0/15 [00:00<?, ?it/s]

2025-09-09 10:31:54,103 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-09 10:31:54,139 - INFO - Going to convert document batch...
2025-09-09 10:31:54,140 - INFO - Processing document 01.pdf
2025-09-09 10:32:17,571 - INFO - Finished converting document 01.pdf in 23.47 sec.
2025-09-09 10:32:17,597 - INFO - Document converted in 23.49 seconds.
2025-09-09 10:32:17,744 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-09 10:32:17,770 - INFO - Going to convert document batch...
2025-09-09 10:32:17,772 - INFO - Processing document 02.pdf
2025-09-09 10:32:41,047 - INFO - Finished converting document 02.pdf in 23.30 sec.
2025-09-09 10:32:41,062 - INFO - Document converted in 23.32 seconds.
2025-09-09 10:32:41,198 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-09 10:32:41,225 - INFO - Going to convert document batch...
2025-09-09 10:32:41,226 - INFO - Processing document 03.pdf
2025-09-09 10:33:08,012 - INFO - Finished converting document 03.pdf in 26.8

In [12]:
## Export results
# Export whatever conversion result `conv_result` currently refers to. Only
# the JSON export is active; the other formats are kept below as commented-out
# toggles.
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
# Output files are named after the input file's stem.
doc_filename = conv_result.input.file.stem

# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
    fp.write(json.dumps(conv_result.document.export_to_dict()))

# Export Text format:
# with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
#     fp.write(conv_result.document.export_to_text())

# Export Markdown format:
# with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
#     fp.write(conv_result.document.export_to_markdown())

# Export Document Tags format:
# with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
#     fp.write(conv_result.document.export_to_document_tokens())

# 12-09-2025

In [58]:
from pathlib import Path
import re
def extract_results(text):
    """Split Markdown text into the part before the results and the results.

    The text is cut at the first ``## Result`` / ``## Results`` heading
    (matched case-insensitively; the heading must be preceded by a newline
    and occupy the whole line). Everything before it is the context —
    intended to be the abstract plus introduction. The results run from just
    after that heading to the first ``## Discussion`` heading, or to the end
    of the text when there is no such heading.

    Args:
        text: Markdown text of one document.

    Returns:
        A ``(context, results)`` tuple of strings. Neither string contains
        the heading lines used as separators.

    Raises:
        ValueError: If the text contains no results heading.
    """
    SEP1 = r"\n## results?\n"
    SEP2 = r"\n## discussion\n"

    # maxsplit=1 so a second "## Results" heading further down stays inside
    # the results text instead of breaking the two-way unpacking.
    head_and_rest = re.split(SEP1, text, maxsplit=1, flags=re.IGNORECASE)
    if len(head_and_rest) != 2:
        raise ValueError("no '## Results' heading found in text")
    context, results = head_and_rest

    # Keep only what precedes the discussion; with no discussion heading the
    # split returns the results unchanged.
    results = re.split(SEP2, results, maxsplit=1, flags=re.IGNORECASE)[0]
    return context, results

def extract_subsections(text):
    """Break Markdown text into chunks, one per level-2 heading.

    A chunk boundary is a line of the form ``## <title>`` that is preceded by
    a newline, except when the title starts with "Figure" (compared
    case-insensitively): such headings stay inside the chunk they occur in.
    Each chunk consists of its heading followed by the text up to the next
    boundary. Any text before the first boundary forms a chunk of its own.

    Args:
        text: Markdown text to split.

    Returns:
        A list of chunks, each stripped of surrounding whitespace, with
        chunks that are empty after stripping left out.
    """
    # The capturing group makes split() hand the heading lines back, so the
    # result alternates: leading text, heading, body, heading, body, ...
    # The negative lookahead (?!Figure) is what exempts figure headings.
    heading = re.compile(r"(\n## (?!Figure)[^\n]+\n)", flags=re.IGNORECASE)
    pieces = heading.split(text)

    preamble, headings, bodies = pieces[0], pieces[1::2], pieces[2::2]
    candidates = [preamble] + [h + b for h, b in zip(headings, bodies)]

    return [chunk.strip() for chunk in candidates if chunk.strip()]

In [59]:
# Sorted list of the Markdown files found directly inside the OCR output
# folder (path is relative to the working directory).
markdown_dir = Path("./markdown/ocr/")
md_files = sorted(markdown_dir.glob("*.md"))

In [60]:
# Load one Markdown file for inspection: index 8 of the sorted file list
# (IndexError if fewer than nine files were found).
# read_text() opens and closes the file itself, so no handle is left open,
# and the encoding is pinned to UTF-8 — the encoding this notebook uses when
# it writes Markdown exports — instead of depending on the locale default.
data = md_files[8].read_text(encoding="utf-8")

In [56]:
# Split the loaded text into context and results, break the results into
# heading-delimited subsections, and print the third one.
context, results = extract_results(data)
subsections = extract_subsections(results)
# Index 2 is hard-coded; raises IndexError with fewer than three subsections.
print(subsections[2])
# print(results)

## Annotating gene function from transcriptional phenotypes

Previous Perturb-seq screens focused on targeted sets of perturbations, such as genes identified in forward genetic screens. Our screen targeting all expressed genes in K562 cells presented an opportunity to assess how well transcriptional phenotypes can resolve gene function when used in an unbiased manner.

We focused on a subset of 1,973 perturbations that had strong transcriptional phenotypes (Figure 2A). Because related perturbations could have different magnitudes of effect, we used the correlation between mean expression profiles as a scale-invariant metric of similarity. To assess the extent to which correlated expression profiles between genetic pertur- bations indicated common function, we compared our results with two curated sources of biological relationships. First, among the 1,973 targeted genes, there were 327 protein complexes from CORUM3.0 with at least two thirds of the complex members present, representing