In [9]:
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions


In [10]:

# Input document: a local path or a URL.
source = r"sample.pdf"
# Convert with a default-constructed converter (no custom options are passed).
converter = DocumentConverter()
result = converter.convert(source)
# Optional preview of the converted Markdown (left disabled):
# print(result.document.export_to_markdown())


In [3]:
# Persist the Markdown rendering of the converted document next to the notebook.
with open('sample.md', mode='w', encoding='utf-8') as md_file:
    markdown_text = result.document.export_to_markdown()
    md_file.write(markdown_text)

In [4]:
from collections import defaultdict

# Bucket every item of the converted document under its label.
element_types = defaultdict(list)
for node, _ignored in result.document.iterate_items():
    element_types[node.label].append(node)

# Report how many items landed in each bucket.
print("Document structure breakdown:")
for label, members in element_types.items():
    print(f"  {label}: {len(members)} elements")

Document structure breakdown:
  picture: 1 elements
  section_header: 4 elements
  text: 15 elements
  list_item: 14 elements


In [5]:
# Short alias for the converted document; the dict export and the chunking step below read from `doc`.
doc = result.document # DoclingDocument

In [6]:
# Export the document to a Python dict.
json_dict = doc.export_to_dict()

# Bare last expression: the notebook renders the dict's top-level keys as the cell output.
json_dict.keys()

dict_keys(['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'form_items', 'pages'])

In [7]:
# Lighter-weight PDF pipeline settings for large documents.
#
# Only genuine PdfPipelineOptions fields are set here. The settings that were
# previously passed alongside them belong elsewhere:
#   * Page limits are arguments of the conversion call itself, e.g.
#       converter.convert(source, max_num_pages=4, page_range=(1, 3))
#   * There is no `enable_parallel_processing` switch; the thread count is
#     configured through `accelerator_options` (AcceleratorOptions(num_threads=...)).
#
# NOTE: these options only take effect once they are handed to a converter, e.g.
#   DocumentConverter(
#       format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
#   )
# The converter created earlier in this notebook was built with defaults and
# does not use them.
pipeline_options = PdfPipelineOptions(
    generate_page_images=False,  # Skip page images to save memory
    do_table_structure=False,  # Skip table structure extraction
)

In [13]:
from docling.chunking import HybridChunker

# Token-aware chunking of the converted document with HybridChunker.
# NOTE(review): `overlap_tokens` does not appear among HybridChunker's documented
# options and may be silently ignored; confirm against the installed docling version.
hybrid_chunker = HybridChunker(max_tokens=512, overlap_tokens=50)

# Materialise the chunks into a list so they can be counted and sliced below.
hybrid_chunks = [*hybrid_chunker.chunk(doc)]

chunk_count = len(hybrid_chunks)
print(f"HybridChunker: {chunk_count} chunks")

def print_chunk(chunk, edge=30):
    """Print a compact preview of a single chunk.

    Three lines are written to stdout: the text length, the content, and a
    separator of 50 dashes. Texts longer than ``2 * edge`` characters are shown
    as ``<first edge chars>...<last edge chars>``; anything shorter is printed
    in full, so the head and tail never overlap and the preview is never longer
    than the text itself.

    Args:
        chunk: Any object exposing a ``text`` string attribute.
        edge: Number of characters kept from each end of a long text.
    """
    text = chunk.text
    print(f"Chunk length: {len(text)} characters")
    if len(text) > 2 * edge:
        # Slice the tail from an explicit start index so that edge=0 yields an
        # empty tail (text[-0:] would return the whole string).
        tail = text[len(text) - edge:]
        print(f"Chunk content: {text[:edge]}...{tail}")
    else:
        print(f"Chunk content: {text}")
    print("-" * 50)

# Preview the first few chunks; the limit is named so the comment cannot drift from the slice.
PREVIEW_CHUNK_COUNT = 5
for chunk in hybrid_chunks[:PREVIEW_CHUNK_COUNT]:
    print_chunk(chunk)


Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors


HybridChunker: 6 chunks
Chunk length: 831 characters
Chunk content: TORONTO, July 15, 2025 /CNW/ I....isoenergy.ca/sustainability/.
--------------------------------------------------
Chunk length: 2241 characters
Chunk content: - Environmental achievements i...sustainability policy in 2025.
--------------------------------------------------
Chunk length: 876 characters
Chunk content: IsoEnergy (NYSE American: ISOU...@IsoEnergyLtd www.isoenergy.ca
--------------------------------------------------
Chunk length: 956 characters
Chunk content: This press release contains "f...ll or may occur in the future.
--------------------------------------------------
Chunk length: 1443 characters
Chunk content: Forw ard-looking statements ar...n forw ard-looking statements.
--------------------------------------------------
