# Output of the docling document.

## Import statements and first conversion

In [3]:
from notebook_utils import setup_notebook_environment, style_dataframe
setup_notebook_environment()

from docling.chunking import HierarchicalChunker, HybridChunker
from docling.document_converter import DocumentConverter
from app.utils.tokenizer import OpenAITokenizerWrapper
from app.config.settings import settings
from pathlib import Path

from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
# from app.services.document_service import DocumentService

Notebook environment setup complete


In [2]:
converted_docs_path = settings.file_paths.get_converted_docs_path()
converted_docs_path

PosixPath('/home/sng/nanobot-poc/data/converted_docs')

In [5]:
IMAGE_RESOLUTION_SCALE = 2.0

In [6]:
input_doc_path = 'https://docs.google.com/document/d/1YGpHp7avHQMojJRbkAyP8io0ZqarRvMKm_ulQ6dYq0g/export?format=pdf'
output_dir = Path(converted_docs_path)

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
# The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
# with the image field

IMAGE_RESOLUTION_SCALE = 2.0
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

conv_res = doc_converter.convert(input_doc_path)

In [7]:
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = 'test_sop'

In [8]:
# Save page images
for page_no, page in conv_res.document.pages.items():
    page_no = page.page_no
    page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
    with page_image_filename.open("wb") as fp:
        page.image.pil_image.save(fp, format="PNG")

# Save images of figures and tables
table_counter = 0
picture_counter = 0
for element, _level in conv_res.document.iterate_items():
    if isinstance(element, TableItem):
        table_counter += 1
        element_image_filename = (
            output_dir / f"{doc_filename}-table-{table_counter}.png"
        )
        with element_image_filename.open("wb") as fp:
            element.get_image(conv_res.document).save(fp, "PNG")

    if isinstance(element, PictureItem):
        picture_counter += 1
        element_image_filename = (
            output_dir / f"{doc_filename}-picture-{picture_counter}.png"
        )
        with element_image_filename.open("wb") as fp:
            element.get_image(conv_res.document).save(fp, "PNG")

In [9]:
# Save markdown with embedded pictures
md_filename = output_dir / f"{doc_filename}-with-images.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

# Save markdown with externally referenced pictures
md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

# Save HTML with externally referenced pictures
html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)

In [None]:
def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = 'https://docs.google.com/document/d/1YGpHp7avHQMojJRbkAyP8io0ZqarRvMKm_ulQ6dYq0g/export?format=pdf'
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    # Save markdown with externally referenced pictures
    md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    # Save HTML with externally referenced pictures
    html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
    conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")

In [4]:


# Initialize pipeline options
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0  # Adjust resolution as needed
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

# Create the document converter
converter = DocumentConverter()
doc_path = 'https://docs.google.com/document/d/1YGpHp7avHQMojJRbkAyP8io0ZqarRvMKm_ulQ6dYq0g/export?format=pdf'

# Convert the document
document = converter.convert(
    doc_path,
    input_format=InputFormat.PDF,
    pipeline_options=pipeline_options
)


ValidationError: 2 validation errors for DocumentConverter.convert
input_format
  Unexpected keyword argument [type=unexpected_keyword_argument, input_value=<InputFormat.PDF: 'pdf'>, input_type=InputFormat]
    For further information visit https://errors.pydantic.dev/2.10/v/unexpected_keyword_argument
pipeline_options
  Unexpected keyword argument [type=unexpected_keyword_argument, input_value=PdfPipelineOptions(create...rate_table_images=False), input_type=PdfPipelineOptions]
    For further information visit https://errors.pydantic.dev/2.10/v/unexpected_keyword_argument

In [None]:


# Initialize pipeline options
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0  # Adjust resolution as needed
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

# Create the document converter
converter = DocumentConverter()
doc_path = 'https://docs.google.com/document/d/1YGpHp7avHQMojJRbkAyP8io0ZqarRvMKm_ulQ6dYq0g/export?format=pdf'

# Convert the document
document = converter.convert(
    doc_path,
    input_format=InputFormat.PDF,
    pipeline_options=pipeline_options
)

# Get the base output directory
converted_docs_path = settings.file_paths.get_converted_docs_path()
base_name = "pdf_document"  # or whatever name you want
output_dir = converted_docs_path / base_name
output_dir.mkdir(parents=True, exist_ok=True)

# Save markdown with referenced images
md_path = output_dir / f"{base_name}.md"
document.save_as_markdown(md_path, image_mode=ImageRefMode.REFERENCED)

# Save images and tables
image_counter = 0
table_counter = 0

# Iterate through all elements in the document
for element, _level in document.iterate_items():
    if isinstance(element, PictureItem):
        image_counter += 1
        # Save image
        image_path = output_dir / f"image_{image_counter}.png"
        image = element.get_image(document)
        if image is not None:
            with image_path.open("wb") as fp:
                image.save(fp, "PNG")
            print(f"Saved image to: {image_path}")
        else:
            print(f"Warning: Could not get image for element {image_counter}")
        
    elif isinstance(element, TableItem):
        table_counter += 1
        # Save table as image
        table_path = output_dir / f"table_{table_counter}.png"
        table_image = element.get_image(document)
        if table_image is not None:
            with table_path.open("wb") as fp:
                table_image.save(fp, "PNG")
            print(f"Saved table to: {table_path}")
        else:
            print(f"Warning: Could not get table image for element {table_counter}")

print(f"Saved markdown to: {md_path}")
print(f"Total images saved: {image_counter}")
print(f"Total tables saved: {table_counter}")

In [None]:
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from app.config.settings import settings

# Initialize pipeline options
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0  # Adjust resolution as needed
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

# Create the document converter
converter = DocumentConverter()
doc_path = 'https://docs.google.com/document/d/1YGpHp7avHQMojJRbkAyP8io0ZqarRvMKm_ulQ6dYq0g/export?format=pdf'

# Convert the document
document = converter.convert(
    doc_path,
    input_format=InputFormat.PDF,
    pipeline_options=pipeline_options
)

# Get the base output directory
converted_docs_path = settings.file_paths.get_converted_docs_path()
base_name = "pdf_document"  # or whatever name you want
output_dir = converted_docs_path / base_name
output_dir.mkdir(parents=True, exist_ok=True)

# Save markdown with referenced images
md_path = output_dir / f"{base_name}.md"
document.save_as_markdown(md_path, image_mode=ImageRefMode.REFERENCED)

# Save images and tables
image_counter = 0
table_counter = 0

# Iterate through all elements in the document
for element, _level in document.iterate_items():
    if isinstance(element, PictureItem):
        image_counter += 1
        # Save image
        image_path = output_dir / f"image_{image_counter}.png"
        image = element.get_image(document)
        if image is not None:
            with image_path.open("wb") as fp:
                image.save(fp, "PNG")
            print(f"Saved image to: {image_path}")
        else:
            print(f"Warning: Could not get image for element {image_counter}")
        
    elif isinstance(element, TableItem):
        table_counter += 1
        # Save table as image
        table_path = output_dir / f"table_{table_counter}.png"
        table_image = element.get_image(document)
        if table_image is not None:
            with table_path.open("wb") as fp:
                table_image.save(fp, "PNG")
            print(f"Saved table to: {table_path}")
        else:
            print(f"Warning: Could not get table image for element {table_counter}")

print(f"Saved markdown to: {md_path}")
print(f"Total images saved: {image_counter}")
print(f"Total tables saved: {table_counter}")

In [None]:
converter = DocumentService()
doc_path = 'https://docs.google.com/document/d/1YGpHp7avHQMojJRbkAyP8io0ZqarRvMKm_ulQ6dYq0g/export?format=pdf'
result = converter.convert_url(doc_path)
document = result.document

In [2]:
converter = DocumentConverter()
doc_path = "/home/sng/nanobot-poc/data/test/grant_decision_email_single_page.pdf"
result = converter.convert(doc_path)
document = result.document

In [4]:
md_data = document.export_to_markdown()

In [None]:
filename = "pdf_document.md"  # or whatever name you want

# Create the full path
output_path = converted_docs_path / filename

# Save the markdown data
with open(output_path, "w", encoding="utf-8") as f:
    f.write(md_data)

print(f"Saved markdown to: {output_path}")

In [None]:
converter = DocumentService()
doc_path = 'https://docs.google.com/document/d/1YGpHp7avHQMojJRbkAyP8io0ZqarRvMKm_ulQ6dYq0g/export?format=pdf'
result = converter.convert_document(doc_path)
document = result.document
md_data = document.export_to_markdown()
converted_docs_path = settings.file_paths.get_converted_docs_path()
filename = "pdf_document.md"  # or whatever name you want
# Create the full path
output_path = converted_docs_path / filename
# Save the markdown data
with open(output_path, "w", encoding="utf-8") as f:
    f.write(md_data)
print(f"Saved markdown to: {output_path}")

In [None]:
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from app.config.settings import settings

# Initialize pipeline options
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0  # Adjust resolution as needed
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

# Create the document converter
converter = DocumentConverter()
doc_path = 'https://docs.google.com/document/d/1YGpHp7avHQMojJRbkAyP8io0ZqarRvMKm_ulQ6dYq0g/export?format=pdf'

# Convert the document
document = converter.convert(
    doc_path,
    input_format=InputFormat.PDF,
    pipeline_options=pipeline_options
)

# Get the base output directory
converted_docs_path = settings.file_paths.get_converted_docs_path()
base_name = "pdf_document"  # or whatever name you want
output_dir = converted_docs_path / base_name
output_dir.mkdir(parents=True, exist_ok=True)

# Save markdown with referenced images
md_path = output_dir / f"{base_name}.md"
document.save_as_markdown(md_path, image_mode=ImageRefMode.REFERENCED)

# Save images and tables
image_counter = 0
table_counter = 0

# Iterate through all elements in the document
for element, _level in document.iterate_items():
    if isinstance(element, PictureItem):
        image_counter += 1
        # Save image
        image_path = output_dir / f"image_{image_counter}.png"
        image = element.get_image(document)
        if image is not None:
            with image_path.open("wb") as fp:
                image.save(fp, "PNG")
            print(f"Saved image to: {image_path}")
        else:
            print(f"Warning: Could not get image for element {image_counter}")
        
    elif isinstance(element, TableItem):
        table_counter += 1
        # Save table as image
        table_path = output_dir / f"table_{table_counter}.png"
        table_image = element.get_image(document)
        if table_image is not None:
            with table_path.open("wb") as fp:
                table_image.save(fp, "PNG")
            print(f"Saved table to: {table_path}")
        else:
            print(f"Warning: Could not get table image for element {table_counter}")

print(f"Saved markdown to: {md_path}")
print(f"Total images saved: {image_counter}")
print(f"Total tables saved: {table_counter}")

In [None]:
document.export_to_markdown()

In [3]:
tokenizer = OpenAITokenizerWrapper()

## Exploring Document Properties

### Document Attributes

In [None]:
print("Document attributes (data):")
for attr in dir(document):
    if not attr.startswith('_'):  # Skip private attributes
        try:
            value = getattr(document, attr)
            if not callable(value):  # Only non-callable (attributes)
                print(f"- {attr}: {value}")
        except Exception as e:
            print(f"- {attr}: Error accessing ({str(e)})")


### Document Methods

In [None]:
print("\nDocument methods (functions):")
for attr in dir(document):
    if not attr.startswith('_'):  # Skip private attributes
        try:
            value = getattr(document, attr)
            if callable(value):  # Only callable (methods)
                print(f"- {attr}()")
        except Exception as e:
            print(f"- {attr}: Error accessing ({str(e)})")

## Exploring Text Elements

In [None]:
print("\n=== TEXT ELEMENTS STRUCTURE ===")
for i, text_item in enumerate(document.texts):
    print(f"\nText Element #{i+1}:")
    
    # Get the type of the text element
    element_type = type(text_item).__name__
    print(f"Type: {element_type}")
    
    # Print text content with length information
    text_length = len(text_item.text)
    token_count = tokenizer.count_tokens(text_item.text)
    print(f"Text ({text_length} chars, {token_count} tokens): {text_item.text[:100]}..." if text_length > 100 
          else f"Text ({text_length} chars, {token_count} tokens): {text_item.text}")
    
    # Print other attributes based on the element type
    if hasattr(text_item, 'label'):
        print(f"Label: {text_item.label}")
    
    if hasattr(text_item, 'level') and element_type == 'SectionHeaderItem':
        print(f"Heading Level: {text_item.level}")
    
    if hasattr(text_item, 'prov') and text_item.prov:
        page_numbers = [prov.page_no for prov in text_item.prov]
        print(f"Page Numbers: {page_numbers}")
        
        # Get bounding box information if available
        if hasattr(text_item.prov[0], 'bbox'):
            bbox = text_item.prov[0].bbox
            print(f"Bounding Box: left={bbox.l}, top={bbox.t}, right={bbox.r}, bottom={bbox.b}")
    

In [None]:
# Count the different types of elements
element_types = {}
for text_item in document.texts:
    element_type = type(text_item).__name__
    element_types[element_type] = element_types.get(element_type, 0) + 1

print("\n=== ELEMENT TYPE COUNTS ===")
for element_type, count in element_types.items():
    print(f"{element_type}: {count}")

In [None]:
list_items = [item for item in document.texts if type(item).__name__ == 'ListItem']
print(f"\nNumber of list items found: {len(list_items)}")

In [None]:
full_text = ""
for text_item in document.texts:
    full_text += text_item.text + " "
print(f"Total text length: {len(full_text)} characters")
print(f"\nFirst 200 chars of text: {full_text[:200]}...")

In [None]:
# Check the type annotation for document.texts
import inspect
from typing import get_type_hints

# This might work depending on how Docling is implemented
type_hints = get_type_hints(type(document))
if 'texts' in type_hints:
    print(f"Type hint for texts: {type_hints['texts']}")

In [None]:
type_hints = get_type_hints(type(document))
if 'texts' in type_hints:
    type_hint_str = str(type_hints['texts'])
    print(f"Full type hint: {type_hint_str}")
    
    # Extract just the class names
    if 'Union[' in type_hint_str:
        # Extract the part between Union[ and ]
        union_content = type_hint_str.split('Union[')[1].split(']')[0]
        
        # Split by comma and extract class names
        class_paths = [path.strip() for path in union_content.split(',')]
        class_names = [path.split('.')[-1] for path in class_paths]
        
        print("\nText element types:")
        for class_name in class_names:
            print(f"- {class_name}")

In [None]:
# Cell for examining text element attributes
print("\n=== TEXT ELEMENT ATTRIBUTES ===")
for i, text_item in enumerate(document.texts):
    print(f"\nText Element #{i+1}:")
    element_type = type(text_item).__name__
    print(f"Type: {element_type}")
    
    # Show text preview
    text_preview = text_item.text[:50] + "..." if len(text_item.text) > 50 else text_item.text
    print(f"Text: {text_preview}")
    
    # Show list-specific attributes
    if element_type == 'ListItem':
        if hasattr(text_item, 'list_type'):
            print(f"List Type: {text_item.list_type}")
        if hasattr(text_item, 'list_index'):
            print(f"List Index: {text_item.list_index}")
    
    # Show all other non-private attributes
    print("Other attributes:")
    for attr in dir(text_item):
        if not attr.startswith('_') and attr not in ['text', 'label', 'level', 'prov', 'list_type', 'list_index']:
            try:
                value = getattr(text_item, attr)
                if not callable(value) and not isinstance(value, (list, dict)) and str(value) != '':
                    print(f"  - {attr}: {value}")
            except Exception:
                pass

## Exploring Furniture

In [None]:
# Inspect document furniture
print("\n=== DOCUMENT FURNITURE ===")

# Check if furniture exists
if hasattr(document, 'furniture'):
    furniture = document.furniture
    
    # Print basic furniture information
    print(f"Furniture object type: {type(furniture).__name__}")
    
    # Check if furniture has any children
    if hasattr(furniture, 'children') and furniture.children:
        print(f"Number of furniture children: {len(furniture.children)}")
        
        # Inspect each furniture child
        for i, child in enumerate(furniture.children):
            print(f"\nFurniture Child #{i+1}:")
            child_type = type(child).__name__
            print(f"Type: {child_type}")
            
            # Try to access common attributes
            for attr in ['text', 'label', 'name', 'content_layer']:
                if hasattr(child, attr):
                    value = getattr(child, attr)
                    print(f"{attr}: {value}")
            
            # Check for position information
            if hasattr(child, 'prov') and child.prov:
                page_numbers = [prov.page_no for prov in child.prov]
                print(f"Page Numbers: {page_numbers}")
                
                # Get bounding box information if available
                if hasattr(child.prov[0], 'bbox'):
                    bbox = child.prov[0].bbox
                    print(f"Bounding Box: left={bbox.l}, top={bbox.t}, right={bbox.r}, bottom={bbox.b}")
            
            # Show all other attributes
            print("Other attributes:")
            for attr in dir(child):
                if not attr.startswith('_') and attr not in ['text', 'label', 'name', 'content_layer', 'prov', 'children']:
                    try:
                        value = getattr(child, attr)
                        if not callable(value) and not isinstance(value, (list, dict)) and str(value) != '':
                            print(f"  - {attr}: {value}")
                    except Exception:
                        pass
    else:
        print("Furniture has no children.")
    
    # Show all furniture attributes
    print("\nFurniture attributes:")
    for attr in dir(furniture):
        if not attr.startswith('_') and attr not in ['children']:
            try:
                value = getattr(furniture, attr)
                if not callable(value) and not isinstance(value, (list, dict)) and str(value) != '':
                    print(f"- {attr}: {value}")
            except Exception as e:
                print(f"- {attr}: Error accessing ({str(e)})")
else:
    print("Document does not have a furniture attribute.")

## Exploring Chunking

In [None]:
print("\n=== UNDERSTANDING DOCLING CHUNKING ===")

# 1. Import the basic chunkers
from docling.chunking import BaseChunker, HierarchicalChunker, HybridChunker

# 2. Check what a chunker returns
print("Chunkers in Docling return DocChunk objects that contain:")
print("- text: The actual text content of the chunk")
print("- meta: Metadata about the chunk (headings, page numbers, etc.)")
print("- Chunks are returned as iterators, so we typically convert to list")

# 3. Let's examine a single chunk to understand its structure
chunker = HierarchicalChunker()  # Start with the hierarchical chunker
chunks = list(chunker.chunk(document))

if chunks:
    # Examine the first chunk in detail
    first_chunk = chunks[0]
    print("\nExample chunk structure:")
    print(f"- Type: {type(first_chunk).__name__}")
    print(f"- Text length: {len(first_chunk.text)} characters")
    print(f"- Text preview: {first_chunk.text[:100]}...")
    
    # Examine the metadata
    print("\nChunk metadata contains:")
    for attr in dir(first_chunk.meta):
        if not attr.startswith('_') and not callable(getattr(first_chunk.meta, attr)):
            value = getattr(first_chunk.meta, attr)
            if isinstance(value, (list, dict)):
                print(f"- {attr}: {type(value).__name__} with {len(value)} items")
            else:
                print(f"- {attr}: {value}")

In [None]:
# Compare different chunkers and their output
print("\n=== COMPARING DIFFERENT CHUNKERS ===")

# 1. HierarchicalChunker
hierarchical_chunker = HierarchicalChunker()
hierarchical_chunks = list(hierarchical_chunker.chunk(document))
print(f"HierarchicalChunker produced {len(hierarchical_chunks)} chunks")

# 2. HybridChunker with default settings
hybrid_chunker = HybridChunker(tokenizer=tokenizer)
hybrid_chunks = list(hybrid_chunker.chunk(document))
print(f"HybridChunker (default) produced {len(hybrid_chunks)} chunks")

# 3. HybridChunker with custom max_tokens
hybrid_chunker_custom = HybridChunker(tokenizer=tokenizer, max_tokens=1000)
hybrid_custom_chunks = list(hybrid_chunker_custom.chunk(document))
print(f"HybridChunker (max_tokens=1000) produced {len(hybrid_custom_chunks)} chunks")

# Print a summary of each chunker's output
print("\nChunker comparison summary:")
print(f"1. HierarchicalChunker: {len(hierarchical_chunks)} chunks")
print(f"   - First chunk length: {len(hierarchical_chunks[0].text)} chars")
print(f"   - First chunk headings: {hierarchical_chunks[0].meta.headings}")

print(f"\n2. HybridChunker (default): {len(hybrid_chunks)} chunks")
print(f"   - First chunk length: {len(hybrid_chunks[0].text)} chars")
print(f"   - First chunk headings: {hybrid_chunks[0].meta.headings}")

print(f"\n3. HybridChunker (max_tokens=100): {len(hybrid_custom_chunks)} chunks")
print(f"   - First chunk length: {len(hybrid_custom_chunks[0].text)} chars")
print(f"   - First chunk headings: {hybrid_custom_chunks[0].meta.headings}")

In [None]:
# Investigate why HybridChunker is creating only one chunk
print("\n=== INVESTIGATING HYBRIDCHUNKER BEHAVIOR ===")

# 1. Check the total token count of the document
full_text = ""
for text_item in document.texts:
    full_text += text_item.text + " "
total_tokens = tokenizer.count_tokens(full_text)
print(f"Total document text: {len(full_text)} characters, {total_tokens} tokens")

# 2. Check if the document is small enough to fit in one chunk
print(f"Default max_tokens for HybridChunker: 2048")
print(f"Is document smaller than default max_tokens? {total_tokens < 2048}")

# 3. Try with explicit parameters and debug output
print("\n=== TRYING HYBRIDCHUNKER WITH EXPLICIT PARAMETERS ===")
hybrid_chunker_debug = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=100,  # Very small limit
    merge_peers=False,  # Don't merge sections
    min_chunk_chars=10,  # Allow very small chunks
    min_chunk_size_ratio=0.1  # Allow chunks as small as 10% of max_tokens
)

# Get chunks and examine them
debug_chunks = list(hybrid_chunker_debug.chunk(document))
print(f"HybridChunker with restrictive parameters produced {len(debug_chunks)} chunks")

# Check the token count of the single chunk if there's only one
if len(debug_chunks) == 1:
    chunk_tokens = tokenizer.count_tokens(debug_chunks[0].text)
    print(f"Single chunk token count: {chunk_tokens}")
    print(f"Exceeds max_tokens limit of 100? {chunk_tokens > 100}")
    
    # Check if the document has a structure that prevents splitting
    print("\nDocument structure analysis:")
    print(f"Number of text elements: {len(document.texts)}")
    print(f"Number of section headers: {sum(1 for item in document.texts if type(item).__name__ == 'SectionHeaderItem')}")
    
    # Try to understand why it's not splitting
    print("\nPossible reasons for not splitting:")
    print("1. Document might have a structure that HybridChunker considers atomic")
    print("2. There might be a minimum chunk size that prevents splitting")
    print("3. The chunker might be configured to keep certain elements together")

In [None]:
# Examine the chunks produced by the restrictive HybridChunker
print("\n=== EXAMINING HYBRIDCHUNKER CHUNKS ===")

hybrid_chunker_detailed = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=100,  # Small token limit
    merge_peers=False,  # Don't merge sections
    min_chunk_chars=10,  # Allow small chunks
    min_chunk_size_ratio=0.1  # Allow chunks as small as 10% of max_tokens
)

detailed_chunks = list(hybrid_chunker_detailed.chunk(document))
print(f"Number of chunks: {len(detailed_chunks)}")

# Examine each chunk
for i, chunk in enumerate(detailed_chunks):
    token_count = tokenizer.count_tokens(chunk.text)
    print(f"\nChunk {i+1}:")
    print(f"- Length: {len(chunk.text)} chars, {token_count} tokens")
    print(f"- Headings: {chunk.meta.headings}")
    
    # Get the first few words to understand the content
    preview = chunk.text[:50] + "..." if len(chunk.text) > 50 else chunk.text
    print(f"- Preview: {preview}")
    
    # Check if the chunk respects the token limit
    if token_count > 100:
        print("  WARNING: Chunk exceeds the max_tokens limit of 100")
        
        # Try to understand why
        doc_items = chunk.meta.doc_items if hasattr(chunk.meta, 'doc_items') else []
        item_types = [type(item).__name__ for item in doc_items]
        print(f"  Contains item types: {set(item_types)}")
        
        # Check if it's a single large text item that can't be split
        if len(doc_items) == 1:
            print(f"  Single item chunk - cannot be split further")