# Exploring the output of a Docling document conversion

## Import statements and first conversion

In [None]:
from docling.chunking import HierarchicalChunker, HybridChunker
from docling.document_converter import DocumentConverter
from app.utils.tokenizer import OpenAITokenizerWrapper

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Convert the sample PDF once; `document` is the parsed result that the
# following cells inspect.
# NOTE(review): doc_path is an absolute, machine-specific path, so it must exist
# on the machine that runs this notebook.
doc_path = "/home/sng/nanobot-poc/data/test/grant_decision_email_single_page.pdf"
converter = DocumentConverter()
result = converter.convert(doc_path)
document = result.document

In [3]:
# Tokenizer wrapper used in later cells for count_tokens() and passed to HybridChunker.
tokenizer = OpenAITokenizerWrapper()

## Exploring Document Properties

### Document Attributes

In [4]:
# List every public, non-callable attribute of the document with its value.
# NOTE(review): the recorded output of this cell ends with a warning that points
# at the getattr line, so at least one attribute appears to warn on access.
print("Document attributes (data):")
public_names = [name for name in dir(document) if not name.startswith('_')]
for attr in public_names:
    try:
        value = getattr(document, attr)
        if callable(value):
            # Methods are listed separately; only data attributes are shown here.
            continue
        print(f"- {attr}: {value}")
    except Exception as e:
        # Report the failure and keep going so one bad attribute does not stop the listing.
        print(f"- {attr}: Error accessing ({str(e)})")


Document attributes (data):
- body: self_ref='#/body' parent=None children=[RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/5'), RefItem(cref='#/texts/6'), RefItem(cref='#/texts/7'), RefItem(cref='#/texts/8'), RefItem(cref='#/texts/9'), RefItem(cref='#/texts/10'), RefItem(cref='#/texts/11'), RefItem(cref='#/texts/12'), RefItem(cref='#/texts/13'), RefItem(cref='#/texts/14')] content_layer=<ContentLayer.BODY: 'body'> name='_root_' label=<GroupLabel.UNSPECIFIED: 'unspecified'>
- furniture: self_ref='#/furniture' parent=None children=[] content_layer=<ContentLayer.FURNITURE: 'furniture'> name='_root_' label=<GroupLabel.UNSPECIFIED: 'unspecified'>
- groups: [GroupItem(self_ref='#/groups/0', parent=RefItem(cref='#/body'), children=[RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4')], content_layer=<ContentLayer.BODY: 'body'>, name='group', label=<GroupLabel.KEY_VALUE_AREA: 'key_value_area'>)]
- key_value_ite

  value = getattr(document, attr)


### Document Methods

In [5]:
# List every public callable (method) exposed by the document object.
print("\nDocument methods (functions):")
public_names = [name for name in dir(document) if not name.startswith('_')]
for attr in public_names:
    try:
        value = getattr(document, attr)
        if not callable(value):
            # Data attributes are skipped; only methods are shown here.
            continue
        print(f"- {attr}()")
    except Exception as e:
        # Report the failure and keep going so one bad attribute does not stop the listing.
        print(f"- {attr}: Error accessing ({str(e)})")


Document methods (functions):
- add_code()
- add_group()
- add_heading()
- add_list_item()
- add_page()
- add_picture()
- add_table()
- add_text()
- add_title()
- check_version_is_compatible()
- construct()
- copy()
- dict()
- export_to_dict()
- export_to_document_tokens()
- export_to_element_tree()
- export_to_html()
- export_to_markdown()
- export_to_text()
- from_orm()
- iterate_items()
- json()
- load_from_json()
- model_construct()
- model_copy()
- model_dump()
- model_dump_json()
- model_json_schema()
- model_parametrized_name()
- model_post_init()
- model_rebuild()
- model_validate()
- model_validate_json()
- model_validate_strings()
- num_pages()
- parse_file()
- parse_obj()
- parse_raw()
- print_element_tree()
- save_as_document_tokens()
- save_as_html()
- save_as_json()
- save_as_markdown()
- save_as_yaml()
- schema()
- schema_json()
- transform_to_content_layer()
- update_forward_refs()
- validate()
- validate_document()
- validate_tree()


  value = getattr(document, attr)


## Exploring Text Elements

In [6]:
# Walk every entry of document.texts and print its type, a text preview with
# character/token counts, and (when present) label, heading level and provenance.
print("\n=== TEXT ELEMENTS STRUCTURE ===")
for i, text_item in enumerate(document.texts):
    print(f"\nText Element #{i+1}:")

    element_type = type(text_item).__name__
    print(f"Type: {element_type}")

    # Long texts are cut to the first 100 characters; short ones are shown in full.
    text_length = len(text_item.text)
    token_count = tokenizer.count_tokens(text_item.text)
    if text_length > 100:
        print(f"Text ({text_length} chars, {token_count} tokens): {text_item.text[:100]}...")
    else:
        print(f"Text ({text_length} chars, {token_count} tokens): {text_item.text}")

    if hasattr(text_item, 'label'):
        print(f"Label: {text_item.label}")

    # The heading level is only reported for section headers.
    if element_type == 'SectionHeaderItem' and hasattr(text_item, 'level'):
        print(f"Heading Level: {text_item.level}")

    # Nothing more to show when the element carries no provenance entries.
    if not (hasattr(text_item, 'prov') and text_item.prov):
        continue

    page_numbers = [prov.page_no for prov in text_item.prov]
    print(f"Page Numbers: {page_numbers}")

    # Only the first provenance entry's bounding box is displayed.
    first_prov = text_item.prov[0]
    if hasattr(first_prov, 'bbox'):
        bbox = first_prov.bbox
        print(f"Bounding Box: left={bbox.l}, top={bbox.t}, right={bbox.r}, bottom={bbox.b}")
    


=== TEXT ELEMENTS STRUCTURE ===

Text Element #1:
Type: SectionHeaderItem
Text (61 chars, 13 tokens): FuzeHub Manufacturing Grant 2024 Round 2 Application Decision
Label: section_header
Heading Level: 1
Page Numbers: [1]
Bounding Box: left=54.0, top=743.927001953125, right=436.2510070800781, bottom=737.0759887695312

Text Element #2:
Type: SectionHeaderItem
Text (31 chars, 13 tokens): FuzeHub Fund <fund@fuzehub.com>
Label: section_header
Heading Level: 1
Page Numbers: [1]
Bounding Box: left=54.0, top=718.6519775390625, right=254.47900390625, bottom=712.2039794921875

Text Element #3:
Type: TextItem
Text (22 chars, 13 tokens): Wed 2024-07-24 9:26 AM
Label: text
Page Numbers: [1]
Bounding Box: left=54.0, top=701.552001953125, right=155.86099243164062, bottom=696.7160034179688

Text Element #4:
Type: TextItem
Text (44 chars, 17 tokens): To:  Samantha Roberts <sroberts@gc.cuny.edu>
Label: text
Page Numbers: [1]
Bounding Box: left=54.0, top=685.802, right=242.641, bottom=680.966

Text Elem

In [7]:
# Tally how many text elements there are of each class, keyed by class name.
element_types = {}
for text_item in document.texts:
    element_type = type(text_item).__name__
    if element_type not in element_types:
        element_types[element_type] = 0
    element_types[element_type] += 1

# Counts are printed in first-seen order (dict insertion order).
print("\n=== ELEMENT TYPE COUNTS ===")
for element_type, count in element_types.items():
    print(f"{element_type}: {count}")


=== ELEMENT TYPE COUNTS ===
SectionHeaderItem: 2
TextItem: 13


In [8]:
# Collect the text elements whose class is named 'ListItem' and report how many there are.
list_items = list(filter(lambda item: type(item).__name__ == 'ListItem', document.texts))
print(f"\nNumber of list items found: {len(list_items)}")


Number of list items found: 0


In [9]:
# Concatenate all element texts, each followed by a single space (so the
# result keeps a trailing space), then show the length and a short preview.
full_text = "".join(text_item.text + " " for text_item in document.texts)
print(f"Total text length: {len(full_text)} characters")
print(f"\nFirst 200 chars of text: {full_text[:200]}...")

Total text length: 2087 characters

First 200 chars of text: FuzeHub Manufacturing Grant 2024 Round 2 Application Decision FuzeHub Fund <fund@fuzehub.com> Wed 2024-07-24 9:26 AM To:  Samantha Roberts <sroberts@gc.cuny.edu> Cc:  Yuki Chen <xchen4@gc.cuny.edu>;  ...


In [10]:
# Look up the declared type annotation of the document class's `texts` field.
import inspect
from typing import get_type_hints

# Resolving hints on the class may or may not expose `texts`, depending on how
# Docling declares it, hence the membership check before printing.
document_cls = type(document)
type_hints = get_type_hints(document_cls)
has_texts_hint = 'texts' in type_hints
if has_texts_hint:
    print(f"Type hint for texts: {type_hints['texts']}")

Type hint for texts: typing.List[typing.Union[docling_core.types.doc.document.SectionHeaderItem, docling_core.types.doc.document.ListItem, docling_core.types.doc.document.TextItem, docling_core.types.doc.document.CodeItem]]


In [11]:
# List the concrete classes allowed in `document.texts`.
#
# The member classes are read from the resolved annotation with
# typing.get_origin / typing.get_args rather than by slicing its string form:
# splitting on ']' and ',' mis-parses any annotation with nested generics.
import types
from typing import Union, get_args, get_origin, get_type_hints


def _union_member_names(hint):
    """Return the class names of the first Union found in `hint`.

    `hint` itself is checked first, then its type arguments depth-first.
    Returns an empty list when no Union is present.
    """
    # `X | Y` annotations report types.UnionType (Python 3.10+); typing.Union[...]
    # reports typing.Union. Accept both.
    union_origins = (Union, getattr(types, "UnionType", Union))
    if get_origin(hint) in union_origins:
        return [getattr(member, "__name__", str(member)) for member in get_args(hint)]
    for arg in get_args(hint):
        names = _union_member_names(arg)
        if names:
            return names
    return []


type_hints = get_type_hints(type(document))
if 'texts' in type_hints:
    type_hint_str = str(type_hints['texts'])
    print(f"Full type hint: {type_hint_str}")

    # Nothing further is printed when the annotation contains no Union.
    class_names = _union_member_names(type_hints['texts'])
    if class_names:
        print("\nText element types:")
        for class_name in class_names:
            print(f"- {class_name}")

Full type hint: typing.List[typing.Union[docling_core.types.doc.document.SectionHeaderItem, docling_core.types.doc.document.ListItem, docling_core.types.doc.document.TextItem, docling_core.types.doc.document.CodeItem]]

Text element types:
- SectionHeaderItem
- ListItem
- TextItem
- CodeItem


In [12]:
# For each text element: print its type, a 50-character preview, any
# list-specific attributes, and then every remaining public scalar attribute.
print("\n=== TEXT ELEMENT ATTRIBUTES ===")

# Attribute names excluded from the generic "Other attributes" dump.
already_shown = ['text', 'label', 'level', 'prov', 'list_type', 'list_index']

for i, text_item in enumerate(document.texts):
    print(f"\nText Element #{i+1}:")
    element_type = type(text_item).__name__
    print(f"Type: {element_type}")

    # Preview: first 50 characters, with an ellipsis only when truncated.
    if len(text_item.text) > 50:
        text_preview = text_item.text[:50] + "..."
    else:
        text_preview = text_item.text
    print(f"Text: {text_preview}")

    # List-specific attributes are shown only for ListItem elements.
    if element_type == 'ListItem':
        if hasattr(text_item, 'list_type'):
            print(f"List Type: {text_item.list_type}")
        if hasattr(text_item, 'list_index'):
            print(f"List Index: {text_item.list_index}")

    print("Other attributes:")
    for attr in dir(text_item):
        if attr.startswith('_') or attr in already_shown:
            continue
        try:
            value = getattr(text_item, attr)
            # Skip methods, containers and values that render as an empty string.
            if callable(value) or isinstance(value, (list, dict)) or str(value) == '':
                continue
            print(f"  - {attr}: {value}")
        except Exception:
            # Best-effort dump: attributes that fail to load are silently skipped.
            pass


=== TEXT ELEMENT ATTRIBUTES ===

Text Element #1:
Type: SectionHeaderItem
Text: FuzeHub Manufacturing Grant 2024 Round 2 Applicati...
Other attributes:
  - content_layer: ContentLayer.BODY
  - model_extra: None
  - model_fields_set: {'level', 'orig', 'self_ref', 'parent', 'text'}
  - orig: FuzeHub Manufacturing Grant 2024 Round 2 Application Decision
  - parent: cref='#/body'
  - self_ref: #/texts/0

Text Element #2:
Type: SectionHeaderItem
Text: FuzeHub Fund <fund@fuzehub.com>
Other attributes:
  - content_layer: ContentLayer.BODY
  - model_extra: None
  - model_fields_set: {'level', 'orig', 'self_ref', 'parent', 'text'}
  - orig: FuzeHub Fund <fund@fuzehub.com>
  - parent: cref='#/body'
  - self_ref: #/texts/1

Text Element #3:
Type: TextItem
Text: Wed 2024-07-24 9:26 AM
Other attributes:
  - content_layer: ContentLayer.BODY
  - model_extra: None
  - model_fields_set: {'orig', 'label', 'self_ref', 'parent', 'text'}
  - orig: Wed 2024-07-24 9:26 AM
  - parent: cref='#/body'
  - self_

## Exploring Furniture

In [13]:
# Inspect document.furniture: its type, each child (if any), and its own
# public scalar attributes.
# NOTE(review): the recorded output of this cell ends with warning lines that
# point at the `furniture` accesses, so the attribute presumably warns on
# access (possibly deprecated); confirm against the installed docling-core.
print("\n=== DOCUMENT FURNITURE ===")

# Attributes printed up front for each child, and the names left out of the
# generic per-child dump.
child_headline_attrs = ['text', 'label', 'name', 'content_layer']
child_skipped_attrs = ['text', 'label', 'name', 'content_layer', 'prov', 'children']

if not hasattr(document, 'furniture'):
    print("Document does not have a furniture attribute.")
else:
    furniture = document.furniture
    print(f"Furniture object type: {type(furniture).__name__}")

    has_children = hasattr(furniture, 'children') and furniture.children
    if not has_children:
        print("Furniture has no children.")
    else:
        print(f"Number of furniture children: {len(furniture.children)}")

        for i, child in enumerate(furniture.children):
            print(f"\nFurniture Child #{i+1}:")
            child_type = type(child).__name__
            print(f"Type: {child_type}")

            # Common attributes, printed only when the child has them.
            for attr in child_headline_attrs:
                if hasattr(child, attr):
                    value = getattr(child, attr)
                    print(f"{attr}: {value}")

            # Position information; only the first provenance entry's box is shown.
            if hasattr(child, 'prov') and child.prov:
                page_numbers = [prov.page_no for prov in child.prov]
                print(f"Page Numbers: {page_numbers}")

                first_prov = child.prov[0]
                if hasattr(first_prov, 'bbox'):
                    bbox = first_prov.bbox
                    print(f"Bounding Box: left={bbox.l}, top={bbox.t}, right={bbox.r}, bottom={bbox.b}")

            print("Other attributes:")
            for attr in dir(child):
                if attr.startswith('_') or attr in child_skipped_attrs:
                    continue
                try:
                    value = getattr(child, attr)
                    # Skip methods, containers and values that render as an empty string.
                    if callable(value) or isinstance(value, (list, dict)) or str(value) == '':
                        continue
                    print(f"  - {attr}: {value}")
                except Exception:
                    # Best-effort dump: attributes that fail to load are silently skipped.
                    pass

    # The furniture object's own attributes (children excluded).
    print("\nFurniture attributes:")
    for attr in dir(furniture):
        if attr.startswith('_') or attr in ['children']:
            continue
        try:
            value = getattr(furniture, attr)
            if callable(value) or isinstance(value, (list, dict)) or str(value) == '':
                continue
            print(f"- {attr}: {value}")
        except Exception as e:
            print(f"- {attr}: Error accessing ({str(e)})")


=== DOCUMENT FURNITURE ===
Furniture object type: GroupItem
Furniture has no children.

Furniture attributes:
- content_layer: ContentLayer.FURNITURE
- label: unspecified
- model_extra: None
- model_fields_set: {'content_layer', 'self_ref', 'name'}
- name: _root_
- parent: None
- self_ref: #/furniture


  if hasattr(document, 'furniture'):
  furniture = document.furniture


## Exploring Chunking

In [20]:
# Chunk the document with HierarchicalChunker and show what a single chunk
# looks like: its type, text and metadata attributes.
print("\n=== UNDERSTANDING DOCLING CHUNKING ===")

# 1. Import the basic chunkers
from docling.chunking import BaseChunker, HierarchicalChunker, HybridChunker

# 2. Describe what a chunker returns
print("Chunkers in Docling return DocChunk objects that contain:")
print("- text: The actual text content of the chunk")
print("- meta: Metadata about the chunk (headings, page numbers, etc.)")
print("- Chunks are returned as iterators, so we typically convert to list")

# 3. Examine a single chunk to understand its structure
chunker = HierarchicalChunker()  # Start with the hierarchical chunker
chunks = list(chunker.chunk(document))

if chunks:
    first_chunk = chunks[0]

    # Only append an ellipsis when the preview is actually truncated, so a
    # short chunk is not shown as if text had been cut off.
    preview_limit = 100
    text_preview = first_chunk.text[:preview_limit]
    if len(first_chunk.text) > preview_limit:
        text_preview += "..."

    print("\nExample chunk structure:")
    print(f"- Type: {type(first_chunk).__name__}")
    print(f"- Text length: {len(first_chunk.text)} characters")
    print(f"- Text preview: {text_preview}")

    # Metadata: public, non-callable attributes; containers are summarised by
    # type and size instead of being printed in full.
    print("\nChunk metadata contains:")
    for attr in dir(first_chunk.meta):
        if attr.startswith('_'):
            continue
        value = getattr(first_chunk.meta, attr)  # fetched once per attribute
        if callable(value):
            continue
        if isinstance(value, (list, dict)):
            print(f"- {attr}: {type(value).__name__} with {len(value)} items")
        else:
            print(f"- {attr}: {value}")


=== UNDERSTANDING DOCLING CHUNKING ===
Chunkers in Docling return DocChunk objects that contain:
- text: The actual text content of the chunk
- meta: Metadata about the chunk (headings, page numbers, etc.)
- Chunks are returned as iterators, so we typically convert to list

Example chunk structure:
- Type: DocChunk
- Text length: 22 characters
- Text preview: Wed 2024-07-24 9:26 AM...

Chunk metadata contains:
- captions: None
- doc_items: list with 1 items
- excluded_embed: list with 4 items
- excluded_llm: list with 4 items
- headings: list with 1 items
- model_computed_fields: dict with 0 items
- model_config: dict with 0 items
- model_extra: None
- model_fields: dict with 6 items
- model_fields_set: {'captions', 'headings', 'doc_items', 'origin'}
- origin: mimetype='application/pdf' binary_hash=1563881226811200184 filename='grant_decision_email_single_page.pdf' uri=None
- schema_name: docling_core.transforms.chunker.DocMeta
- version: 1.0.0


In [21]:
# Compare how many chunks each chunker configuration produces, then summarise
# the first chunk of each.
print("\n=== COMPARING DIFFERENT CHUNKERS ===")

# Single source of truth for the custom limit, so the chunker and every label
# that mentions it cannot drift apart (the summary previously said 100 while
# the chunker used 1000).
custom_max_tokens = 1000


def _print_chunker_summary(position, label, chunks):
    """Print the chunk count and first-chunk details for one chunker.

    position: number shown before the label in the summary list.
    label:    human-readable chunker description.
    chunks:   list of chunks produced by that chunker (may be empty).
    """
    print(f"{position}. {label}: {len(chunks)} chunks")
    if not chunks:
        # Avoid an IndexError on chunks[0] when a chunker yields nothing.
        print("   - No chunks produced")
        return
    print(f"   - First chunk length: {len(chunks[0].text)} chars")
    print(f"   - First chunk headings: {chunks[0].meta.headings}")


# 1. HierarchicalChunker
hierarchical_chunker = HierarchicalChunker()
hierarchical_chunks = list(hierarchical_chunker.chunk(document))
print(f"HierarchicalChunker produced {len(hierarchical_chunks)} chunks")

# 2. HybridChunker with default settings
hybrid_chunker = HybridChunker(tokenizer=tokenizer)
hybrid_chunks = list(hybrid_chunker.chunk(document))
print(f"HybridChunker (default) produced {len(hybrid_chunks)} chunks")

# 3. HybridChunker with custom max_tokens
hybrid_chunker_custom = HybridChunker(tokenizer=tokenizer, max_tokens=custom_max_tokens)
hybrid_custom_chunks = list(hybrid_chunker_custom.chunk(document))
print(f"HybridChunker (max_tokens={custom_max_tokens}) produced {len(hybrid_custom_chunks)} chunks")

# Print a summary of each chunker's output, separated by blank lines
print("\nChunker comparison summary:")
_print_chunker_summary(1, "HierarchicalChunker", hierarchical_chunks)
print()
_print_chunker_summary(2, "HybridChunker (default)", hybrid_chunks)
print()
_print_chunker_summary(3, f"HybridChunker (max_tokens={custom_max_tokens})", hybrid_custom_chunks)


=== COMPARING DIFFERENT CHUNKERS ===
HierarchicalChunker produced 13 chunks
HybridChunker (default) produced 1 chunks
HybridChunker (max_tokens=1000) produced 1 chunks

Chunker comparison summary:
1. HierarchicalChunker: 13 chunks
   - First chunk length: 22 chars
   - First chunk headings: ['FuzeHub Fund <fund@fuzehub.com>']

2. HybridChunker (default): 1 chunks
   - First chunk length: 1992 chars
   - First chunk headings: ['FuzeHub Fund <fund@fuzehub.com>']

3. HybridChunker (max_tokens=100): 1 chunks
   - First chunk length: 1992 chars
   - First chunk headings: ['FuzeHub Fund <fund@fuzehub.com>']


In [22]:
# Investigate why HybridChunker is creating only one chunk
print("\n=== INVESTIGATING HYBRIDCHUNKER BEHAVIOR ===")

# Limits used in this cell, named once so the code and the printed messages
# always agree.
# NOTE(review): 2048 is an assumed default that is not read from the chunker;
# confirm it against the installed docling version and the tokenizer in use.
assumed_default_max_tokens = 2048
restrictive_max_tokens = 100

# 1. Check the total token count of the document
# Each element's text is followed by one space, so the total keeps a trailing space.
full_text = "".join(text_item.text + " " for text_item in document.texts)
total_tokens = tokenizer.count_tokens(full_text)
print(f"Total document text: {len(full_text)} characters, {total_tokens} tokens")

# 2. Check if the document is small enough to fit in one chunk
print(f"Default max_tokens for HybridChunker: {assumed_default_max_tokens}")
print(f"Is document smaller than default max_tokens? {total_tokens < assumed_default_max_tokens}")

# 3. Try with explicit parameters and debug output
# NOTE(review): min_chunk_chars / min_chunk_size_ratio are passed through as
# written; confirm that the installed HybridChunker actually honours them.
print("\n=== TRYING HYBRIDCHUNKER WITH EXPLICIT PARAMETERS ===")
hybrid_chunker_debug = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=restrictive_max_tokens,  # Very small limit
    merge_peers=False,  # Don't merge sections
    min_chunk_chars=10,  # Allow very small chunks
    min_chunk_size_ratio=0.1  # Allow chunks as small as 10% of max_tokens
)

# Get chunks and examine them
debug_chunks = list(hybrid_chunker_debug.chunk(document))
print(f"HybridChunker with restrictive parameters produced {len(debug_chunks)} chunks")

# The diagnostics below only run when the chunker still returned a single chunk
if len(debug_chunks) == 1:
    chunk_tokens = tokenizer.count_tokens(debug_chunks[0].text)
    print(f"Single chunk token count: {chunk_tokens}")
    print(f"Exceeds max_tokens limit of {restrictive_max_tokens}? {chunk_tokens > restrictive_max_tokens}")

    # Check if the document has a structure that prevents splitting
    section_header_count = sum(
        1 for item in document.texts if type(item).__name__ == 'SectionHeaderItem'
    )
    print("\nDocument structure analysis:")
    print(f"Number of text elements: {len(document.texts)}")
    print(f"Number of section headers: {section_header_count}")

    # Candidate explanations (hypotheses, not verified by this cell)
    print("\nPossible reasons for not splitting:")
    print("1. Document might have a structure that HybridChunker considers atomic")
    print("2. There might be a minimum chunk size that prevents splitting")
    print("3. The chunker might be configured to keep certain elements together")


=== INVESTIGATING HYBRIDCHUNKER BEHAVIOR ===
Total document text: 2087 characters, 575 tokens
Default max_tokens for HybridChunker: 2048
Is document smaller than default max_tokens? True

=== TRYING HYBRIDCHUNKER WITH EXPLICIT PARAMETERS ===
HybridChunker with restrictive parameters produced 16 chunks


In [23]:
# Re-run the restrictive HybridChunker configuration and report, per chunk,
# its size, headings, a short preview and whether it exceeds the token limit.
print("\n=== EXAMINING HYBRIDCHUNKER CHUNKS ===")

hybrid_chunker_detailed = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=100,  # Small token limit
    merge_peers=False,  # Don't merge sections
    min_chunk_chars=10,  # Allow small chunks
    min_chunk_size_ratio=0.1  # Allow chunks as small as 10% of max_tokens
)

detailed_chunks = list(hybrid_chunker_detailed.chunk(document))
print(f"Number of chunks: {len(detailed_chunks)}")

for i, chunk in enumerate(detailed_chunks):
    token_count = tokenizer.count_tokens(chunk.text)
    print(f"\nChunk {i+1}:")
    print(f"- Length: {len(chunk.text)} chars, {token_count} tokens")
    print(f"- Headings: {chunk.meta.headings}")

    # Preview: first 50 characters, with an ellipsis only when truncated.
    if len(chunk.text) > 50:
        preview = chunk.text[:50] + "..."
    else:
        preview = chunk.text
    print(f"- Preview: {preview}")

    # Chunks within the 100-token limit need no further diagnostics.
    if token_count <= 100:
        continue

    print("  WARNING: Chunk exceeds the max_tokens limit of 100")

    # Show which item types make up the oversized chunk.
    doc_items = getattr(chunk.meta, 'doc_items', [])
    item_types = [type(item).__name__ for item in doc_items]
    print(f"  Contains item types: {set(item_types)}")

    # A chunk built from a single item is reported as not splittable.
    if len(doc_items) == 1:
        print(f"  Single item chunk - cannot be split further")


=== EXAMINING HYBRIDCHUNKER CHUNKS ===
Number of chunks: 16

Chunk 1:
- Length: 22 chars, 13 tokens
- Headings: ['FuzeHub Fund <fund@fuzehub.com>']
- Preview: Wed 2024-07-24 9:26 AM

Chunk 2:
- Length: 44 chars, 17 tokens
- Headings: ['FuzeHub Fund <fund@fuzehub.com>']
- Preview: To:  Samantha Roberts <sroberts@gc.cuny.edu>

Chunk 3:
- Length: 81 chars, 36 tokens
- Headings: ['FuzeHub Fund <fund@fuzehub.com>']
- Preview: Cc:  Yuki Chen <xchen4@gc.cuny.edu>;  markk@kepcop...

Chunk 4:
- Length: 128 chars, 27 tokens
- Headings: ['FuzeHub Fund <fund@fuzehub.com>']
- Preview: * This email originates from a sender outside of C...

Chunk 5:
- Length: 13 chars, 7 tokens
- Headings: ['FuzeHub Fund <fund@fuzehub.com>']
- Preview: July 24, 2024

Chunk 6:
- Length: 26 chars, 6 tokens
- Headings: ['FuzeHub Fund <fund@fuzehub.com>']
- Preview: Dear Dr. Samantha Roberts,

Chunk 7:
- Length: 346 chars, 84 tokens
- Headings: ['FuzeHub Fund <fund@fuzehub.com>']
- Preview: Thank you for  subming a gran