In [3]:
# Import the functions
import os
from pathlib import Path

from app.document_conversion.document_pipeline import convert_document, chunk_converted_document
from app.document_conversion.chunk_visualizer import chunks_to_dataframe, display_chunk_samples




In [4]:
# List the chunking strategies the pipeline offers, with their descriptions
from app.document_conversion.chunking import get_available_chunking_strategies

# Fetch the strategy -> description mapping and print one "name: description" line per entry
strategies = get_available_chunking_strategies()
for strategy in strategies:
    description = strategies[strategy]
    print(f"{strategy}: {description}")


default: Standard chunking with moderate chunk size
balanced: Balanced approach between context preservation and chunk size
fine_grained: Smaller chunks for more precise retrieval
context: Larger chunks that preserve more context for QA and summarization
hierarchical: Chunks based on document's natural hierarchy and structure


In [5]:
# Keep only the strategy names (the mapping's keys) as a plain list for the cells below
available_strategies = [*get_available_chunking_strategies().keys()]
print(f"Available chunking strategies: {available_strategies}")

Available chunking strategies: ['default', 'balanced', 'fine_grained', 'context', 'hierarchical']


In [6]:
# Process a document and get chunks for preview.
# The checkout root defaults to the location this notebook was written against;
# set the NANOBOT_POC_ROOT environment variable to run it on another machine
# without editing this cell.
project_root = Path(os.environ.get("NANOBOT_POC_ROOT", "/home/sng/nanobot-poc"))

# Kept as a plain string so convert_document receives the same type as before
doc_path = str(
    project_root
    / "data" / "original" / "ASRC" / "tool_manuals"
    / "NIKON NIS ELEMENTS D Software Manual.pdf"
)

# Fail early with a readable message instead of erroring somewhere inside the converter
if not Path(doc_path).is_file():
    raise FileNotFoundError(f"Document to convert not found: {doc_path}")

# The result is reused by every chunking strategy in the cells below
converted_doc = convert_document(doc_path, save_intermediate=True)


Docling is now converting /home/sng/nanobot-poc/data/original/ASRC/tool_manuals/NIKON NIS ELEMENTS D Software Manual.pdf...
Saving docling and md...
Saved processed documents to /home/sng/nanobot-poc/data/parsed-doc/NIKON NIS ELEMENTS D Software Manual
Document conversion complete!


In [None]:
# Step 2: Run every chunking strategy against the single converted document

# strategy name -> {"chunks", "dataframe", "total_chunks", "avg_length", "min_length", "max_length"}
results = {}

# Each pass also publishes its DataFrame as a notebook-level variable
for strategy in available_strategies:
    print(f"\n\n===== Testing {strategy} chunking strategy =====")

    # Re-chunk the already converted document using the current strategy
    chunks = chunk_converted_document(converted_doc, chunking_strategy=strategy)

    # Tabulate the chunks, then expose the frame under a per-strategy name
    # (df_default, df_balanced, ...) so later cells can refer to it directly
    df_name = f"df_{strategy}"
    current_df = chunks_to_dataframe(chunks)
    globals()[df_name] = current_df

    # Record the raw chunks, the frame, and its length statistics for the comparison cell
    results[strategy] = {
        "chunks": chunks,
        "dataframe": current_df,
        "total_chunks": len(current_df),
        "avg_length": current_df['text_length'].mean(),
        "min_length": current_df['text_length'].min(),
        "max_length": current_df['text_length'].max(),
    }

    # Show a few example chunks; whatever the helper returns is printed as well
    print(f"\nSample chunks for {strategy} strategy:")
    samples = display_chunk_samples(current_df, n_samples=3)
    print(samples)

    # Per-strategy report, computed from the frame as it stands after the sample display
    print("\n".join([
        f"Total chunks: {len(current_df)}",
        f"Average chunk length: {current_df['text_length'].mean():.1f} characters",
        f"Min chunk length: {current_df['text_length'].min()} characters",
        f"Max chunk length: {current_df['text_length'].max()} characters",
    ]))

    # Confirm which notebook-level variable now holds this strategy's frame
    print(f"Created DataFrame '{df_name}' with {len(current_df)} rows")


In [None]:
# Side-by-side summary: one line per strategy with its chunk count and mean chunk length
print("\n\n===== Chunking Strategy Comparison =====")
for strategy in results:
    data = results[strategy]
    print(f"{strategy}: {data['total_chunks']} chunks, avg length: {data['avg_length']:.1f}")


In [8]:
import pandas as pd

# Display configuration: render DataFrames in full, with nothing truncated
DISPLAY_OPTIONS = {
    'display.max_rows': None,            # no cap on the number of rows shown
    'display.max_columns': None,         # no cap on the number of columns shown
    'display.width': 1000,               # wide output area for text rendering
    'display.max_colwidth': None,        # never truncate the content of a cell
    'display.expand_frame_repr': True,   # let wide frames wrap across lines
}

# Apply the options in the order listed above
for _option_name, _option_value in DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [None]:
# Show the DataFrame for the 'context' strategy. The chunking loop publishes it
# under this name via globals()[f"df_{strategy}"], so that cell must run first.
df_context

In [9]:
# Style the DataFrame with pandas built-in styling.
# This cell previously read from `df`, a name that no visible cell defines, so it
# raised NameError after Restart & Run All. df_context is the frame displayed in
# the preceding cell (the chunking loop creates it), so that is the one styled here.
# NOTE(review): swap in another df_<strategy> frame if a different one was intended.
styled_df = df_context.style.set_properties(**{
    'white-space': 'pre-wrap',   # keep line breaks inside chunk text and wrap long lines
    'text-align': 'left',
    'font-size': '13px'
}).set_table_styles([{
    # Header cells: light grey background, centred labels
    'selector': 'th',
    'props': [('background-color', '#f0f0f0'), ('text-align', 'center')]
}])

In [None]:
# Render the styled table as this cell's output
styled_df