In [1]:
# Import the functions
from app.document_conversion.document_pipeline import convert_document, chunk_converted_document, embed_and_upload_chunks
from app.document_conversion.chunk_visualizer import chunks_to_dataframe, display_chunk_samples



In [2]:
# Print every chunking strategy the pipeline offers, one "name: description" per line
from app.document_conversion.chunking import get_available_chunking_strategies

# Mapping of strategy name -> human-readable description
strategies = get_available_chunking_strategies()
for name, summary in strategies.items():
    print(name, summary, sep=": ")


default: Standard chunking with moderate chunk size
balanced: Balanced approach between context preservation and chunk size
fine_grained: Smaller chunks for more precise retrieval
context: Larger chunks that preserve more context for QA and summarization
hierarchical: Chunks based on document's natural hierarchy and structure


In [3]:
# Keep only the strategy names; the strategy comparison loop iterates over this list
strategy_catalog = get_available_chunking_strategies()
available_strategies = [*strategy_catalog.keys()]
print(f"Available chunking strategies: {available_strategies}")

Available chunking strategies: ['default', 'balanced', 'fine_grained', 'context', 'hierarchical']


In [4]:
# Convert the source PDF once; every chunking strategy below reuses this result.
import os
from pathlib import Path

# Root of the data tree. Defaults to the location this notebook was originally
# run from; set the NANOBOT_DATA_DIR environment variable to run it elsewhere
# without editing the cell.
DATA_DIR = Path(os.environ.get("NANOBOT_DATA_DIR", "/home/sng/nanobot-poc/data"))

# convert_document was originally called with a plain string path, so keep it a str
doc_path = str(DATA_DIR / "original" / "Harvard" / "cns-safety-manual.pdf")
converted_doc = convert_document(doc_path, save_intermediate=True)


Docling is now converting /home/sng/nanobot-poc/data/original/Harvard/cns-safety-manual.pdf...
Saving docling and md...
Saved processed documents to /home/sng/nanobot-poc/data/parsed-doc/cns-safety-manual
Document conversion complete!


In [6]:
# Step 2: run each chunking strategy against the one converted document

# strategy name -> chunks, DataFrame, and chunk-length statistics
results = {}

for strategy in available_strategies:
    print(f"\n\n===== Testing {strategy} chunking strategy =====")

    # Re-chunk the same converted document using the current strategy
    chunks = chunk_converted_document(converted_doc, chunking_strategy=strategy)

    # Tabulate the chunks, then publish the table as a notebook-level variable
    # (df_default, df_balanced, ...) so later cells can reference it by name
    current_df = chunks_to_dataframe(chunks)
    df_name = f"df_{strategy}"
    globals()[df_name] = current_df

    # Length statistics are computed once here and reused by the report below
    text_lengths = current_df['text_length']
    stats = dict(
        chunks=chunks,
        dataframe=current_df,
        total_chunks=len(current_df),
        avg_length=text_lengths.mean(),
        min_length=text_lengths.min(),
        max_length=text_lengths.max(),
    )
    results[strategy] = stats

    # Preview a handful of chunks for a quick visual check
    print(f"\nSample chunks for {strategy} strategy:")
    samples = display_chunk_samples(current_df, n_samples=3)
    print(samples)

    # Summary report for this strategy
    print(f"Total chunks: {stats['total_chunks']}")
    print(f"Average chunk length: {stats['avg_length']:.1f} characters")
    print(f"Min chunk length: {stats['min_length']} characters")
    print(f"Max chunk length: {stats['max_length']} characters")

    # Confirm which notebook-level name now holds the DataFrame
    print(f"Created DataFrame '{df_name}' with {stats['total_chunks']} rows")




===== Testing default chunking strategy =====
Now chunking document using 'default' strategy...
Now processing chunks...
Done! Returning 83 chunks ready for preview
✅ Created DataFrame with 83 chunks

Sample chunks for default strategy:
Sample of 3 chunks:
   chunk_id                                       text_preview  text_length  \
0        31          § Outer gloves\n§ Face shield\n§ Apron...           36   
1        36  All chemical containers must be labeled with t...          253   
2        67  Biological effects can result from exposure to...         1900   

       metadata_filename metadata_page_numbers             metadata_title  \
0  cns-safety-manual.pdf                    23  Doff (Removal) sequence :   
1  cns-safety-manual.pdf                    25          Chemical Labeling   
2  cns-safety-manual.pdf                    45  Radio-Frequency Exposure:   

           metadata_headings metadata_chunking_strategy  
0  Doff (Removal) sequence :                    default  

In [7]:
# One summary line per strategy: chunk count and mean chunk length
print("\n\n===== Chunking Strategy Comparison =====")
for name in results:
    stats = results[name]
    print(f"{name}: {stats['total_chunks']} chunks, avg length: {stats['avg_length']:.1f}")




===== Chunking Strategy Comparison =====
default: 83 chunks, avg length: 830.5
balanced: 87 chunks, avg length: 792.3
fine_grained: 269 chunks, avg length: 255.6
context: 83 chunks, avg length: 830.5
hierarchical: 265 chunks, avg length: 259.4


In [8]:
import pandas as pd

# Make pandas render DataFrames without truncation so full chunk text is visible
display_options = {
    'display.max_rows': None,           # show all rows
    'display.max_columns': None,        # show all columns
    'display.width': 1000,              # wider output for better display
    'display.max_colwidth': None,       # show the full content of each cell
    'display.expand_frame_repr': True,  # enable wrapping
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [10]:
# Display the chunks produced by the 'hierarchical' strategy (same object as df_hierarchical)
results["hierarchical"]["dataframe"]

Unnamed: 0,chunk_id,text,text_length,metadata_filename,metadata_page_numbers,metadata_title,metadata_headings,metadata_chunking_strategy
0,0,∞ Push a Yellow 'Emergency Gas Off' button\n∞ Evacuate the building immediately\n∞ Alert others on the way out\n∞ Meet outside in front of Pierce Hall,147,cns-safety-manual.pdf,4,IF A TOXIC GAS LEAK OCCURS,IF A TOXIC GAS LEAK OCCURS,hierarchical
1,1,∞ Evacuate the building immediately\n∞ Alert others on the way out\n∞ Meet outside in front of Pierce Hall,104,cns-safety-manual.pdf,4,IF YOU HEAR THE FIRE ALARM,IF YOU HEAR THE FIRE ALARM,hierarchical
2,2,"∞ If the victim is by themselves, stay close to assist.\n∞ Call 911 and demand an ambulance.\n∞ If requested, assist the person in removing contaminated clothing being careful not to become contaminated.\n∞ Advise person to stay in shower 5 minutes for HF contamination, and 15 minutes for all other chemicals.\n∞ If the chemical is Hydrofluoric Acid (HF) assist the victim in applying Calgonate (calcium gluconate) while wearing clean gloves.\n∞ Get the MSDS sheet (from Fire Command Center or print from on-line) and hand it to the Emergency Response Team or fire department.\n∞ Notify a staff member or call the Operations Center at 617-495-5560.",643,cns-safety-manual.pdf,4,IF YOU SEE SOMEONE UNDER THE EMERGENCY SHOWER OR EYE WASH,IF YOU SEE SOMEONE UNDER THE EMERGENCY SHOWER OR EYE WASH,hierarchical
3,3,∞ Approach the nearest emergency shower or eye wash and pull the handle to activate.\n∞ Demand help but remain under the emergency shower or eye wash.,149,cns-safety-manual.pdf,4,IF YOU GET CHEMICALS ON YOUR SKIN,IF YOU GET CHEMICALS ON YOUR SKIN,hierarchical
4,4,∞ Pull fire alarm located at emergency exit doors\n∞ Evacuate the building immediately.\n∞ Alert others on the way out.\n∞ Meet outside in front of Pierce Hall.,157,cns-safety-manual.pdf,4,IF A FIRE STARTS,IF A FIRE STARTS,hierarchical
5,5,"o Prior to access to CNS laboratories, ensure all employees, students, and/ outside users have been trained on this document.\no Enforce the contents of this document in areas that you supervise.\no Periodically review the contents of this document with the staff.\no Administer the reading and recordkeeping of this manual for all CNS staff and all users assigned to work in the CNS areas.",387,cns-safety-manual.pdf,5,CNS Management and Administration,CNS Management and Administration,hierarchical
6,6,"As the Safety Manual author, the LISE Health, and Safety Officer reviews this manual periodically to ensure its continued effectiveness. Serve as point of contact for any questions relating to this document or any other environmental health and safety concerns. Enforce the contents of this document in areas that you supervise.",329,cns-safety-manual.pdf,5,LISE Health and Safety Officer,LISE Health and Safety Officer,hierarchical
7,7,Safety: ensure a safe working environment.,42,cns-safety-manual.pdf,5,Purpose,Purpose,hierarchical
8,8,"Success: promote successful laboratory operation, scientifically and educationally productive.",94,cns-safety-manual.pdf,5,Purpose,Purpose,hierarchical
9,9,"Successful laboratory operation relies on individual user's understanding, participation, and selfdiscipline.",109,cns-safety-manual.pdf,5,Purpose,Purpose,hierarchical


In [9]:
# Style one strategy's chunk table with pandas built-in styling.
# The frame is taken explicitly from `results` (filled by the strategy loop):
# the bare name `df` was never defined in this notebook, so the cell failed on a
# fresh kernel. 'balanced' matches the strategy shown in the saved output.
styled_source_df = results["balanced"]["dataframe"]

styled_df = styled_source_df.style.set_properties(**{
    'white-space': 'pre-wrap',   # CSS: keep the newlines embedded in chunk text
    'text-align': 'left',
    'font-size': '13px'
}).set_table_styles([{
    # Header cells: light grey background, centred labels
    'selector': 'th',
    'props': [('background-color', '#f0f0f0'), ('text-align', 'center')]
}])

In [10]:
# Render the styled chunk table as this cell's output
styled_df

Unnamed: 0,chunk_id,text,text_length,metadata_filename,metadata_page_numbers,metadata_title,metadata_headings,metadata_chunking_strategy
0,0,∞ Push a Yellow 'Emergency Gas Off' button ∞ Evacuate the building immediately ∞ Alert others on the way out ∞ Meet outside in front of Pierce Hall,147,cns-safety-manual.pdf,4,IF A TOXIC GAS LEAK OCCURS,IF A TOXIC GAS LEAK OCCURS,balanced
1,1,∞ Evacuate the building immediately ∞ Alert others on the way out ∞ Meet outside in front of Pierce Hall,104,cns-safety-manual.pdf,4,IF YOU HEAR THE FIRE ALARM,IF YOU HEAR THE FIRE ALARM,balanced
2,2,"∞ If the victim is by themselves, stay close to assist. ∞ Call 911 and demand an ambulance. ∞ If requested, assist the person in removing contaminated clothing being careful not to become contaminated. ∞ Advise person to stay in shower 5 minutes for HF contamination, and 15 minutes for all other chemicals. ∞ If the chemical is Hydrofluoric Acid (HF) assist the victim in applying Calgonate (calcium gluconate) while wearing clean gloves. ∞ Get the MSDS sheet (from Fire Command Center or print from on-line) and hand it to the Emergency Response Team or fire department. ∞ Notify a staff member or call the Operations Center at 617-495-5560.",643,cns-safety-manual.pdf,4,IF YOU SEE SOMEONE UNDER THE EMERGENCY SHOWER OR EYE WASH,IF YOU SEE SOMEONE UNDER THE EMERGENCY SHOWER OR EYE WASH,balanced
3,3,∞ Approach the nearest emergency shower or eye wash and pull the handle to activate. ∞ Demand help but remain under the emergency shower or eye wash.,149,cns-safety-manual.pdf,4,IF YOU GET CHEMICALS ON YOUR SKIN,IF YOU GET CHEMICALS ON YOUR SKIN,balanced
4,4,∞ Pull fire alarm located at emergency exit doors ∞ Evacuate the building immediately. ∞ Alert others on the way out. ∞ Meet outside in front of Pierce Hall.,157,cns-safety-manual.pdf,4,IF A FIRE STARTS,IF A FIRE STARTS,balanced
5,5,"o Prior to access to CNS laboratories, ensure all employees, students, and/ outside users have been trained on this document. o Enforce the contents of this document in areas that you supervise. o Periodically review the contents of this document with the staff. o Administer the reading and recordkeeping of this manual for all CNS staff and all users assigned to work in the CNS areas.",387,cns-safety-manual.pdf,5,CNS Management and Administration,CNS Management and Administration,balanced
6,6,"As the Safety Manual author, the LISE Health, and Safety Officer reviews this manual periodically to ensure its continued effectiveness. Serve as point of contact for any questions relating to this document or any other environmental health and safety concerns. Enforce the contents of this document in areas that you supervise.",329,cns-safety-manual.pdf,5,LISE Health and Safety Officer,LISE Health and Safety Officer,balanced
7,7,"Safety: ensure a safe working environment. Success: promote successful laboratory operation, scientifically and educationally productive. Successful laboratory operation relies on individual user's understanding, participation, and selfdiscipline. Everyone, whether faculty member, student user, or staff engineer, is equally important to the success of the protocols established at CNS. Successful operation is a shared responsibility among all users and staff members. Satisfaction: communicate operational strategies that satisfy user's needs. Awareness: Provide guidance and basic awareness. Prepare experimentalists for their future roles.",645,cns-safety-manual.pdf,5,Purpose,Purpose,balanced
8,8,"This manual is required reading for all employees and users of CNS laboratories. This manual describes the safety hazards, engineering controls, and safety policies common to laboratories. Study and mastery of the material in this manual are obligatory but insufficient for laboratory access. Each laboratory and instrument also have dedicated training including additional safety detail.",388,cns-safety-manual.pdf,6,Scope,Scope,balanced
9,9,"SDS (Safety Data Sheets) Function: For determining chemical hazards and recommended precautions for use. Use: Look up information on all chemicals prior to use. Locations: If you have a Harvard ID Key access all SDS's on the Harvard EHS website https://www.ehs.harvard.edu/news/material-safety-data-sheets-msds Hard copies in the Fire Command Center LISE lobby. Nitrile Gloves Function : Protects cleanroom surfaces from contamination and offers wearer some splash protection against most chemicals. Use : Don before entering cleanroom, Soft Materials Cleanroom (SMCR), North Materials Synthesis rooms G06 and G05. In other labs such as B15A (imaging sample prep room) required when handling chemicals. If you suspect chemical contamination, replace gloves as soon as possible. Location : At or near the entrances to each area.",827,cns-safety-manual.pdf,6,Emergency & Personal Protective Equipment,Emergency & Personal Protective Equipment,balanced
