In [1]:
from app.database.db_inspector import (
    get_chunks_by_strategy,
    get_chunks_by_filename,
    chunks_to_dataframe,
    inspect_database,
    get_chunks_by_range,
    analyze_chunks,
    print_chunk_sample
)

In [11]:
import pandas as pd

# Lift every pandas display limit so frames and series are rendered untruncated.
# These are global options: they stay in effect for all later output of this kernel.
for option_name in (
    'display.max_rows',      # no cap on the number of rows shown
    'display.max_columns',   # no cap on the number of columns shown
    'display.width',         # no fixed line width
    'display.max_colwidth',  # never shorten the content of a cell
):
    pd.set_option(option_name, None)



In [2]:
# Print an overview of the chunk table: connection status, row count,
# column names/types, the metadata fields seen in samples, and one sample record.
inspect_database()

✅ Connected to database successfully
📊 Table contains 232 rows

📋 Table columns:
  • id (integer)
  • text (text)
  • vector (USER-DEFINED)
  • metadata (jsonb)

🔑 Metadata fields found in samples:
  • filename
  • page_numbers
  • title

📝 Sample metadata record:
  • title: 1.0 Purpose
  • filename: cns-user-manual.pdf
  • page_numbers: [1]

✅ Database inspection complete


In [8]:
# Fetch rows 159 through 232 of the chunk table.
# NOTE(review): the bounds are hard-coded; 232 matches the row count reported
# when this notebook was last run, so adjust them if the table changes.
# Presumably both bounds are inclusive (74 rows) — confirm against db_inspector.
data = get_chunks_by_range(start_row=159, end_row=232)

✅ Retrieved chunks 159 to 232 of 232 total


In [14]:
# Convert the retrieved chunks to a table (presumably a pandas DataFrame) and
# display its 'text' column; with the display limits lifted, cell text is shown in full.
df = chunks_to_dataframe(data)
df['text']

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [6]:
# Get chunks produced with the 'fine_grained' chunking strategy
fine_grained_chunks = get_chunks_by_strategy(strategy="fine_grained")

✅ Retrieved 74 chunks with 'fine_grained' strategy


In [7]:



# Analyze the fine_grained chunks.
# The result is read below as stats['count'] and stats['text_length']['avg'].
stats = analyze_chunks(fine_grained_chunks)
print(f"Found {stats['count']} fine_grained chunks")
print(f"Average chunk length: {stats['text_length']['avg']:.1f} characters")
# Print a sample of two chunks (text and metadata) for a quick visual check
print_chunk_sample(fine_grained_chunks, count=2)

Found 74 fine_grained chunks
Average chunk length: 328.2 characters

--- Chunk 1/2 (ID: 194) ---
Text: 1.1 This document describes the policies and administrative requirements for use of the laboratories of the Center for Nanoscale Systems.
Metadata:
  • title: 1.0 Purpose
  • filename: cns-user-manual.pdf
  • headings: ['1.0 Purpose']
  • page_numbers: [1]
  • chunking_strategy: fine_grained

--- Chunk 2/2 (ID: 195) ---
Text: 2.1 The User agrees to observe all applicable governmental, Harvard University, and CNS policies, rules and regulations that pertain to his/her conduct on campus and at CNS facilities.  For more in...
Metadata:
  • title: 2.0 University Policies
  • filename: cns-user-manual.pdf
  • headings: ['2.0 University Policies']
  • page_numbers: [1]
  • chunking_strategy: fine_grained


In [None]:
# Get chunks from a specific file
# Collect every chunk whose source file is the CNS user manual, then report how many came back
manual_filename = "cns-user-manual.pdf"
harvard_chunks = get_chunks_by_filename(filename=manual_filename)
print(f"Found {len(harvard_chunks)} chunks from Harvard documents")

✅ Retrieved 74 chunks matching filename 'cns-user-manual.pdf'
Found 74 chunks from Harvard documents


In [6]:
# Tabulate the chunks retrieved for the manual and show the last rows as a spot check
# of the end of the document (columns include id, text, text_length and meta_* fields).
df_harvard = chunks_to_dataframe(harvard_chunks)
df_harvard.tail()

Unnamed: 0,id,text,text_length,meta_title,meta_filename,meta_page_numbers,meta_headings,meta_chunking_strategy
69,183,"as noted below, any intellectual property or p...",941,CNS USER MANUAL,cns-user-manual.pdf,[11],[CNS USER MANUAL],default
70,184,18.1 The non-Harvard organization sending a Us...,1513,18.0 Liability,cns-user-manual.pdf,[11],[18.0 Liability],default
71,185,18.6 The non-Harvard organization sending a Us...,218,CNS USER MANUAL,cns-user-manual.pdf,[12],[CNS USER MANUAL],default
72,186,"19.1 CNS operates as a shared-use facility, an...",856,19.0 Intellectual Property,cns-user-manual.pdf,[12],[19.0 Intellectual Property],default
73,187,20.1 The User and User's host institution furt...,564,20.0 Confidentiality,cns-user-manual.pdf,[12],[20.0 Confidentiality],default
