# Chunking parsed text

In [1]:
import os

from chonkie import *

In [None]:
# Sizing and model settings shared by every chunker built in this cell.
chunk_size = 2048
chunk_overlap = int(chunk_size * 0.2)  # overlap is 20% of the chunk size
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'

# Keyword-argument bundles reused below so each constructor call stays short.
_overlap_kwargs = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
_embedding_kwargs = dict(chunk_size=chunk_size, embedding_model=embedding_model)

# Chunkers applied to plain-text input.
# Catalogue of available chunkers:
# https://github.com/chonkie-inc/chonkie?tab=readme-ov-file#chunkers
chunkers_txt = {
    'token_txt': TokenChunker(**_overlap_kwargs),
    'sentence_txt': SentenceChunker(**_overlap_kwargs),
    'recursive_txt': RecursiveChunker(chunk_size=chunk_size),
    'semantic_txt': SemanticChunker(**_embedding_kwargs),
    'sdpm_txt': SDPMChunker(**_embedding_kwargs),
    'neural': NeuralChunker(),

    # Disabled: LateChunker emitted this warning
    #   Token indices sequence length is longer than the specified maximum
    #   sequence length for this model (16333 > 256). Running this sequence
    #   through the model will result in indexing errors
    # 'late': LateChunker(chunk_size=chunk_size, embedding_model=embedding_model),

    # Disabled: SlumberChunker requires a Gemini interface model and an api_key
    # 'slumber': SlumberChunker(chunk_size=chunk_size)
}

# Chunkers applied to Markdown input; all but the token chunker are built from
# the 'markdown' recipe.
chunkers_md = {
    'token_md': TokenChunker(**_overlap_kwargs),
    'sentence_md': SentenceChunker.from_recipe('markdown', lang='en', **_overlap_kwargs),
    'recursive_md': RecursiveChunker.from_recipe('markdown', lang='en', chunk_size=chunk_size),
    'semantic_md': SemanticChunker.from_recipe('markdown', lang='en', **_embedding_kwargs),

    # Disabled: the SDPM markdown recipe emitted this warning
    #   Token indices sequence length is longer than the specified maximum
    #   sequence length for this model (290 > 256). Running this sequence
    #   through the model will result in indexing errors
    # 'sdpm_md': SDPMChunker.from_recipe('markdown', lang='en', chunk_size=chunk_size, embedding_model=embedding_model),

    # Disabled: the Late markdown recipe emitted this warning
    #   Token indices sequence length is longer than the specified maximum
    #   sequence length for this model (699 > 256). Running this sequence
    #   through the model will result in indexing errors
    # 'late_md': LateChunker.from_recipe('markdown', lang='en', chunk_size=chunk_size, embedding_model=embedding_model)
}

In [100]:
def log_chunk_strategies(text: str, chunkers: dict, output_name: str, file_type: str) -> None:
    """Run every chunker on ``text`` and write each result to its own file.

    For each ``(name, chunker)`` pair the chunker is called with ``text`` and
    the ``content`` of the returned chunks is joined with a blank line and
    written to ``output_name + name + file_type``.

    Args:
        text: The text to chunk.
        chunkers: Mapping of chunker name to a callable that takes the text
            and returns an iterable of chunks. Each chunk is assumed to expose
            a string ``content`` attribute.
        output_name: Path prefix prepended to the chunker name, e.g. a
            directory path ending in ``/``.
        file_type: Suffix appended to the chunker name, e.g. ``'.txt'``.

    Raises:
        OSError: If the output directory cannot be created or a file cannot
            be written.
    """
    for chunker_name, chunker in chunkers.items():
        # Chunk before opening the output so a failing chunker does not leave
        # an empty or truncated file behind.
        chunks = chunker(text)
        output_path = output_name + chunker_name + file_type

        # Create the target directory on first use instead of failing with
        # FileNotFoundError on a fresh checkout.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Explicit UTF-8 keeps the output independent of the platform's
        # default encoding, which may not represent all characters in the text.
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write('\n\n'.join(chunk.content for chunk in chunks))

In [101]:
# Load the pdfplumber text output, then chunk it with every plain-text chunker.
with open('parsers/outputs/pdfplumber.txt') as parsed_file:
    pdfplumber_text = parsed_file.read()

log_chunk_strategies(pdfplumber_text, chunkers_txt, 'outputs/chunks/pdfplumber/', '.txt')

Neural was quite good - small chunks and accurate

Token / Sentence - fine for such big chunk sizes

SDPM / Semantic - bad

Recursive - I don't like it, looks like Token without overlap

In [102]:
# Load the marker Markdown output, then chunk it with every Markdown chunker.
with open('parsers/outputs/marker.md') as parsed_file:
    marker_markdown = parsed_file.read()

log_chunk_strategies(marker_markdown, chunkers_md, 'outputs/chunks/markdown/', '.md')

Same observations as for the plain-text chunks above.