# Chunk text into smaller parts

Retrieving information from larger texts requires locating the relevant passages within the text body. To provide a finer granularity for finding information, we can split the document into smaller parts.

In [1]:
from typing import List
import ipywidgets as widgets
from IPython.display import clear_output
from helpers import get_book, get_pdf, print_table
from customsplitter import CustomSplitterParagraphs, CustomSplitterSimilarity
from langchain_core.documents import Document
from langchain_text_splitters import (
    TextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter, NLTKTextSplitter, SpacyTextSplitter,
    Language
)
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
import re, itertools

# turn off warnings
# Raise the root logger's level to ERROR so that less severe log records are not emitted.
import logging
logging.getLogger().setLevel(logging.ERROR)

# Install a blanket "ignore" filter for warnings raised through the `warnings` module.
import warnings
warnings.filterwarnings("ignore")

Device: cpu


### Retrieving the documents

In [14]:
# Selectable text sources as (label, source) pairs, used as dropdown options.
# `chunk_text` loads a string source with `get_pdf` and an integer source with
# `get_book` (presumably a book id of the backing library -- confirm in helpers).
# NOTE(review): 'Buddah (zn)' -- 'zn' is not a usual language code (Chinese would
# be 'zh'); confirm the intended label.
# NOTE(review): the chapter 4 label says "Advanced Text Retrieval" while the file
# is named "AdvancedTextProcessing"; confirm which one is correct.
sources = [
    ('A Study in Scarlet (en)', 244),
    ('Pride and Prejudice (en)', 1342),
    ('Les Misérables (en)', 135),
    ('Buddenbrooks: Verfall einer Familie (de)', 34811),
    ('Les trois mousquetaires (fr)', 13951),
    ('Bajki (pl)', 27729),
    ('Buddah (zn)', 23585),
    ('Chapter 1: Introduction (en)', "pdfs/01_Introduction.pdf"),
    # labels of chapters 2 and 3 now match the files they point to
    ('Chapter 2: Classical Text Retrieval (en)', "pdfs/02_ClassicalTextRetrieval.pdf"),
    ('Chapter 3: Evaluation (en)', "pdfs/03_PerformanceEvaluation.pdf"),
    # all PDFs live in the same "pdfs/" folder (the stray leading dot was removed)
    ('Chapter 4: Advanced Text Retrieval (en)', "pdfs/04_AdvancedTextProcessing.pdf"),
]

# Keep the splitter registry when this cell is re-run; start with an empty one otherwise.
splitters = globals().get('splitters', {})

# Output areas for the statistics, the chunk-length histogram, and the sample chunks.
out_stats, out_plot, out_samples = (
    widgets.Output(layout={'padding': '0px 50px', 'min_width': min_width})
    for min_width in ('40%', '50%', '50%')
)

# split and print results
def split_and_print(docs: Document | List[Document], splitter: TextSplitter, chunk_ids: list[int] | None = None):
    """Split documents into chunks and show statistics, a histogram, and samples.

    The results are written to the module-level output widgets ``out_stats``,
    ``out_plot`` and ``out_samples``.

    Args:
        docs: A single document or a list of documents.
        splitter: Splitter whose ``split_documents`` produces the chunks.
        chunk_ids: Zero-based indices of the chunks to print as samples. Indices
            at or beyond the number of chunks are skipped; ``None`` prints no
            sample rows.
    """
    if isinstance(docs, Document):
        docs = [docs]
    if chunk_ids is None:
        chunk_ids = []

    # every chunk is kept as (text, value of its 'id' metadata entry)
    chunks = [(x.page_content, x.metadata['id']) for x in splitter.split_documents(docs)]

    doc_len = sum(len(doc.page_content) for doc in docs)
    parts_len = sum(len(text) for text, _ in chunks)
    # guard the ratios so that empty documents or an empty split do not raise
    overhead = parts_len / doc_len if doc_len else 0.0
    avg_len = parts_len / len(chunks) if chunks else 0.0

    with out_stats:
        clear_output()
        print(f'document length: {doc_len}')
        print(f'number of pages: {len(docs)}')
        print(f' document parts: {len(chunks)}')
        print(f'   sum of parts: {parts_len}')
        print(f' parts overhead: {overhead:.2f}')
        print(f'avg part length: {avg_len:.2f}\n')
    with out_plot:
        clear_output()
        # distribution of the chunk lengths
        df = DataFrame([len(text) for text, _ in chunks], columns=['length'])
        sns.displot(df, x='length', bins=50, height=3, aspect=2)
        plt.show()
    with out_samples:
        clear_output()
        # rows are labelled "<id>:<1-based chunk number>"; whitespace runs are collapsed for display
        rows = [
            (f'{chunks[i][1]}:{i+1}', re.sub(r'\s+', ' ', chunks[i][0]))
            for i in chunk_ids if i < len(chunks)
        ]
        print_table(rows, ['chunk', 'text'])


## Different splitting strategies

### Method 1: Splitting the text into fixed-sized chunks (word boundaries)
A simple way to chunk text is to split it into words, and then merge words until the resulting chunks reach a certain size. To remediate the impact of breaking sentences or paragraphs in the middle, we can overlap subsequent 
chunks by a defined number of words. This way, even if a paragraph is split, it is likely to be retained with the next chunk.

In [15]:
def _make_word_splitter(size, overlap):
    """Build a character splitter that uses a single space as separator.

    `size` becomes the chunk size and `overlap` the chunk overlap.
    """
    return CharacterTextSplitter(
        separator=" ",
        chunk_size=size,
        chunk_overlap=overlap,
        add_start_index=True,
    )


splitters["words"] = _make_word_splitter

### Method 2: Splitting at sentence boundaries
Similar to the previous approach, we use sentences as the smallest units of text for chunking. This avoids abrupt sentence breaks but may introduce slight variations in chunk sizes, which are usually negligible.

In [16]:
def _make_nltk_sentence_splitter(size, overlap):
    """Build the NLTK-based sentence splitter with the given chunk size and overlap."""
    return NLTKTextSplitter(
        separator="\n\n",
        chunk_size=size,
        chunk_overlap=overlap,
        add_start_index=True,
    )


def _make_spacy_sentence_splitter(size, overlap):
    """Build the spaCy-based sentence splitter with the given chunk size and overlap."""
    return SpacyTextSplitter(
        separator="\n\n",
        chunk_size=size,
        chunk_overlap=overlap,
        add_start_index=True,
    )


splitters["sentence (nltk)"] = _make_nltk_sentence_splitter

splitters["sentence (spacy)"] = _make_spacy_sentence_splitter

### Method 3: splitting on structure

The concept is to divide text based on its structural elements. For instance, in books, we can split at pages, parts, chapters, sections, and paragraphs. Authors commonly use these structural elements to separate content, making them strong indicators of topic or aspect changes.

In [17]:
def _make_chapter_splitter(size, overlap):
    """Build a character splitter that separates on four consecutive newlines.

    Presumably four newlines delimit chapters in the source texts -- confirm
    against the loaded documents.
    """
    return CharacterTextSplitter(
        separator="\n\n\n\n",
        chunk_size=size,
        chunk_overlap=overlap,
        add_start_index=True,
    )


def _make_paragraph_splitter(size, overlap):
    """Build a recursive splitter whose separators are runs of four, three, and two newlines."""
    return RecursiveCharacterTextSplitter(
        separators=["\n\n\n\n", "\n\n\n", "\n\n"],
        chunk_size=size,
        chunk_overlap=overlap,
        add_start_index=True,
    )


splitters["chapters"] = _make_chapter_splitter

splitters["paragraphs"] = _make_paragraph_splitter

### Method 4: semantic splitting

The structural approach is often the simplest way to obtain semantically coherent chunks. However, when reliable structural context extraction is challenging (e.g., in web pages with varying header formats or scanned documents), we can extend the sentence-based splitting method: 
1) Define a similarity measure between sentences. 
2) Set minimum and maximum chunk sizes. 
3) Split the text into sentences using NLTK or spaCy (merge very short sentences to meet the minimum size). 
4) Merge neighboring chunks if they are similar (while ensuring they don't exceed the maximum size).

In [18]:
def _make_custom_paragraph_splitter(size, overlap):
    """Build the project's paragraph splitter.

    `size` and `overlap` are accepted so that all registry entries share one
    call signature; they are not passed on.
    """
    return CustomSplitterParagraphs()


def _make_custom_similarity_splitter(size, overlap):
    """Build the project's similarity splitter; `size` and `overlap` are not passed on."""
    return CustomSplitterSimilarity()


splitters["custom-paragraph"] = _make_custom_paragraph_splitter
splitters["custom-similarity"] = _make_custom_similarity_splitter

## Putting it all together

In [23]:
def chunk_text(source, splitter, size, overlap):
    """Load the selected source, split it with the chosen splitter, and show the results.

    A string `source` is loaded with `get_pdf`, an integer `source` with
    `get_book`. For any other value, or when nothing was loaded, no split is
    performed. `splitter` is a key of the `splitters` registry.
    """
    clear_output()
    if isinstance(source, str):
        docs = get_pdf(source)
    elif isinstance(source, int):
        docs = [get_book(source)]
    else:
        return
    if not docs:
        return
    # chunks 5-9 and 25-29 (zero-based) are printed as samples
    sample_ids = [*range(5, 10), *range(25, 30)]
    split_and_print(docs, splitters[splitter](size, overlap), sample_ids)

def _make_dropdown(options, value, description):
    """Create a 500px-wide dropdown whose description column is 150px wide."""
    return widgets.Dropdown(
        options=options,
        value=value,
        description=description,
        layout=widgets.Layout(width='500px'),
        style={'description_width': '150px'},
    )


# Controls of the demo: text source, splitter, chunk size, and overlap.
form_source = _make_dropdown(sources, sources[0][1], 'Text source:')
form_splitter = _make_dropdown(splitters.keys(), list(splitters)[0], 'Text splitter:')
form_size = widgets.IntSlider(
    min=200,
    max=10000,
    value=1000,
    step=200,
    description='Chunk size:',
)
form_overlap = widgets.IntSlider(
    min=0,
    max=1000,
    value=200,
    step=50,
    description='Overlap:',
)

# Bind the controls to chunk_text and show them above the three output areas.
form_input = widgets.interactive(
    chunk_text,
    source=form_source,
    splitter=form_splitter,
    size=form_size,
    overlap=form_overlap,
)
display(widgets.VBox([form_input, out_stats, out_plot, out_samples]))

VBox(children=(interactive(children=(Dropdown(description='Text source:', layout=Layout(width='500px'), option…

## Hierarchical chunking

For RAG use cases, we aim to locate relevant chunks in our library and incorporate them into our prompt alongside the user's query. Embeddings (which we will discuss later) are most effective with smaller chunks, while text generation works better with more context. Earlier models had much smaller context windows, but modern language models offer context windows of 100k or even 1M tokens, allowing much more information to be included.

The concept of hierarchical chunking involves using a larger chunk for context and a smaller chunk within it for retrieval. When a smaller chunk is found, the larger chunk that contains it is supplied as context. We can also add more context by including preceding and succeeding chunks, similar to how humans scan a book for more context.

In [20]:
# Chunk sizes handed to the context splitters defined below.
MAX_CONTEXT_SIZE = 30000
MIN_CONTEXT_SIZE = 1000

# initialize splitters if not yet defined; each registry is checked on its own so
# that an existing one neither leaves the other undefined nor causes it to be reset
if 'splitters_context' not in globals():
    splitters_context = {}
if 'splitters_search' not in globals():
    splitters_search = {}

def _make_chapter_context_splitter():
    """Build a non-overlapping character splitter on four newlines with chunk size MIN_CONTEXT_SIZE."""
    return CharacterTextSplitter(
        separator="\n\n\n\n",
        chunk_size=MIN_CONTEXT_SIZE,
        chunk_overlap=0,
        add_start_index=True,
    )


def _make_paragraph_context_splitter():
    """Build a non-overlapping recursive splitter on newline runs with chunk size MIN_CONTEXT_SIZE."""
    return RecursiveCharacterTextSplitter(
        separators=["\n\n\n\n", "\n\n\n", "\n\n"],
        chunk_size=MIN_CONTEXT_SIZE,
        chunk_overlap=0,
        add_start_index=True,
    )


def _make_page_context_splitter():
    """Build a non-overlapping character splitter on spaces with chunk size MAX_CONTEXT_SIZE."""
    return CharacterTextSplitter(
        separator=" ",
        chunk_size=MAX_CONTEXT_SIZE,
        chunk_overlap=0,
        add_start_index=True,
    )


splitters_context["chapters"] = _make_chapter_context_splitter

splitters_context["paragraphs"] = _make_paragraph_context_splitter

splitters_context['pages'] = _make_page_context_splitter

def _make_word_search_splitter(size):
    """Build a non-overlapping character splitter on spaces with chunk size `size`."""
    return CharacterTextSplitter(
        separator=" ",
        chunk_size=size,
        chunk_overlap=0,
        add_start_index=True,
    )


def _make_nltk_search_splitter(size):
    """Build a non-overlapping NLTK-based sentence splitter with chunk size `size`."""
    return NLTKTextSplitter(
        separator="\n\n",
        chunk_size=size,
        chunk_overlap=0,
        add_start_index=True,
    )


def _make_spacy_search_splitter(size):
    """Build a non-overlapping spaCy-based sentence splitter with chunk size `size`."""
    return SpacyTextSplitter(
        separator="\n\n",
        chunk_size=size,
        chunk_overlap=0,
        add_start_index=True,
    )


splitters_search["words"] = _make_word_search_splitter
splitters_search["sentence (nltk)"] = _make_nltk_search_splitter

splitters_search["sentence (spacy)"] = _make_spacy_search_splitter

### Splitting documents hierarchically and printing stats, plots, and samples

In [21]:
# Output areas of the hierarchical demo: statistics, histograms, and samples.
out_stats_h, out_plot_h, out_samples_h = (
    widgets.Output(layout={'padding': '0px 50px', 'min_width': min_width})
    for min_width in ('40%', '50%', '50%')
)

# split and print results
def split_hierarchically_and_print(docs: Document | List[Document], splitter_context: TextSplitter, splitter_search: TextSplitter, chunk_ids: list[int] | None = None):
    """Split documents into contexts, split each context into search chunks, and show the results.

    The results are written to the module-level output widgets ``out_stats_h``,
    ``out_plot_h`` and ``out_samples_h``.

    Args:
        docs: A single document or a list of documents.
        splitter_context: Splitter whose ``split_documents`` produces the contexts.
        splitter_search: Splitter whose ``split_text`` cuts each context into
            search chunks.
        chunk_ids: Zero-based indices of the contexts to print as samples.
            Indices at or beyond the number of contexts are skipped; ``None``
            prints no sample rows.
    """
    if isinstance(docs, Document):
        docs = [docs]
    if chunk_ids is None:
        chunk_ids = []

    # every context is kept as (text, value of its 'id' metadata entry, its search chunks)
    contexts = [
        (ctx.page_content, ctx.metadata['id'], list(splitter_search.split_text(ctx.page_content)))
        for ctx in splitter_context.split_documents(docs)
    ]
    search_chunks = [chunk for _, _, parts in contexts for chunk in parts]

    doc_len = sum(len(doc.page_content) for doc in docs)
    # guard the averages so that an empty split does not raise
    avg_context = sum(len(text) for text, _, _ in contexts) / len(contexts) if contexts else 0.0
    avg_chunk = sum(len(chunk) for chunk in search_chunks) / len(search_chunks) if search_chunks else 0.0

    with out_stats_h:
        clear_output()
        print(f'   document length: {doc_len}')
        print(f'      num contexts: {len(contexts)}')
        print(f' num search chunks: {len(search_chunks)}')
        print(f'avg context length: {avg_context:.2f}')
        print(f'  avg chunk length: {avg_chunk:.2f}\n')
    with out_plot_h:
        clear_output()
        # first histogram: context lengths
        df = DataFrame([len(text) for text, _, _ in contexts], columns=['length'])
        sns.displot(df, x='length', bins=50, height=3, aspect=2)
        plt.show()
        # second histogram: search-chunk lengths
        df = DataFrame([len(chunk) for chunk in search_chunks], columns=['length'])
        sns.displot(df, x='length', bins=50, height=3, aspect=2)
        plt.show()
    with out_samples_h:
        clear_output()
        # rows are labelled "<id>:<1-based context number>"; the search chunks of a
        # context are joined with '<hr>' after collapsing whitespace runs
        rows = [
            (f'{contexts[i][1]}:{i+1}', '<hr>'.join([re.sub(r'\s+', ' ', c) for c in contexts[i][2]]))
            for i in chunk_ids if i < len(contexts)
        ]
        print_table(rows, ['contexts', 'search chunks'])

### Interactive demo

In [None]:
def chunk_text(source, splitter_context, splitter_search, size):
    """Load the selected source, split it hierarchically, and show the results.

    A string `source` is loaded with `get_pdf`, an integer `source` with
    `get_book`. For any other value, or when nothing was loaded, no split is
    performed. `splitter_context` and `splitter_search` are keys of the
    `splitters_context` and `splitters_search` registries; `size` is passed to
    the search splitter factory.
    """
    clear_output()
    if isinstance(source, str):
        docs = get_pdf(source)
    elif isinstance(source, int):
        docs = [get_book(source)]
    else:
        docs = None
    if not docs:
        return
    context_splitter = splitters_context[splitter_context]()
    search_splitter = splitters_search[splitter_search](size)
    # contexts 4, 9 and 14 (zero-based) are printed as samples
    split_hierarchically_and_print(docs, context_splitter, search_splitter, [4, 9, 14])

# Controls of the hierarchical demo: text source, context splitter, search
# splitter, and search chunk size.
form_source_h = widgets.Dropdown(
    options=sources,
    value=sources[0][1],
    description='Text source:',
    layout=widgets.Layout(width='500px'),
    style={'description_width': '150px'},
)
form_splitter_context = widgets.Dropdown(
    options=splitters_context.keys(),
    value=list(splitters_context.keys())[0],
    description='Context splitter:',
    layout=widgets.Layout(width='500px'),
    style={'description_width': '150px'},
)
form_splitter_search = widgets.Dropdown(
    options=splitters_search.keys(),
    value=list(splitters_search.keys())[0],
    description='Search splitter:',
    layout=widgets.Layout(width='500px'),
    style={'description_width': '150px'},
)
form_size_h = widgets.IntSlider(min=200, max=2000, value=400, step=100, description='Chunk size:')
# Bind this demo to its own source dropdown (form_source_h); it was previously
# bound to form_source, the dropdown that belongs to the non-hierarchical demo,
# which left form_source_h unused.
form_input_h = widgets.interactive(
    chunk_text,
    source=form_source_h,
    splitter_context=form_splitter_context,
    splitter_search=form_splitter_search,
    size=form_size_h,
)
display(widgets.VBox([form_input_h, out_stats_h, out_plot_h, out_samples_h]))

VBox(children=(interactive(children=(Dropdown(description='Text source:', layout=Layout(width='500px'), option…

---