# Text Chunking Strategies

## Overview

Explore four text chunking strategies (semantic/paragraph-based, structural/header-based, sliding window, and table chunking) for splitting documents into pieces suited to downstream processing.


In [None]:
from semantica.parse import TextSplitter
import re


## Step 1: Prepare Sample Document


In [None]:
# Sample Markdown document (headers, paragraphs, and a bullet list) used as
# the input text for the chunking examples in this notebook.
document = """
# Introduction to Knowledge Graphs

Knowledge graphs are powerful data structures that represent information as entities and their relationships. 
They enable semantic understanding and reasoning over complex data.

## What are Knowledge Graphs?

A knowledge graph is a graph-based data model used to represent knowledge. It consists of nodes (entities) 
and edges (relationships) that connect these entities. Knowledge graphs are widely used in search engines, 
recommendation systems, and AI applications.

## Applications

Knowledge graphs have numerous applications:
- Search engines use them to understand user queries
- Recommendation systems leverage them for personalized suggestions
- AI systems use them for reasoning and inference

## Conclusion

In summary, knowledge graphs provide a flexible and powerful way to represent and reason about complex information.
"""


## Step 2: Semantic Chunking


In [None]:
class SemanticChunker:
    """Pack whole paragraphs of a document into size-bounded chunks."""

    def chunk(self, document, chunk_size=500):
        """Group blank-line-separated paragraphs into chunks.

        Paragraphs are never split. Consecutive paragraphs are merged for as
        long as the merged text (paragraphs joined by a blank line) stays
        within ``chunk_size`` characters. A single paragraph longer than
        ``chunk_size`` becomes a chunk of its own.

        Args:
            document: Text whose paragraphs are separated by ``"\\n\\n"``.
            chunk_size: Maximum number of characters per merged chunk.

        Returns:
            A list of chunk strings in document order.
        """
        result = []
        pending = []      # paragraphs collected for the chunk being built
        pending_len = 0   # their total length, counting a "\n\n" after each

        for block in document.split('\n\n'):
            piece = block.strip()
            if not piece:
                # Whitespace-only fragments are not paragraphs.
                continue

            if pending_len + len(piece) > chunk_size:
                # The paragraph does not fit: close the current chunk (if
                # any) and start a new one with this paragraph.
                if pending:
                    result.append("\n\n".join(pending))
                pending = [piece]
                pending_len = len(piece) + 2
            else:
                pending.append(piece)
                pending_len += len(piece) + 2

        if pending:
            result.append("\n\n".join(pending))

        return result

# Chunk the sample document by paragraph with a 500-character budget.
semantic_chunker = SemanticChunker()
semantic_chunks = semantic_chunker.chunk(document, chunk_size=500)

# Print one line per chunk, numbered from 1, with its length in characters.
for i, chunk in enumerate(semantic_chunks, 1):
    print(f"Chunk {i}: {len(chunk)} characters")


## Step 3: Structural Chunking


In [None]:
class StructuralChunker:
    """Split a Markdown-style document into sections keyed by header lines."""

    def chunk(self, document):
        """Split ``document`` into one section per header line.

        A line that starts with ``#`` opens a new section; every following
        line up to the next header is that section's content. Text before
        the first header is returned as a section with an empty header.

        A section is emitted when it has a header or non-blank content.
        Whitespace-only text before the first header therefore produces no
        empty chunk, and a header with no body text is kept (with empty
        content) instead of being silently dropped.

        Args:
            document: The text to split; lines are separated by ``"\\n"``.

        Returns:
            A list of ``{'header': str, 'content': str}`` dicts in document
            order. ``header`` is the stripped header line (``""`` for
            leading text) and ``content`` is the stripped section body.
        """
        chunks = []
        current_header = ""
        body_lines = []

        for line in document.split('\n'):
            if line.startswith('#'):
                self._emit(chunks, current_header, body_lines)
                current_header = line.strip()
                body_lines = []
            else:
                body_lines.append(line)

        # Flush the final section, which has no following header to close it.
        self._emit(chunks, current_header, body_lines)

        return chunks

    @staticmethod
    def _emit(chunks, header, body_lines):
        """Append a section to ``chunks`` unless it is entirely empty."""
        content = "\n".join(body_lines).strip()
        if header or content:
            chunks.append({'header': header, 'content': content})

# Split the sample document into sections at its header lines.
structural_chunker = StructuralChunker()
structural_chunks = structural_chunker.chunk(document)

for i, chunk in enumerate(structural_chunks, 1):
    # Show at most 50 characters of the header; use a placeholder when the
    # section has no header.
    header = chunk['header'][:50] if chunk['header'] else "No header"
    print(f"Chunk {i}: {header}... ({len(chunk['content'])} chars)")


## Step 4: Sliding Window Chunking


In [None]:
class SlidingWindowChunker:
    """Split text into fixed-size, overlapping windows of words."""

    def chunk(self, document, window_size=200, overlap=50):
        """Return overlapping word windows covering ``document``.

        The text is split on whitespace. Each window holds up to
        ``window_size`` words and starts ``window_size - overlap`` words
        after the previous one. Windowing stops as soon as a window reaches
        the last word, so no trailing chunk consisting only of
        already-covered overlap is produced.

        Args:
            document: The text to split.
            window_size: Number of words per window; must be positive.
            overlap: Number of words shared by consecutive windows; must be
                smaller than ``window_size``. A negative value leaves a gap
                of that many words between windows.

        Returns:
            A list of chunk strings, each the window's words joined by a
            single space. Empty when ``document`` contains no words.

        Raises:
            ValueError: If ``window_size`` is not positive or ``overlap`` is
                not smaller than ``window_size``. Either would make the
                window stop advancing and loop forever.
        """
        if window_size <= 0:
            raise ValueError("window_size must be a positive number of words")
        if overlap >= window_size:
            raise ValueError("overlap must be smaller than window_size")

        words = document.split()
        step = window_size - overlap
        chunks = []

        for start in range(0, len(words), step):
            end = start + window_size
            chunks.append(' '.join(words[start:end]))
            if end >= len(words):
                # This window already includes the final word; any further
                # window would only repeat words that are already covered.
                break

        return chunks

# Chunk the sample document into 200-word windows that overlap by 50 words.
sliding_chunker = SlidingWindowChunker()
sliding_chunks = sliding_chunker.chunk(document, window_size=200, overlap=50)

# Print the length of the first three chunks only, then summarize how many
# more chunks there are.
for i, chunk in enumerate(sliding_chunks[:3], 1):
    print(f"Chunk {i}: {len(chunk)} characters")
if len(sliding_chunks) > 3:
    print(f"... and {len(sliding_chunks) - 3} more chunks")


## Step 5: Table Chunking


In [None]:
class TableChunker:
    """Turn tabular data into one chunk per row."""

    def chunk(self, table_data):
        """Split ``table_data`` into row-level string chunks.

        Args:
            table_data: One of
                * a string: each line is stripped and kept when it contains
                  a ``|`` character (this includes Markdown separator rows
                  such as ``|---|---|``);
                * a list: each element becomes ``str(element)``;
                * anything else: returned as a single ``str(table_data)``
                  chunk.

        Returns:
            A list of strings, one per row.
        """
        if isinstance(table_data, list):
            return [str(entry) for entry in table_data]

        if not isinstance(table_data, str):
            return [str(table_data)]

        # An empty stripped line cannot contain '|', so blank lines drop out
        # through the same filter.
        stripped_lines = (line.strip() for line in table_data.split('\n'))
        return [line for line in stripped_lines if '|' in line]

# Sample Markdown table: a header row, a separator row, and three data rows.
table_data = """
| Name | Age | Role |
|------|-----|------|
| Alice | 30 | Engineer |
| Bob | 35 | Manager |
| Charlie | 28 | Developer |
"""

table_chunker = TableChunker()
table_chunks = table_chunker.chunk(table_data)

# Print at most the first 50 characters of each row chunk.
for i, chunk in enumerate(table_chunks, 1):
    print(f"Chunk {i}: {chunk[:50]}...")


## Summary

Chunking strategies covered:
- Semantic Chunking (groups whole paragraphs up to a character budget)
- Structural Chunking (splits the document at its header lines)
- Sliding Window Chunking (fixed-size word windows with overlap)
- Table Chunking (one chunk per row of structured data)


In [None]:
# Final cell: signal that every example in the notebook has run.
print("Text Chunking Strategies Complete")
