<a href="https://colab.research.google.com/github/priyal6/gen/blob/main/chunking_simple%2Badv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#fixed size chunking
from typing import List
import re


def word_splitter(source_text: str) -> List[str]:
    """Split text into words on runs of whitespace.

    Args:
        source_text: The text to split.

    Returns:
        The words in order of appearance. Leading and trailing whitespace
        is ignored, and empty or whitespace-only input yields an empty list.
    """
    # str.split() with no separator collapses whitespace runs and, unlike
    # splitting on a literal " ", returns [] (not ['']) for empty input.
    return source_text.split()

def get_chunks_fixed_size_with_overlap(
    text: str, chunk_size: int, overlap_fraction: float = 0.2
) -> List[str]:
    """Split text into fixed-size word chunks that overlap their predecessor.

    The text is walked in steps of ``chunk_size`` words. Each chunk after the
    first is extended backwards by ``int(chunk_size * overlap_fraction)``
    words, so it can hold up to ``chunk_size`` plus that many words.

    Args:
        text: The text to chunk.
        chunk_size: Number of new words each chunk contributes; must be > 0.
        overlap_fraction: Fraction of ``chunk_size`` repeated from the end of
            the previous chunk; must be >= 0.

    Returns:
        The chunks as space-joined strings; empty chunks are omitted.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap_fraction``
            is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_fraction < 0:
        # A negative overlap would move the start forward and drop words.
        raise ValueError("overlap_fraction must be non-negative")

    words = word_splitter(text)
    overlap = int(chunk_size * overlap_fraction)

    chunks = []
    for i in range(0, len(words), chunk_size):
        # Reach back into the previous chunk, but never before the first word.
        start = max(i - overlap, 0)
        chunk = " ".join(words[start:i + chunk_size]).strip()
        if chunk:
            chunks.append(chunk)
    return chunks
# Demo: chunk a short sample with 10 new words per chunk and a 20% overlap
# (int(10 * 0.2) == 2 words repeated from the previous chunk).
text = """Fixed-size chunking is the simplest approach.
It splits text into chunks of a predetermined size, often measured in tokens or characters.
Chunk overlap preserves context between boundaries."""

chunks = get_chunks_fixed_size_with_overlap(text, chunk_size=10, overlap_fraction=0.2)
# Print the chunks numbered from 1, each followed by a blank line.
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}:\n{chunk}\n")

Chunk 1:
Fixed-size chunking is the simplest approach. It splits text into

Chunk 2:
text into chunks of a predetermined size, often measured in tokens or

Chunk 3:
tokens or characters. Chunk overlap preserves context between boundaries.



In [None]:
#recursive chunking
from typing import List

def recursive_chunking(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Split text into chunks of at most ``max_chunk_size`` characters.

    The text is split on the first separator it contains, tried in priority
    order: blank line, newline, sentence end (". "), space. Adjacent pieces
    are greedily merged back together while they fit. A single piece that is
    still too long is chunked recursively, which moves on to the next
    separator. Text containing none of the separators is cut into
    fixed-width slices.

    Args:
        text: The text to chunk.
        max_chunk_size: Maximum chunk length in characters; must be >= 1.

    Returns:
        The non-empty chunks in document order, stripped of surrounding
        whitespace (fixed-width slices are returned as cut).

    Raises:
        ValueError: If ``max_chunk_size`` is less than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")

    if len(text) <= max_chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    for sep in ("\n\n", "\n", ". ", " "):
        if sep not in text:
            continue

        # Keep the visible part of the separator (the "." of ". ") attached
        # to the piece it terminates so sentence punctuation is not lost at
        # chunk boundaries; only the whitespace tail acts as the joiner.
        kept = sep.rstrip()
        joiner = sep[len(kept):]
        parts = text.split(sep)
        pieces = [part + kept for part in parts[:-1]] + [parts[-1]]

        merged = []
        current = ""
        for piece in pieces:
            candidate = current + joiner + piece if current else piece
            if len(candidate) <= max_chunk_size:
                current = candidate
            else:
                if current:
                    merged.append(current.strip())
                current = piece
        if current:
            merged.append(current.strip())

        result = []
        for chunk in merged:
            if len(chunk) > max_chunk_size:
                # An oversized chunk is always a single piece that no longer
                # contains `sep`, so the recursion falls to a finer separator.
                result.extend(recursive_chunking(chunk, max_chunk_size))
            elif chunk:
                result.append(chunk)
        return result

    # No separator present at all: fall back to fixed-width slices.
    return [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]


# Demo: chunk a three-line sample with a 60-character limit, which is shorter
# than the sample itself, so the text has to be split.
text = """Recursive chunking is a more nuanced approach.
It splits text using a prioritized list of separators, like paragraphs or sentences.
If a piece of text is still too big, it splits it again until it's small enough."""

chunks = recursive_chunking(text, max_chunk_size=60)
# Print the chunks numbered from 1, each followed by a blank line.
for i, c in enumerate(chunks, 1):
    print(f"Chunk {i}:\n{c}\n")



Chunk 1:
Recursive chunking is a more nuanced approach.

Chunk 2:
It splits text using a prioritized list of separators, like

Chunk 3:
paragraphs or sentences.

Chunk 4:
If a piece of text is still too big, it splits it again

Chunk 5:
until it's small enough.



In [None]:
#document based chunking
from typing import List
import re


def markdown_document_chunking(text: str) -> List[str]:
    """Split a Markdown document into one chunk per header section.

    A new chunk starts at every ATX header line ("#" to "######" followed by
    whitespace and a title). Lines before the first header form their own
    chunk. Header-looking lines inside fenced code blocks (``` or ~~~) are
    treated as code, so a "# comment" in a code sample does not split it.

    Args:
        text: The Markdown source.

    Returns:
        The non-empty sections in document order, each stripped of
        surrounding whitespace and starting with its header line (except
        for any text that precedes the first header).
    """
    header_re = re.compile(r'^#{1,6}\s+.+$')
    fence_re = re.compile(r'^ {0,3}(`{3,}|~{3,})')

    chunks: List[str] = []
    current: List[str] = []
    open_fence = ""  # marker that opened the current code fence, "" if none

    for line in text.split("\n"):
        fence = fence_re.match(line)
        if fence:
            marker = fence.group(1)
            if not open_fence:
                open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not line[fence.end():].strip()
            ):
                # A closing fence uses the same character, is at least as
                # long as the opener, and carries no trailing text.
                open_fence = ""
        elif not open_fence and header_re.match(line):
            # Flush the previous section before starting a new one.
            section = "\n".join(current).strip()
            if section:
                chunks.append(section)
            current = [line]
            continue
        current.append(line)

    section = "\n".join(current).strip()
    if section:
        chunks.append(section)
    return chunks
# Demo: a small Markdown document with four headers (two level-1, two level-2).
text = """
# Introduction
This is the intro section.

## Background
Some background information.

## Methods
Details about methods used.

# Conclusion
Final thoughts and summary.
"""

chunks = markdown_document_chunking(text)
# Print the chunks numbered from 1, each followed by a blank line.
for i, c in enumerate(chunks, 1):
    print(f"Chunk {i}:\n{c}\n")

Chunk 1:
# Introduction
This is the intro section.

Chunk 2:
## Background
Some background information.

Chunk 3:
## Methods
Details about methods used.

Chunk 4:
# Conclusion
Final thoughts and summary.



In [None]:
#semantic chunking
from sentence_transformers import SentenceTransformer, util

def semantic_chunking(text: str, threshold: float = 0.7, max_sentences: int = 5,
                      model=None) -> list:
    """Group consecutive sentences into chunks by embedding similarity.

    The text is split into sentences on ". ". A new chunk starts whenever
    the cosine similarity between a sentence and the one before it drops
    below ``threshold``, or the current chunk already holds
    ``max_sentences`` sentences.

    Args:
        text: The text to chunk.
        threshold: Minimum cosine similarity between neighbouring sentences
            for them to stay in the same chunk.
        max_sentences: Maximum number of sentences per chunk.
        model: Optional preloaded embedding model, so repeated calls do not
            reload it. Presumably any object whose ``encode(list_of_str)``
            output is accepted by ``util.cos_sim`` works -- confirm against
            the sentence-transformers documentation. Defaults to loading
            ``'all-MiniLM-L6-v2'``.

    Returns:
        A list of chunk strings in document order; empty when the text
        contains no non-blank sentence.
    """
    parts = text.split('. ')
    last = len(parts) - 1

    # Drop blank fragments (e.g. from a trailing ". ") and remember which
    # sentences were followed by ". " so the period can be restored when a
    # chunk ends on them.
    sentences = []
    terminated = []
    for idx, part in enumerate(parts):
        if part.strip():
            sentences.append(part)
            terminated.append(idx < last)
    if not sentences:
        return []

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(sentences)

    def build_chunk(start: int, stop: int) -> str:
        body = '. '.join(sentences[start:stop])
        return body + '.' if terminated[stop - 1] else body

    chunks = []
    start = 0
    for i in range(1, len(sentences)):
        # cos_sim returns a 1x1 tensor here; take the scalar explicitly
        # rather than relying on implicit tensor truthiness.
        similarity = util.cos_sim(embeddings[i - 1], embeddings[i]).item()
        if similarity < threshold or i - start >= max_sentences:
            chunks.append(build_chunk(start, i))
            start = i
    chunks.append(build_chunk(start, len(sentences)))
    return chunks