# Pre-Retrieval Optimization

## Better Chunking Strategies

### Smaller Chunks for Embedding, Larger Chunks for Context

In [None]:
pip install langchain nltk faiss-cpu tiktoken openai

In [None]:
import nltk

# The sentence tokenizer used below (nltk.sent_tokenize) needs Punkt model
# data. Older NLTK releases load it from the "punkt" resource, while recent
# releases (3.9+) look for "punkt_tab" instead, so fetch both to keep the
# notebook working regardless of the installed NLTK version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 1: Small chunks for embedding.
# Splitter for the fine-grained pieces: target size 300 with an overlap of 50
# between neighbouring chunks. Units are whatever the splitter's length
# function counts -- presumably characters; confirm against the installed
# langchain version.
small_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# Step 2: Larger chunks for context.
# Splitter for the coarse pieces that link_small_to_large_chunks attaches to
# each small chunk as its surrounding context: target size 1200, overlap 100
# (same units as the small splitter).
large_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=100)

# Step 3: Link small chunks to large ones
def link_small_to_large_chunks(document_text):
    """Pair every small chunk of a document with a larger chunk that contains it.

    The text is split twice, once with ``small_splitter`` and once with
    ``large_splitter``. Each small chunk is matched to the first large chunk
    in which it occurs verbatim as a substring.

    Args:
        document_text: Full text of the document to split.

    Returns:
        A list with one dict per small chunk, in split order. Each dict has
        the keys ``"chunk"`` (the small chunk) and ``"context"`` (the matched
        large chunk, or the whole ``document_text`` when no large chunk
        contains the small one).
    """
    pieces = small_splitter.split_text(document_text)
    candidates = large_splitter.split_text(document_text)

    def first_containing(piece):
        # Plain substring test; the earliest matching large chunk wins.
        for candidate in candidates:
            if piece in candidate:
                return candidate
        return None

    # A small chunk that straddles a large-chunk boundary has no single
    # container, so it falls back to the full document as its context.
    return [
        {"chunk": piece, "context": first_containing(piece) or document_text}
        for piece in pieces
    ]


### Hierarchical Chunking

In [17]:
def hierarchical_chunking(document_text):
    """Break a document into a three-level hierarchy.

    Paragraphs are the stretches of text separated by one or more blank
    lines. Each paragraph is trimmed of surrounding whitespace, and
    paragraphs that are empty after trimming are dropped, so an empty
    document, leading/trailing blank lines, or runs of several blank lines
    never produce empty entries.

    Args:
        document_text: Full text of the document.

    Returns:
        A dict with three keys:
            ``"document"``: the original text, unchanged.
            ``"paragraphs"``: list of non-empty, trimmed paragraph strings.
            ``"sentence_chunks"``: list parallel to ``"paragraphs"``; entry
                ``i`` is the list of sentences ``nltk.sent_tokenize`` returns
                for paragraph ``i``.
    """
    # Imported locally so this function does not depend on another cell
    # having imported ``re`` first.
    import re

    # Level 1: Paragraphs. Split on a newline, optional whitespace, newline
    # so that "\n\n\n" or "\n  \n" count as a single paragraph break.
    paragraphs = [
        para.strip()
        for para in re.split(r"\n\s*\n", document_text)
        if para.strip()
    ]

    # Level 2: Sentences within paragraphs (one list per paragraph).
    sentence_chunks = [nltk.sent_tokenize(para) for para in paragraphs]

    # Level 3: Entire document
    return {
        "document": document_text,
        "paragraphs": paragraphs,
        "sentence_chunks": sentence_chunks,
    }


### Contextual Chunking

In [20]:
import re
from typing import List

def semantic_chunking(document_text: str, max_chunk_length: int = 500) -> List[str]:
    """Greedily pack consecutive sentences into chunks of bounded length.

    The text is split into sentences with ``nltk.sent_tokenize``. Sentences
    are appended to the current chunk, joined by single spaces, for as long
    as the joined chunk stays within ``max_chunk_length`` characters; the
    next sentence then starts a new chunk. Sentences are never split, so a
    single sentence longer than ``max_chunk_length`` becomes a chunk of its
    own that exceeds the limit.

    Args:
        document_text: Text to chunk.
        max_chunk_length: Maximum length, in characters, of a chunk made of
            more than one sentence (joining spaces included).

    Returns:
        The chunks in document order. No chunk is empty; blank input yields
        an empty list.
    """
    # Nothing to tokenize; same result the tokenizer path would give.
    if not document_text.strip():
        return []

    chunks: List[str] = []
    current: List[str] = []  # sentences of the chunk being built
    current_length = 0       # length of " ".join(current)

    for raw_sentence in nltk.sent_tokenize(document_text):
        sentence = raw_sentence.strip()
        if not sentence:
            continue

        # Count the joining space only when there is something to join to,
        # so the limit applies to the real length of the finished chunk.
        projected = current_length + len(sentence) + (1 if current else 0)

        if current and projected > max_chunk_length:
            chunks.append(" ".join(current))
            current = [sentence]
            current_length = len(sentence)
        else:
            # Also taken when ``current`` is empty, so an oversized sentence
            # starts a chunk instead of flushing an empty one.
            current.append(sentence)
            current_length = projected

    if current:
        chunks.append(" ".join(current))

    return chunks
