#Prepared by Tamal Acharya

In [None]:
# Implementation on different chunking methods like naive chunking, recursive character text splitter,
# embedding chunking, agentic chunking, overlap chunking.

!pip install langchain-text-splitters langchain-community langchain langchain-chroma

import os
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.chat_models import ChatOllama

# Write the six-sentence sample document to disk. The sentences are kept in a
# tuple and joined without a separator, so each one carries its own trailing
# space and the file holds a single line of text.
sample_sentences = (
    "This is the first sentence of a longer document. ",
    "It covers various topics including technology and science. ",
    "Here is the third sentence, introducing a new idea. ",
    "Followed by a fourth sentence that expands on the previous one. ",
    "Sentence five continues the discussion. ",
    "And finally, the sixth and last sentence concludes this paragraph.",
)
with open("example_text.txt", "w") as f:
    f.write("".join(sample_sentences))

# --- Naive Chunking ---
print("--- Naive Chunking ---")
# Baseline strategy: split on one fixed separator with a generous size limit
# and no overlap, using CharacterTextSplitter.
with open("example_text.txt", "r") as f:
    text = f.read()  # `text` is reused by the later chunking examples

naive_splitter = CharacterTextSplitter(
    chunk_size=500,  # larger than the sample text, so it is not subdivided
    chunk_overlap=0,
    separator="\n",  # the sample text contains no newlines
    length_function=len,
    is_separator_regex=False,
)

naive_chunks = naive_splitter.create_documents([text])
print(f"Number of naive chunks: {len(naive_chunks)}")
for chunk_no, naive_chunk in enumerate(naive_chunks, start=1):
    preview = naive_chunk.page_content[:100]  # first 100 characters only
    print(f"Chunk {chunk_no}: {preview}...")

print("\n" + "=" * 50 + "\n")

# --- Recursive Character Text Splitter ---
print("--- Recursive Character Text Splitter ---")
# Works through a list of separators in order, falling back to the next one
# whenever the current one does not yield small enough pieces. This tends to
# keep the text's structure intact.

# Load the document through a loader (the more standard practice).
loader = TextLoader("example_text.txt")
documents = loader.load()

recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,  # characters shared between neighbouring chunks
    length_function=len,
    is_separator_regex=False,
)

recursive_chunks = recursive_splitter.split_documents(documents)
print(f"Number of recursive chunks: {len(recursive_chunks)}")
for chunk_no, recursive_chunk in enumerate(recursive_chunks, start=1):
    print(f"Chunk {chunk_no}: {recursive_chunk.page_content}")

print("\n" + "=" * 50 + "\n")

# --- Overlap Chunking ---
print("--- Overlap Chunking ---")
# Overlap is a *parameter* of most text splitters rather than a splitting
# *method* of its own. It makes neighbouring chunks share some text at their
# boundaries, which gives a RAG retriever context around each split point.
# The recursive splitter configured with chunk_overlap=20 already shows it.
for overlap_note in (
    "Overlap is demonstrated in the Recursive Character Text Splitter example above.",
    "The 'chunk_overlap' parameter specifies the number of characters to overlap.",
):
    print(overlap_note)

print("\n" + "=" * 50 + "\n")

# --- Embedding Chunking (Conceptual / RAG Integration) ---
print("--- Embedding Chunking (Conceptual / RAG Integration) ---")
# "Embedding chunking" is not a splitting algorithm. It is the step that turns
# chunks produced by a splitter into vector embeddings and stores them in a
# vector store, so a RAG pipeline can search them semantically. The code below
# embeds the recursive chunks and indexes them in Chroma.

# Requires a running, reachable Ollama server with an embedding model pulled,
# for example:
# !ollama pull nomic-embed-text

embedding_model = OllamaEmbeddings(model="nomic-embed-text")  # swap in any suitable embedding model

# Index the chunks produced by the recursive splitter.
vectorstore = Chroma.from_documents(
    documents=recursive_chunks,
    embedding=embedding_model,
)
retriever = vectorstore.as_retriever()

for status_line in (
    "Embeddings created and stored in Chroma vectorstore.",
    "This vectorstore can now be used for semantic search.",
):
    print(status_line)

# Example semantic search (the retrieval step of RAG).
query = "What is the first sentence about?"
print(f"\nSearching for documents relevant to: '{query}'")
relevant_docs = retriever.invoke(query)
print("Retrieved documents:")
for doc_no, hit in enumerate(relevant_docs, start=1):
    print(f"Document {doc_no}: {hit.page_content}")

print("\n" + "=" * 50 + "\n")

# --- Agentic Chunking (Conceptual) ---
print("--- Agentic Chunking (Conceptual) ---")
# Agentic chunking is an evolving idea: an AI agent (or a rule set) decides
# *how* to split the text, possibly taking semantic meaning, topic changes, or
# the intended downstream use into account. It is an architectural pattern
# rather than a single ready-made splitter class, so this section only
# prints a description.
agentic_summary = (
    "Agentic chunking is an advanced concept where an AI agent dynamically",
    "determines how to split text based on semantic understanding or task.",
    "This is typically implemented through more complex processing pipelines",
    "rather than a single text splitter class.",
    "Example scenario: An agent identifies key sections in a document and splits",
    "accordingly, or it uses context to decide where to split for optimal retrieval.",
)
for summary_line in agentic_summary:
    print(summary_line)

print("\n" + "=" * 50 + "\n")

# --- Putting it together in a RAG Pipeline ---
print("--- RAG Pipeline Example ---")
# A simple RAG chain using the vector store created from recursive chunks.

# Ensure Ollama is running locally or accessible
# You might need to install Ollama and pull a model like 'llama2' or 'mistral'
# !ollama pull mistral # Example command if using local Ollama

llm = ChatOllama(model="mistral") # Use an appropriate LLM

prompt_template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use a maximum of three sentences and keep the answer concise.

Context: {context}

Question: {question}

Answer:"""

rag_prompt = ChatPromptTemplate.from_template(prompt_template)


def format_docs(docs):
    """Return the text of the retrieved documents as one context string.

    Args:
        docs: Iterable of retrieved documents, each exposing ``page_content``.

    Returns:
        The ``page_content`` of every document, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The leading dict fans the incoming question out to two branches:
#   - "context": retrieve matching chunks, then flatten them to plain text.
#     Without format_docs the prompt would receive the repr of a list of
#     Document objects (metadata included) instead of the chunk text.
#   - "question": forward the question unchanged.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

# Ask a question using the RAG pipeline
question_rag = "What topics are covered in the document?"
print(f"Asking RAG pipeline: '{question_rag}'")
response = rag_chain.invoke(question_rag)
print("RAG Answer:")
print(response)

# Delete the sample file written at the start of this cell. `text` and the
# chunks built from it stay in memory for later cells.
example_path = "example_text.txt"
os.remove(example_path)
print(f"\nCleaned up {example_path}")
```

**Explanation:**

1.  **Installation:** We install the necessary LangChain libraries.
2.  **Dummy Data:** A simple text file `example_text.txt` is created to serve as our input document.
3.  **Naive Chunking:** We use `CharacterTextSplitter` with a large `chunk_size` and a simple separator (`\n`) to simulate a naive split. `CharacterTextSplitter` only ever splits on the given separator, so text that does not contain the separator (like our sample, which has no newlines) comes back as one large chunk, regardless of where topics begin or end.
4.  **Recursive Character Text Splitter:** This is a more robust method. It attempts to split by a list of separators (`\n\n`, `\n`, ` `, `""`) in order. `chunk_size` defines the target size, and `chunk_overlap` specifies how many characters should overlap between consecutive chunks. This is crucial for RAG to maintain context.
5.  **Overlap Chunking:** Explained as a *parameter* (`chunk_overlap`) used within splitters like the recursive one, rather than a separate method.
6.  **Embedding Chunking (Conceptual/RAG Integration):** This section explains that 'embedding chunking' refers to the step of taking text chunks and converting them into numerical vectors (embeddings) for use in a vector store. We demonstrate this by creating `OllamaEmbeddings` and using them to build a `Chroma` vector store from the recursive chunks. This vector store is the heart of the RAG retrieval mechanism.
7.  **Agentic Chunking (Conceptual):** Describes this as an advanced pattern where an AI agent intelligently decides the splitting strategy, contrasting it with standard fixed-rule splitters. It's presented as a concept rather than a specific code implementation within standard libraries.
8.  **RAG Pipeline Example:** We build a simple RAG chain:
    *   We use the `retriever` created from our `Chroma` vector store.
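    *   The dict at the head of the chain sends the user's question to both branches: the `retriever` fetches context for it, while `RunnablePassthrough()` forwards it unchanged as the `question` for the prompt.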
    *   A `ChatPromptTemplate` is defined to instruct the LLM.
    *   A `ChatOllama` chat model is used as the LLM (you need to have Ollama running with a model like `mistral`).
    *   `StrOutputParser()` converts the LLM's output to a string.
    *   The chain is invoked with a question, demonstrating how retrieval (finding relevant chunks via embeddings) and generation (LLM answering based on retrieved context) work together.

This tutorial provides a practical demonstration of common chunking techniques and conceptual explanations for more advanced ones, integrating them into a basic RAG pipeline using LangChain. The core chunking/RAG example above needs no placeholder values; only if you extend it to integrate with Google Cloud Storage would you need to replace a placeholder such as `[your Cloud Platform project ID]` with your own value.

In [None]:
# Fixed window chunking, fixed window with overlap chunking, Semantic chunking, Embedding chunking, Agentic chunking

# --- Fixed Window Chunking ---
print("--- Fixed Window Chunking ---")
# Size-driven chunking that pays no attention to content boundaries.
# `text` is the sample document read in the previous cell.
fixed_window_splitter = CharacterTextSplitter(
    chunk_size=50,  # fixed chunk size
    chunk_overlap=0,  # no overlap between chunks
    separator="",  # no specific separator, just split by size
    length_function=len,
    is_separator_regex=False,
)

fixed_window_chunks = fixed_window_splitter.create_documents([text])
print(f"Number of fixed window chunks: {len(fixed_window_chunks)}")
for chunk_no, window in enumerate(fixed_window_chunks, start=1):
    print(f"Chunk {chunk_no}: {window.page_content}")

print("\n" + "=" * 50 + "\n")

# --- Fixed Window with Overlap Chunking ---
print("--- Fixed Window with Overlap Chunking ---")
# Same size-driven chunking as the plain fixed window, but with a specified
# overlap between consecutive chunks.
fixed_overlap_splitter = CharacterTextSplitter(
    chunk_size=50,  # fixed chunk size
    chunk_overlap=10,  # overlap size
    separator="",  # no specific separator, just split by size
    length_function=len,
    is_separator_regex=False,
)

fixed_overlap_chunks = fixed_overlap_splitter.create_documents([text])
print(f"Number of fixed window with overlap chunks: {len(fixed_overlap_chunks)}")
for chunk_no, window in enumerate(fixed_overlap_chunks, start=1):
    print(f"Chunk {chunk_no}: {window.page_content}")

print("\n" + "=" * 50 + "\n")

# --- Semantic Chunking ---
print("--- Semantic Chunking ---")
# Semantic chunking places boundaries where the meaning or topic changes,
# usually by analysing embeddings or using a model to spot the boundaries.
# The splitters used so far work on structural/character rules instead.
# A common recipe:
#   1. Split into small units (sentences or short paragraphs), with overlap.
#   2. Embed each unit.
#   3. Compare the embeddings of consecutive units.
#   4. Mark a "break" wherever similarity drops significantly.
#   5. Merge the units between breaks into larger "semantic" chunks.
# That takes more logic than a single splitter class, so this section only
# prints the outline.
semantic_outline = (
    "Semantic chunking involves splitting based on meaning/topic changes.",
    "A common approach:",
    "1. Split text into small units (e.g., sentences) with overlap.",
    "2. Embed these units.",
    "3. Analyze embedding similarity between consecutive units.",
    "4. Identify low-similarity points as potential chunk boundaries.",
    "5. Merge units between boundaries to form semantic chunks.",
    "This requires custom implementation or specialized libraries/pipelines.",
)
for outline_line in semantic_outline:
    print(outline_line)

# Example (Conceptual): Split by sentence first
# chunk_size must be a positive integer: passing None fails inside the
# splitter's constructor, which compares chunk_size numerically. Using the
# smallest valid size (1) means no two sentences can be merged into one
# chunk, so every sentence comes back as its own unit. The splitter is
# expected to log a "longer than the specified" warning per sentence; that is
# harmless here.
sentence_splitter = CharacterTextSplitter(
    separator=". ", # Simple sentence split (may not be perfect)
    chunk_size=1, # Smallest valid size: keeps each sentence as its own chunk
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)
sentences = sentence_splitter.split_text(text)
print(f"Split into {len(sentences)} potential semantic units (sentences).")
# Further steps (embedding, similarity analysis, merging) would follow here conceptually.

print("\n" + "="*50 + "\n")