### Semantic Chunking
- SemanticChunker is a document splitter that uses embedding similarity between sentences to decide chunk boundaries.

- It aims to keep each chunk semantically coherent, rather than cutting text off mid-thought as traditional character/token splitters can.

In [None]:
# Import necessary libraries for semantic chunking
from sentence_transformers import SentenceTransformer  # For generating sentence embeddings
from sklearn.metrics.pairwise import cosine_similarity  # For calculating similarity between embeddings
import numpy as np  # For numerical operations

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

# Embedding client backed by Google's gemini-embedding-001 model
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Demo input: three sentences about LangChain followed by two about France
text="""
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

# 1. Treat every non-blank line as one sentence
sentences = [line for line in map(str.strip, text.split("\n")) if line]

# 2. Embed all sentences with a single call
vectors = embeddings.embed_documents(sentences)

# 3. Chunking state
threshold = 0.7                  # minimum similarity for two neighbours to share a chunk
chunks = []                      # finished chunks, in document order
current_chunk = [sentences[0]]   # the chunk currently being built

# 4. Walk over neighbouring sentence pairs; a similarity below the threshold
#    closes the running chunk and opens a new one
for prev_vec, next_vec, sentence in zip(vectors, vectors[1:], sentences[1:]):
    score = cosine_similarity([prev_vec], [next_vec])[0][0]

    if score >= threshold:
        current_chunk.append(sentence)
        continue

    chunks.append(" ".join(current_chunk))
    current_chunk = [sentence]

# The chunk still being built when the loop ends has not been stored yet
chunks.append(" ".join(current_chunk))

# Show the chunks, numbered from 1
print("\n📌 Semantic Chunks:")
for number, chunk_text in enumerate(chunks, start=1):
    print(f"\nChunk {number}:\n{chunk_text}")


📌 Semantic Chunks:

Chunk 1:
LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone. You can create chains, agents, memory, and retrievers.

Chunk 2:
The Eiffel Tower is located in Paris. France is a popular tourist destination.


In [None]:
# Alternative implementation: embed locally with SentenceTransformer instead of
# calling the Gemini embedding API
model=SentenceTransformer('all-MiniLM-L6-v2')

# Same sample text as the Gemini cell so the two results can be compared
text="""
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

# Step 1: Split into sentences (one per non-blank line)
sentences=[s.strip() for s in text.split("\n") if s.strip()]

# Step 2: Generate one embedding per sentence.
# The result is stored under its own name instead of `embeddings`, because
# `embeddings` already refers to the Gemini embedding client created earlier;
# rebinding it to an array would silently change what that name means for any
# cell executed afterwards.
sentence_vectors=model.encode(sentences)

# Step 3: Initialize chunking parameters
threshold = 0.7  # Same threshold as the Gemini cell, for comparison
chunks = []
current_chunk=[sentences[0]]

# Step 4: Group consecutive sentences whose similarity reaches the threshold
for i in range(1, len(sentences)):
    sim = cosine_similarity(
        [sentence_vectors[i - 1]],
        [sentence_vectors[i]]
    )[0][0]

    if sim>=threshold:
        current_chunk.append(sentences[i])
    else:
        # Similarity dropped below the threshold: close this chunk, start the next
        chunks.append(" ".join(current_chunk))
        current_chunk=[sentences[i]]

# Append the chunk that was still open when the loop finished
chunks.append(" ".join(current_chunk))

# Output comparison results
print("\n📌 Semantic Chunks:")
for idx, chunk in enumerate(chunks):
    print(f"\nChunk {idx+1}:\n{chunk}")


📌 Semantic Chunks:

Chunk 1:
LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.

Chunk 2:
You can create chains, agents, memory, and retrievers.

Chunk 3:
The Eiffel Tower is located in Paris.

Chunk 4:
France is a popular tourist destination.


### RAG Pipeline Modular Coding

In [None]:
# Import required libraries for RAG pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain.schema.runnable import RunnableLambda, RunnableMap
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os

# Environment setup
import os
from dotenv import load_dotenv

# Pull variables defined in a local .env file into the process environment
load_dotenv()

# Fail fast when the key is missing or empty, before any API call is attempted
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if GOOGLE_API_KEY is None or GOOGLE_API_KEY == "":
    raise ValueError("GOOGLE_API_KEY not found in environment variables")

print("✓ API keys loaded successfully")

✓ API keys loaded successfully


In [None]:
class ThresholdSemanticChunker:
    """
    Custom semantic chunker that splits text based on an embedding similarity threshold.

    Text is cut into sentences at every period. Consecutive sentences stay in the
    same chunk while the cosine similarity of their embeddings is at or above
    the threshold; a lower similarity starts a new chunk.

    Limitation: sentences are detected by splitting on '.', so periods inside
    abbreviations or decimal numbers are also treated as sentence boundaries.

    Args:
        model_name (str): Name of the SentenceTransformer model to use
        threshold (float): Cosine similarity threshold for grouping sentences
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", threshold=0.7):
        """
        Initialize the semantic chunker.

        Args:
            model_name (str): SentenceTransformer model name
            threshold (float): Minimum cosine similarity between two consecutive
                sentences for them to share a chunk
        """
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def split(self, text: str):
        """
        Split text into semantic chunks based on the similarity threshold.

        Args:
            text (str): Input text to be chunked

        Returns:
            list: Chunk strings in document order; each chunk is its sentences
                joined with ". " and terminated with ".". Empty, whitespace-only
                or punctuation-only input yields an empty list.
        """
        # Split text into sentences using period as delimiter, dropping blanks
        sentences = [s.strip() for s in (text or "").split('.') if s.strip()]

        # Nothing to chunk: return early instead of failing on sentences[0]
        if not sentences:
            return []

        # A single sentence is its own chunk; no embedding is needed to decide that
        if len(sentences) == 1:
            return [sentences[0] + "."]

        # Generate embeddings for all sentences in one call
        embeddings = self.model.encode(sentences)

        chunks = []
        current_chunk = [sentences[0]]  # Start with first sentence

        # Compare each sentence with the one directly before it
        for i in range(1, len(sentences)):
            sim = cosine_similarity([embeddings[i - 1]], [embeddings[i]])[0][0]

            if sim >= self.threshold:
                # Similar enough - keep extending the current chunk
                current_chunk.append(sentences[i])
            else:
                # Similarity dropped - finalize current chunk and start a new one
                chunks.append(". ".join(current_chunk) + ".")
                current_chunk = [sentences[i]]

        # Add the chunk that was still open when the loop ended
        chunks.append(". ".join(current_chunk) + ".")
        return chunks

    def split_documents(self, docs):
        """
        Split multiple Document objects into semantic chunks.

        Args:
            docs (list): List of langchain Document objects

        Returns:
            list: One Document per chunk, in input order. Each carries a shallow
                copy of its source document's metadata.
        """
        result = []

        for doc in docs:
            for chunk in self.split(doc.page_content):
                # Copy the metadata so chunks do not all share one dict object;
                # otherwise editing one chunk's metadata would change the
                # source document and every sibling chunk as well.
                result.append(Document(page_content=chunk, metadata=dict(doc.metadata)))

        return result

In [None]:
# Sample input for the chunker: three sentences about LangChain followed by
# two about France, one sentence per line
sample_text = """
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

# Wrap the text in a Document; no metadata is passed here
doc = Document(page_content=sample_text)
doc

Document(metadata={}, page_content='\nLangChain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.\n')

In [None]:
# Run the custom chunker on the sample document.
# Note: from here on `chunks` holds Document objects (not plain strings) and is
# the input used to build the vector store further down.
chunker = ThresholdSemanticChunker(threshold=0.7)  # similarity threshold of 0.7, default model
chunks = chunker.split_documents([doc])  # one Document per semantic chunk
chunks

[Document(metadata={}, page_content='LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={}, page_content='You can create chains, agents, memory, and retrievers.'),
 Document(metadata={}, page_content='The Eiffel Tower is located in Paris.'),
 Document(metadata={}, page_content='France is a popular tourist destination.')]

In [None]:
# Create vector store for RAG pipeline
import os

# Gemini embedding client used to index the chunks; the API key is passed
# explicitly from GOOGLE_API_KEY
embedding = GoogleGenerativeAIEmbeddings(api_key=GOOGLE_API_KEY, model="models/gemini-embedding-001")

# Build a FAISS vector store from the Document chunks using that client
vectorstore = FAISS.from_documents(chunks, embedding)

# Expose the store as a retriever; no arguments are passed, so the library's
# default search settings apply
retriever = vectorstore.as_retriever()

In [None]:
# Prompt text for the RAG chain; it has two placeholders, {context} and
# {question}, which are filled in when the prompt is formatted
template = """Answer the question based on the following context:

{context}

Question: {question}
"""

# Build the PromptTemplate from the string above and display it
prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\n')

In [None]:
# Chat model that writes the final answer
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",    # Gemini 2.0 Flash chat model
    temperature=0,               # Lowest temperature to minimise variation between runs
    max_tokens=None,             # No explicit cap on output length
    timeout=None,                # No explicit request timeout
    max_retries=2,               # Retry failed requests up to two times
)


def format_docs(docs):
    """
    Join retrieved documents into one plain-text context string.

    Without this step the list of Document objects itself is interpolated into
    the prompt, so the model receives the Python repr of the list
    (`[Document(metadata={}, page_content='...'), ...]`) rather than clean text.

    Args:
        docs (list): Document objects returned by the retriever

    Returns:
        str: The documents' page_content values separated by blank lines
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Create RAG chain using LangChain Expression Language (LCEL)
rag_chain = (
    # Step 1: Build the prompt inputs from the incoming {"question": ...} dict
    RunnableMap(
        {
            # Retrieve chunks for the question and flatten them to plain text
            "context": lambda x: format_docs(retriever.invoke(x["question"])),
            # Pass the question through unchanged
            "question": lambda x: x["question"],
        }
    )
    | prompt             # Step 2: Format prompt with context and question
    | llm                # Step 3: Generate response using LLM
    | StrOutputParser()  # Step 4: Parse response to string
)

# Test the complete RAG pipeline
query = {"question": "What is LangChain used for?"}
result = rag_chain.invoke(query)

print(result)

LangChain is a framework for building applications with LLMs. It provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.


### Semantic Chunker with LangChain

In [None]:
# Import LangChain's built-in semantic chunker
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.document_loaders import TextLoader

In [None]:
# Read the text file into LangChain Document objects
loader = TextLoader("langchain-intro.txt")
docs = loader.load()

# Gemini embedding client handed to the built-in chunker
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
    api_key=GOOGLE_API_KEY,
)

# LangChain's experimental SemanticChunker, constructed with only the embedding client
chunker = SemanticChunker(embeddings)

# Split the loaded documents into chunks
chunks = chunker.split_documents(docs)

# Print every chunk's text, numbered from 1
for position, piece in enumerate(chunks, start=1):
    print(f"\nChunk {position}:\n{piece.page_content}")


 chunk 1:
LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone. You can create chains, agents, memory, and retrievers. The Eiffel Tower is located in Paris.

 chunk 2:
France is a popular tourist destination.


In [None]:
# This cell is empty - can be used for additional experiments or notes