### Semantic Chunking

- SemanticChunker is a document splitter that uses similarity between sentences to decide chunk boundaries.
- It ensures that each chunk is semantically coherent and not cut off mid-thought, as can happen with traditional character/token splitters.

In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### 1. Document Segmentation

In [10]:
# Load the sentence-embedding model used to compare neighbouring sentences
model = SentenceTransformer("all-MiniLM-L6-v2")


## Sample text
text = """ 
Langchain is a framework for building applications with LLMs
Langchain provides modular abstraction to combine LLMs with tools like OpenAI and Pinecone.
You can create chains,agents,memory and retrievers
The Eiffel Tower is located in Paris
France is a popular tourist destination
"""

# Step 1: treat every non-blank line as one sentence
sentences = [line for line in map(str.strip, text.split("\n")) if line]

# Step 2: one embedding vector per sentence
embeddings = model.encode(sentences)

# Step 3: grouping state
threshold = 0.7  # minimum similarity to the previous sentence to stay in the same chunk
chunks = []
current_chunk = [sentences[0]]

# Step 4: walk over consecutive sentence pairs and either extend the current
# chunk or close it and start a new one
for prev_vec, cur_vec, sentence in zip(embeddings[:-1], embeddings[1:], sentences[1:]):
    sim = cosine_similarity([prev_vec], [cur_vec])[0][0]

    if sim >= threshold:
        current_chunk.append(sentence)
        continue

    chunks.append(" ".join(current_chunk))
    current_chunk = [sentence]

# The chunk still being built when the loop ends has not been stored yet
chunks.append(" ".join(current_chunk))

# Output
for idx, chunk in enumerate(chunks):
    print(f"\n chunk {idx+1}:\n{chunk}")







 chunk 1:
Langchain is a framework for building applications with LLMs Langchain provides modular abstraction to combine LLMs with tools like OpenAI and Pinecone.

 chunk 2:
You can create chains,agents,memory and retrievers

 chunk 3:
The Eiffel Tower is located in Paris

 chunk 4:
France is a popular tourist destination


### Building a RAG Pipeline with Semantic Chunker (Modular Coding)

In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.documents import Document
from langchain_community.vectorstores.faiss import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain_core.runnables import RunnableLambda, RunnableMap
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


In [None]:
import os
from getpass import getpass

# Never hardcode the key in the notebook, and never overwrite a key that is
# already configured: prompt (without echoing) only when GROQ_API_KEY is
# missing or empty in the environment.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your GROQ_API_KEY: ")

In [11]:
### custom Semantic Chunker with threshold

class ThresholdSemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    The text is split into sentences on newlines. Each sentence is embedded,
    and a sentence is appended to the current chunk when its cosine similarity
    to the *previous* sentence is at least ``threshold``; otherwise the current
    chunk is closed and a new one is started.
    """

    def __init__(self,model_name="all-MiniLM-L6-v2",threshold=0.7):
        """Load the embedding model and store the similarity threshold.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            threshold: Minimum cosine similarity between neighbouring
                sentences for them to share a chunk.
        """
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def split(self, text: str):
        """Split ``text`` into a list of chunk strings.

        Args:
            text: Input text with one sentence per line; blank lines are ignored.

        Returns:
            A list of chunks, each the space-joined sentences of one group.
            Empty or whitespace-only input yields an empty list.
        """
        sentences = [s.strip() for s in text.split("\n") if s.strip()]
        # Nothing to embed or group: avoid indexing sentences[0] on an empty list.
        if not sentences:
            return []

        embeddings = self.model.encode(sentences)
        chunks = []
        current_chunk = [sentences[0]]

        for i in range(1, len(sentences)):
            # Only adjacent sentences are compared, not the whole current chunk.
            sim = cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]
            if sim >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk = [sentences[i]]

        # The chunk still being built when the loop ends has not been stored yet.
        chunks.append(" ".join(current_chunk))
        return chunks

    def split_documents(self, docs):
        """Split each document's ``page_content`` into chunk documents.

        Args:
            docs: Iterable of objects exposing ``page_content`` and ``metadata``.

        Returns:
            A flat list of ``Document`` objects, one per chunk, in input order.
            Each chunk gets its own shallow copy of the source metadata so that
            editing one chunk's metadata does not affect its siblings or the
            source document.
        """
        result = []
        for doc in docs:
            for chunk in self.split(doc.page_content):
                result.append(
                    Document(page_content=chunk, metadata=dict(doc.metadata))
                )

        return result
    



In [12]:
## Sample text
# Five newline-separated sentences: the first three are about LangChain,
# the last two about Paris/France, so the chunker has a topic shift to find.
sample_text = """ 
Langchain is a framework for building applications with LLMs
Langchain provides modular abstraction to combine LLMs with tools like OpenAI and Pinecone.
You can create chains,agents,memory and retrievers
The Eiffel Tower is located in Paris
France is a popular tourist destination
"""

# Wrap the raw text in a Document (no metadata given) so it can be passed to
# split_documents; the bare `doc` on the last line displays it.
doc = Document(page_content=sample_text)
doc

Document(metadata={}, page_content=' \nLangchain is a framework for building applications with LLMs\nLangchain provides modular abstraction to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains,agents,memory and retrievers\nThe Eiffel Tower is located in Paris\nFrance is a popular tourist destination\n')

In [13]:
# Chunking: split the sample Document with the custom chunker, using a
# similarity threshold of 0.7.

chunker = ThresholdSemanticChunker(threshold=0.7)
# split_documents takes a list of documents, hence the one-element list.
chunks=chunker.split_documents([doc])
chunks



[Document(metadata={}, page_content='Langchain is a framework for building applications with LLMs Langchain provides modular abstraction to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={}, page_content='You can create chains,agents,memory and retrievers'),
 Document(metadata={}, page_content='The Eiffel Tower is located in Paris'),
 Document(metadata={}, page_content='France is a popular tourist destination')]

In [15]:
### Vector store
from langchain_huggingface import HuggingFaceEmbeddings


## Initialize HuggingFace embeddings (no API key required)
## Nothing is trained in this cell; the model named below is used as-is.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print(hf_embeddings)

# Embed the chunk Documents produced above and index them in a FAISS store.
vector_store=FAISS.from_documents(chunks,hf_embeddings)

model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} query_encode_kwargs={} multi_process=False show_progress=False


In [16]:
# Expose the FAISS store as a retriever; no arguments are passed, so the
# library defaults apply.
retriever=vector_store.as_retriever()

In [18]:
### Prompt template
# The template has two placeholders, {context} and {question}; both must be
# supplied when the prompt is formatted.

template = """Answer the question based in the following context:

{context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)

# Display the PromptTemplate to confirm the detected input variables.
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based in the following context:\n\n{context}\n\nQuestion: {question}\n')

In [26]:
llm = init_chat_model(model="groq:llama-3.1-8b-instant",temperature=0.4)


def format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


## LCEL chain with retrieval
# The retriever returns a list of Document objects. Without format_docs the
# prompt would interpolate that list's Python repr (metadata included) into
# {context} instead of the plain text of the retrieved chunks.
rag_chain = (
    RunnableMap(
        {
            "context": lambda x: format_docs(retriever.invoke(x["question"])),
            "question": lambda x: x["question"],
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)


query = {"question": "What is Langchain used for?"}
result = rag_chain.invoke(query)

print(result)

Based on the provided context, Langchain is a framework for building applications with Large Language Models (LLMs). It provides a modular abstraction to combine LLMs with tools like OpenAI and Pinecone.


### Semantic Chunker With Langchain

In [27]:
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

from langchain_community.document_loaders import TextLoader

In [30]:
## Load the documents
# loader.load() already returns a list of Document objects.
loader = TextLoader("langchain_intro.txt")
docs = loader.load()

## Initialize the embedding model
# NOTE: OpenAIEmbeddings needs OPENAI_API_KEY set in the environment (or an
# api_key passed explicitly); otherwise construction raises OpenAIError.
embeddings = OpenAIEmbeddings()


## Create the semantic chunker
chunker = SemanticChunker(embeddings)
# Pass the list itself: wrapping it as [docs] would hand the splitter a list
# where it expects a Document.
chunks = chunker.split_documents(docs)

## Show each chunk
for i, chunk in enumerate(chunks):
    print(f"\n chunk {i+1}:\n{chunk.page_content}")

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable