### Semantic Chunking

- Semantic chunking is a document splitter that uses embedding similarity between sentences to decide chunk boundaries

- It ensures that each chunk is semantically coherent & not cutoff midthought like traditional character/token splitter

In [27]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [28]:
model = SentenceTransformer("all-MiniLM-L6-v2")

##Sample text
text = """
Langchain is a framework for building applications with LLMs.
Lanchain provides modular architecture abstractions to combine LLMs with tools like OpenAI & PineCone
You can create chains, agents, memory & retrievers
The Eiffel Tower is located in paris
France is a popular tourist destination 
"""

##Step1: Splitting in to sentences 
sentences=[s.strip() for s in text.split("\n") if s.strip()]

## Step2: embed each sentence
embeddings=model.encode(sentences)

## Step3: Initialize parameters
threshold=0.7
chunks=[]
current_chunks=[sentences[0]]

## step4: Semantic grouping based on threshold
for i in range(1, len(sentences)):
    sim=cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]

    if sim >= threshold:
        current_chunks.append(sentences[i])
    else:
        chunks.append(current_chunks)
        current_chunks=[sentences[i]]

# Append the last chunk
chunks.append(" ".join(current_chunks))

#output the chunks
print("\n Semantic chunks")
for idx,chunk in enumerate(chunks):
    print(f"\n Chunk {idx+1}:\n{chunk}")




 Semantic chunks

 Chunk 1:
['Langchain is a framework for building applications with LLMs.']

 Chunk 2:
['Lanchain provides modular architecture abstractions to combine LLMs with tools like OpenAI & PineCone']

 Chunk 3:
['You can create chains, agents, memory & retrievers']

 Chunk 4:
['The Eiffel Tower is located in paris']

 Chunk 5:
France is a popular tourist destination


### RAG Pipeline with Modular Coding

In [29]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain.schema.runnable import RunnableLambda, RunnableMap
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os

In [30]:
from dotenv import load_dotenv
load_dotenv()
# os.setenv['GROQ_API_Key'] = os.getenv('GROQ_API_KEY')

True

##### STEP1: Advanced RAG - -- Custom-Semantic-Chunker

In [53]:
### Custom Semantic Chunker With Threshold

class ThresholdSematicChunker:
    def __init__(self,model_name="all-MiniLM-L6-v2",threshold=0.7):
        self.model=SentenceTransformer(model_name)
        self.threshold=threshold 

    def split(self, text: str):
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        embeddings = self.model.encode(sentences)
        chunks = []
        current_chunk = [sentences[0]]

        for i in range(1, len(sentences)):
            sim = cosine_similarity([embeddings[i - 1]], [embeddings[i]])[0][0]
            if sim >= self.threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(". ".join(current_chunk) + ".")
                current_chunk = [sentences[i]]

        chunks.append(". ".join(current_chunk) + ".")
        return chunks
    
    def split_documents(self,docs):
        result=[]
        for doc in docs:
            for chunk in self.split(doc.page_content):
                result.append(Document(page_content=chunk, metadata=doc.metadata))

        return result

    

In [54]:
# Sample Text
sample_text="""
LangChain is a framework for building applications with LLMs.
Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
You can create chains, agents, memory, and retrievers.
The Eiffel Tower is located in Paris.
France is a popular tourist destination.
"""

doc=Document(page_content=sample_text)
doc

Document(metadata={}, page_content='\nLangChain is a framework for building applications with LLMs.\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory, and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination.\n')

In [55]:
### Chunking
chunker=ThresholdSematicChunker(threshold=0.7)
chunks=chunker.split_documents([doc])
chunks

[Document(metadata={}, page_content='LangChain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={}, page_content='You can create chains, agents, memory, and retrievers.'),
 Document(metadata={}, page_content='The Eiffel Tower is located in Paris.'),
 Document(metadata={}, page_content='France is a popular tourist destination.')]

In [None]:
## VectorStore
import os
from dotenv import load_dotenv
load_dotenv()


vectorstore=FAISS.from_documents(chunks,chunker.model)
retriever=vectorstore.as_retriever()

AttributeError: 'SentenceTransformer' object has no attribute 'embed_documents'

In [57]:
### Prompt Template

# ---5. Prompt Template
template="""Answer the question based on the following the context:

{context}

Question: {question}
"""

prompt=PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on the following the context:\n\n{context}\n\nQuestion: {question}\n')

In [64]:
## LLM

openAILLM = ("openai/gpt-oss-120b")
llm=init_chat_model(
    model=openAILLM,
    model_provider="openai",
    api_key=os.getenv("GROQ_API_KEY"),
    base_url="https://api.groq.com/openai/v1",
    temperature=0.6
)

### LCEL - LangChainExpressionLanguage
rag_chain=(
    RunnableMap(
        {
            "context": lambda x: retriever.invoke(x["question"]),
            "question": lambda x: x["question"]
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

# ---- 8.Run Query -----
query={"question": "What is langchain used for"}
result=rag_chain.invoke(query)

print(result)


NameError: name 'retriever' is not defined