## Hybrid Retriever- Combining Dense And Sparse Retriever

In [3]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers import EnsembleRetriever
from langchain.schema import Document
from langchain_community.retrievers import BM25Retriever

In [1]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader
)

In [None]:
# Step1: Loaders
print("\n PyMuPDFLoader")
try:
    loader = PyMuPDFLoader("bengal.pdf")
    docs = loader.load()

    print(f"  Loaded {len(docs)} pages")
    print(f"  Includes detailed metadata")
    print(docs)
except Exception as e:
    print(f"  Error: {e}")


 PyMuPDFLoader
  Loaded 5 pages
  Includes detailed metadata
[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-09-14T11:37:04+05:30', 'source': 'bengal.pdf', 'file_path': 'bengal.pdf', 'total_pages': 5, 'format': 'PDF 1.7', 'title': '', 'author': 'Saikat Santra', 'subject': '', 'keywords': '', 'moddate': '2025-09-14T11:37:04+05:30', 'trapped': '', 'modDate': "D:20250914113704+05'30'", 'creationDate': "D:20250914113704+05'30'", 'page': 0}, page_content='Bengal: A Historical, Cultural, and Socio-Economic Study \nIntroduction \nBengal, one of the most historically significant regions of South Asia, is today divided \ninto the Indian state of West Bengal and the sovereign nation of Bangladesh. With a \ncombined population of over 250 million people, Bengal represents one of the most \ndensely populated and culturally vibrant areas of the world. For centuries, the region \nhas been known as a land of a

In [16]:
#step 2 : Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,    # size of each chunk
    chunk_overlap=80   # overlap to maintain context
)

# Split documents into chunks
docs = text_splitter.split_documents(docs)
print(f"Total chunks: {len(docs)}")

Total chunks: 31


In [17]:
for i, chunk in enumerate(docs[:2]):  # Display first 2 chunks
    print(f"\n--- Chunk {i+1} ---")
    print(f"Content: {chunk.page_content[:200]}...")  # Print first 200 characters
    print(f"Metadata: {chunk.metadata}")


--- Chunk 1 ---
Content: Bengal: A Historical, Cultural, and Socio-Economic Study 
Introduction 
Bengal, one of the most historically significant regions of South Asia, is today divided 
into the Indian state of West Bengal a...
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-09-14T11:37:04+05:30', 'source': 'bengal.pdf', 'file_path': 'bengal.pdf', 'total_pages': 5, 'format': 'PDF 1.7', 'title': '', 'author': 'Saikat Santra', 'subject': '', 'keywords': '', 'moddate': '2025-09-14T11:37:04+05:30', 'trapped': '', 'modDate': "D:20250914113704+05'30'", 'creationDate': "D:20250914113704+05'30'", 'page': 0}

--- Chunk 2 ---
Content: densely populated and culturally vibrant areas of the world. For centuries, the region 
has been known as a land of abundance, thanks to the fertility of the Ganges-
Brahmaputra delta. It has also bee...
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microso

In [18]:
# Step 3: Dense Retriever (FAISS + HuggingFace)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
dense_vectorstore = FAISS.from_documents(docs, embedding_model)
dense_retriever = dense_vectorstore.as_retriever()

In [20]:
### Sparse Retriever(BM25)
sparse_retriever=BM25Retriever.from_documents(docs)
sparse_retriever.k=3 ##top- k documents to retriever

## step 4 : Combine with Ensemble Retriever
hybrid_retriever=EnsembleRetriever(
    retrievers=[dense_retriever,sparse_retriever],
    weight=[0.7,0.3]
)


In [21]:
hybrid_retriever


EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000021E66B0B0D0>, search_kwargs={}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x0000021E151C5090>, k=3)], weights=[0.5, 0.5])

In [22]:
# Step 5: Query and get results
query = "what is history of bengal?"
results = hybrid_retriever.invoke(query)

# Step 6: Print results
for i, doc in enumerate(results):
    print(f"\n🔹 Document {i+1}:\n{doc.page_content}")


🔹 Document 1:
history reflects cycles of prosperity and exploitation, creativity and struggle, unity and 
division. Whether in the streets of Kolkata or the villages of Bangladesh, Bengal remains 
alive with vibrant traditions, linguistic pride, and social dynamism. 
The story of Bengal is not merely regional; it is global. From Tagore’s poetry to the

🔹 Document 2:
Bengal. It explores Bengal’s history, geography, culture, economy, political 
developments, and the challenges faced in the modern era. The discussion emphasizes 
Bengal’s global significance as a land that has nurtured poets, philosophers, 
revolutionaries, and reformers while also enduring famines, partitions, and struggles for 
identity. 
 
Historical Background 
Ancient Bengal

🔹 Document 3:
Bengal: A Historical, Cultural, and Socio-Economic Study 
Introduction 
Bengal, one of the most historically significant regions of South Asia, is today divided 
into the Indian state of West Bengal and the sovereign nation of Bang

### RAG Pipeline with hybrid retriever

In [None]:
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate

In [27]:
from dotenv import load_dotenv
load_dotenv()
import os

In [26]:
from langchain.chat_models import init_chat_model

# Example 1: using init_chat_model
llm = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    temperature=0.2,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)
res = llm.invoke("Hello, how are you today?")
print(res.content)

Hello! I'm doing well, thank you for asking. How are you today?


In [30]:
#advance RAG pipeline with hybrid retriever and history awareness
from langchain.chains import create_history_aware_retriever
from langchain_core.messages import HumanMessage, AIMessage
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

In [31]:
## create a prompt that includes the chat history
contextualize_q_system_prompt = """Given a chat history and the latest user question 
which might reference context in the chat history, formulate a standalone question 
which can be understood without the chat history. Do NOT answer the question, 
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

In [32]:
## create history aware retriever
history_aware_retriever = create_history_aware_retriever(
    llm, hybrid_retriever, contextualize_q_prompt
)
history_aware_retriever

RunnableBinding(bound=RunnableBranch(branches=[(RunnableLambda(lambda x: not x.get('chat_history', False)), RunnableLambda(lambda x: x['input'])
| EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000021E66B0B0D0>, search_kwargs={}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x0000021E151C5090>, k=3)], weights=[0.5, 0.5]))], default=ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annot

In [None]:

from langchain.schema.runnable import RunnablePassthrough
# --------------------------
# Question-Answer Prompt
# --------------------------
qa_system_prompt = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Context: {context}"""

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}")
])


In [48]:
from langchain_core.runnables import RunnablePassthrough

# Helper to format retrieved docs
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

# Correct mapping for conversational RAG
conversational_rag_chain = (
    {
        "context": history_aware_retriever | format_docs,
        "chat_history": (lambda x: x["chat_history"]),  # ensure only list is passed
        "input": (lambda x: x["input"])
    }
    | qa_prompt  # your ChatPromptTemplate
    | llm
    | StrOutputParser()
)


In [49]:
chat_history = []

print("🤖 Conversational RAG Assistant (type 'exit' to quit)\n")

while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit", "bye"]:
        print("Assistant: Goodbye! 👋")
        break

    result = conversational_rag_chain.invoke({
        "chat_history": chat_history,
        "input": user_input
    })

    print(f"Assistant: {result}\n")

    # update history
    chat_history.extend([
        HumanMessage(content=user_input),
        AIMessage(content=result)
    ])


🤖 Conversational RAG Assistant (type 'exit' to quit)

Assistant: Hello! How can I help you today?

Assistant: Bengal has a rich and complex history, with its earliest references appearing in ancient Indian texts and archaeological evidence suggesting urban settlements from the 4th century BCE. For centuries, it has been a site of conflict, colonial exploitation, intellectual renaissance, and political transformation, enduring famines, partitions, and struggles for identity. Today, the region is divided into the Indian state of West Bengal and the sovereign nation of Bangladesh, known for nurturing poets, philosophers, revolutionaries, and reformers.

Assistant: The provided context mentions that Calcutta (now Kolkata) became the capital of British India until 1911. However, it does not explicitly state the capital of the Indian state of West Bengal in its current form.

Assistant: Goodbye! 👋
