## Hybrid Retriever- Combining Dense And Sparse Retriever

In [3]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers import EnsembleRetriever
from langchain.schema import Document
from langchain_community.retrievers import BM25Retriever

In [1]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader
)

In [None]:
# Step1: Loaders
print("\n PyMuPDFLoader")
try:
    loader = PyMuPDFLoader("bengal.pdf")
    docs = loader.load()

    print(f"  Loaded {len(docs)} pages")
    print(f"  Includes detailed metadata")
    print(docs)
except Exception as e:
    print(f"  Error: {e}")


 PyMuPDFLoader
  Loaded 5 pages
  Includes detailed metadata
[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-09-14T11:37:04+05:30', 'source': 'bengal.pdf', 'file_path': 'bengal.pdf', 'total_pages': 5, 'format': 'PDF 1.7', 'title': '', 'author': 'Saikat Santra', 'subject': '', 'keywords': '', 'moddate': '2025-09-14T11:37:04+05:30', 'trapped': '', 'modDate': "D:20250914113704+05'30'", 'creationDate': "D:20250914113704+05'30'", 'page': 0}, page_content='Bengal: A Historical, Cultural, and Socio-Economic Study \nIntroduction \nBengal, one of the most historically significant regions of South Asia, is today divided \ninto the Indian state of West Bengal and the sovereign nation of Bangladesh. With a \ncombined population of over 250 million people, Bengal represents one of the most \ndensely populated and culturally vibrant areas of the world. For centuries, the region \nhas been known as a land of a

In [16]:
#step 2 : Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,    # size of each chunk
    chunk_overlap=80   # overlap to maintain context
)

# Split documents into chunks
docs = text_splitter.split_documents(docs)
print(f"Total chunks: {len(docs)}")

Total chunks: 31


In [17]:
for i, chunk in enumerate(docs[:2]):  # Display first 2 chunks
    print(f"\n--- Chunk {i+1} ---")
    print(f"Content: {chunk.page_content[:200]}...")  # Print first 200 characters
    print(f"Metadata: {chunk.metadata}")


--- Chunk 1 ---
Content: Bengal: A Historical, Cultural, and Socio-Economic Study 
Introduction 
Bengal, one of the most historically significant regions of South Asia, is today divided 
into the Indian state of West Bengal a...
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-09-14T11:37:04+05:30', 'source': 'bengal.pdf', 'file_path': 'bengal.pdf', 'total_pages': 5, 'format': 'PDF 1.7', 'title': '', 'author': 'Saikat Santra', 'subject': '', 'keywords': '', 'moddate': '2025-09-14T11:37:04+05:30', 'trapped': '', 'modDate': "D:20250914113704+05'30'", 'creationDate': "D:20250914113704+05'30'", 'page': 0}

--- Chunk 2 ---
Content: densely populated and culturally vibrant areas of the world. For centuries, the region 
has been known as a land of abundance, thanks to the fertility of the Ganges-
Brahmaputra delta. It has also bee...
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microso

In [18]:
# Step 3: Dense Retriever (FAISS + HuggingFace)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
dense_vectorstore = FAISS.from_documents(docs, embedding_model)
dense_retriever = dense_vectorstore.as_retriever()

In [20]:
### Sparse Retriever(BM25)
sparse_retriever=BM25Retriever.from_documents(docs)
sparse_retriever.k=3 ##top- k documents to retriever

## step 4 : Combine with Ensemble Retriever
hybrid_retriever=EnsembleRetriever(
    retrievers=[dense_retriever,sparse_retriever],
    weight=[0.7,0.3]
)


In [21]:
hybrid_retriever


EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000021E66B0B0D0>, search_kwargs={}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x0000021E151C5090>, k=3)], weights=[0.5, 0.5])

In [22]:
# Step 5: Query and get results
query = "what is history of bengal?"
results = hybrid_retriever.invoke(query)

# Step 6: Print results
for i, doc in enumerate(results):
    print(f"\n🔹 Document {i+1}:\n{doc.page_content}")


🔹 Document 1:
history reflects cycles of prosperity and exploitation, creativity and struggle, unity and 
division. Whether in the streets of Kolkata or the villages of Bangladesh, Bengal remains 
alive with vibrant traditions, linguistic pride, and social dynamism. 
The story of Bengal is not merely regional; it is global. From Tagore’s poetry to the

🔹 Document 2:
Bengal. It explores Bengal’s history, geography, culture, economy, political 
developments, and the challenges faced in the modern era. The discussion emphasizes 
Bengal’s global significance as a land that has nurtured poets, philosophers, 
revolutionaries, and reformers while also enduring famines, partitions, and struggles for 
identity. 
 
Historical Background 
Ancient Bengal

🔹 Document 3:
Bengal: A Historical, Cultural, and Socio-Economic Study 
Introduction 
Bengal, one of the most historically significant regions of South Asia, is today divided 
into the Indian state of West Bengal and the sovereign nation of Bang