In [10]:
%pip install -q langchain langchain-community faiss-cpu rank_bm25 langchain-huggingface sentence-transformers

In [2]:
from IPython.display import display, Markdown

def print_docs(docs):
    """Show every document in ``docs`` under a numbered Markdown heading.

    For each item, a bold ``DOCUMENT n`` heading (n counts from 1) is rendered
    through ``display(Markdown(...))``; the item's ``page_content`` is then
    printed as plain text, followed by one blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.
    """
    for number, document in enumerate(docs, start=1):
        heading = Markdown(f"**DOCUMENT {number}**")
        display(heading)
        # The extra newline leaves a blank line between consecutive documents.
        print(document.page_content, end="\n\n")

In [3]:
from langchain_core.documents import Document

# Toy corpus used by both retrievers below. Each row is
# (source, topic, text); the comprehension wraps it in a Document whose
# metadata carries the "source" and "topic" keys.
# NOTE(review): the third entry says "Retriever-Augmented Generation"; the
# usual expansion of RAG is "Retrieval-Augmented Generation". The text is kept
# unchanged here so the recorded cell outputs still match the corpus.
sample_docs = [
    Document(page_content=text, metadata={"source": source, "topic": topic})
    for source, topic, text in (
        (
            "tech_overview",
            "frameworks",
            "LangChain is a framework for developing applications powered by large language models (LLMs).",
        ),
        (
            "retrieval_docs",
            "algorithms",
            "BM25 is a ranking function used by search engines to estimate the relevance of documents to a given search query.",
        ),
        (
            "rag_theory",
            "architecture",
            "The Retriever-Augmented Generation (RAG) architecture combines retrieval systems with generative models to reduce hallucinations.",
        ),
        (
            "retrieval_docs",
            "sparse_retrieval",
            "Sparse retrievers like TF-IDF and BM25 rely on exact keyword matching and term frequency-inverse document frequency (TF-IDF) logic.",
        ),
        (
            "retrieval_docs",
            "dense_retrieval",
            "Dense retrievers use vector embeddings to represent semantic meaning, often using models like Sentence-BERT or OpenAI's text-embedding-3.",
        ),
        (
            "hybrid_search_guide",
            "optimization",
            "Hybrid search combines the results of sparse and dense retrievers to leverage both keyword accuracy and semantic context.",
        ),
        (
            "database_docs",
            "vector_stores",
            "Vector databases like Pinecone, Milvus, and Weaviate are commonly used to store and query high-dimensional embeddings.",
        ),
        (
            "prompt_guide",
            "prompt_engineering",
            "Prompt engineering involves crafting specific instructions to guide an LLM toward producing desired outputs.",
        ),
        (
            "preprocessing_docs",
            "chunking",
            "RecursiveCharacterTextSplitter is a popular LangChain tool for breaking down long documents into smaller, manageable chunks.",
        ),
        (
            "optimization_docs",
            "reranking",
            "Reranking is a post-retrieval step where a secondary model re-evaluates the relevance of the top-k retrieved documents.",
        ),
    )
]

In [4]:
from langchain_community.retrievers import BM25Retriever

# Sparse (keyword-based) retriever built over the sample corpus. `k` is given
# at construction time, so the retriever is fully configured in one statement.
bm25_retriever = BM25Retriever.from_documents(sample_docs, k=5)

In [11]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Dense side: embed the sample documents with the all-MiniLM-L6-v2
# sentence-transformers model and index the resulting vectors with FAISS.
embedding_fn = HuggingFaceEmbeddings(
    model="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstore = FAISS.from_documents(
    documents=sample_docs,
    embedding=embedding_fn,
)

In [6]:
# Wrap the FAISS store as a retriever; "k" is forwarded as a search keyword
# argument and uses the same value (5) as the BM25 retriever.
vector_retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=5),
)

In [8]:
from langchain_classic.retrievers import EnsembleRetriever

# Hybrid retriever: combine the BM25 and vector retrievers. Weights are listed
# in the same order as the retrievers: 0.35 for BM25, 0.65 for the vector one.
retriever = EnsembleRetriever(
    weights=[0.35, 0.65],
    retrievers=[bm25_retriever, vector_retriever],
)

In [9]:
# Send one query through the hybrid retriever, then render every hit.
query = "How does BM25 work with sparse retrievers?"
results = retriever.invoke(query)

print_docs(results)

**DOCUMENT 1**

Sparse retrievers like TF-IDF and BM25 rely on exact keyword matching and term frequency-inverse document frequency (TF-IDF) logic.



**DOCUMENT 2**

Hybrid search combines the results of sparse and dense retrievers to leverage both keyword accuracy and semantic context.



**DOCUMENT 3**

BM25 is a ranking function used by search engines to estimate the relevance of documents to a given search query.



**DOCUMENT 4**

Dense retrievers use vector embeddings to represent semantic meaning, often using models like Sentence-BERT or OpenAI's text-embedding-3.



**DOCUMENT 5**

Reranking is a post-retrieval step where a secondary model re-evaluates the relevance of the top-k retrieved documents.



**DOCUMENT 6**

The Retriever-Augmented Generation (RAG) architecture combines retrieval systems with generative models to reduce hallucinations.



**DOCUMENT 7**

RecursiveCharacterTextSplitter is a popular LangChain tool for breaking down long documents into smaller, manageable chunks.

