In [16]:
%pip install -q langchain langchain-community langchain-groq faiss-cpu rank_bm25 langchain-huggingface sentence-transformers

In [2]:
from IPython.display import display, Markdown

def print_docs(docs):
    """Show every document in ``docs`` under a bold, 1-based Markdown heading.

    Each heading is rendered through IPython's rich display; the document's
    ``page_content`` is then printed as plain text followed by a blank line.
    """
    for position, entry in enumerate(docs, start=1):
        heading = Markdown(f"**DOCUMENT {position}**")
        display(heading)
        print(entry.page_content, end="\n\n")

In [3]:
from langchain_core.documents import Document

# Toy corpus for the retrieval demo. Each row is (page_content, source, topic);
# the comprehension turns every row into a Document whose metadata carries the
# "source" and "topic" keys.
sample_docs = [
    Document(page_content=text, metadata={"source": source, "topic": topic})
    for text, source, topic in (
        (
            "LangChain is a framework for developing applications powered by large language models (LLMs).",
            "tech_overview",
            "frameworks",
        ),
        (
            "BM25 is a ranking function used by search engines to estimate the relevance of documents to a given search query.",
            "retrieval_docs",
            "algorithms",
        ),
        (
            "The Retriever-Augmented Generation (RAG) architecture combines retrieval systems with generative models to reduce hallucinations.",
            "rag_theory",
            "architecture",
        ),
        (
            "Sparse retrievers like TF-IDF and BM25 rely on exact keyword matching and term frequency-inverse document frequency (TF-IDF) logic.",
            "retrieval_docs",
            "sparse_retrieval",
        ),
        (
            "Dense retrievers use vector embeddings to represent semantic meaning, often using models like Sentence-BERT or OpenAI's text-embedding-3.",
            "retrieval_docs",
            "dense_retrieval",
        ),
        (
            "Hybrid search combines the results of sparse and dense retrievers to leverage both keyword accuracy and semantic context.",
            "hybrid_search_guide",
            "optimization",
        ),
        (
            "Vector databases like Pinecone, Milvus, and Weaviate are commonly used to store and query high-dimensional embeddings.",
            "database_docs",
            "vector_stores",
        ),
        (
            "Prompt engineering involves crafting specific instructions to guide an LLM toward producing desired outputs.",
            "prompt_guide",
            "prompt_engineering",
        ),
        (
            "RecursiveCharacterTextSplitter is a popular LangChain tool for breaking down long documents into smaller, manageable chunks.",
            "preprocessing_docs",
            "chunking",
        ),
        (
            "Reranking is a post-retrieval step where a secondary model re-evaluates the relevance of the top-k retrieved documents.",
            "optimization_docs",
            "reranking",
        ),
    )
]

In [17]:
from langchain_community.retrievers import BM25Retriever

# Keyword (sparse) retriever over the sample corpus; `k` is supplied at
# construction time instead of being assigned on the instance afterwards.
bm25_retriever = BM25Retriever.from_documents(sample_docs, k=5)

In [18]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding function backed by the sentence-transformers "all-MiniLM-L6-v2" model.
embedding_fn = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")
# Build a FAISS vector store from the sample documents using that embedding function.
vectorstore = FAISS.from_documents(sample_docs, embedding=embedding_fn)

In [6]:
# Dense (vector) retriever over the FAISS store; k=5 matches the k set on bm25_retriever.
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [7]:
from langchain_classic.retrievers import MergerRetriever

# Wrap the sparse (BM25) and dense (vector) retrievers in one retriever so a
# single call queries both.
# NOTE(review): MergerRetriever presumably interleaves the two result lists
# without de-duplicating them — confirm against the library documentation.
merger_retriever = MergerRetriever(
    retrievers=[bm25_retriever, vector_retriever]
)

In [8]:
from langchain_groq import ChatGroq
from google.colab import userdata

# Chat model served through Groq; the chains below use it to generate search
# query variations.
# The API key is read from Colab's `userdata` store, so this cell assumes a
# Google Colab runtime with a secret named GROQ_API_KEY.
llm = ChatGroq(
    model="openai/gpt-oss-120b",
    api_key=userdata.get('GROQ_API_KEY')
)

In [9]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt asking the model for three alternative search queries, one per line
# (the chains below split the answer on newlines); `{question}` is the
# template variable filled in at invocation time.
query_prompt = ChatPromptTemplate.from_template(
    "Generate 3 different search queries separated by new lines to retrieve relevant documents for: {question}"
)

In [11]:
from langchain_core.output_parsers import StrOutputParser

# Quick check of the query-generation step:
# prompt -> LLM -> plain text -> list of search queries.
sample_chain = (
    query_prompt
    | llm
    | StrOutputParser()
    # Model output can carry trailing spaces or blank separator lines; strip
    # each line and drop empties so only usable queries remain.
    | (lambda text: [line.strip() for line in text.splitlines() if line.strip()])
)

sample_chain.invoke({"question": "How does BM25 work with sparse retrievers?"})

['BM25 sparse retrieval mechanism and its integration with modern sparse retrievers  ',
 'How BM25 scoring is applied in sparse neural retrievers for information retrieval  ',
 'Combining traditional BM25 with sparse embedding models for document ranking']

In [12]:
from langchain_core.output_parsers import StrOutputParser

def _split_queries(text):
    """Turn the model's newline-separated answer into a clean list of queries.

    Surrounding whitespace is stripped from every line and blank lines are
    dropped, so no empty or space-padded query is passed downstream.
    """
    return [line.strip() for line in text.splitlines() if line.strip()]


# Query generation: prompt -> LLM -> plain text -> list of search queries.
query_gen_chain = (
    query_prompt
    | llm
    | StrOutputParser()
    | _split_queries
)

In [13]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """
    Fuse several ranked lists of Documents into a single re-ranked list.

    Every appearance of a document at 0-based position ``rank`` within a list
    contributes ``1 / (rank + k)`` to that document's score. Documents are
    identified by their ``page_content``; the first Document object seen for a
    given content is the one that is returned.

    Returns the Documents ordered by descending fused score; documents with
    equal scores keep their first-seen order.
    """
    score_by_content = {}   # page_content -> accumulated RRF score
    doc_by_content = {}     # page_content -> first Document seen with it

    for ranking in results:
        position = 0
        for candidate in ranking:
            content_key = candidate.page_content
            doc_by_content.setdefault(content_key, candidate)
            contribution = 1 / (position + k)
            score_by_content[content_key] = score_by_content.get(content_key, 0) + contribution
            position += 1

    # The sort is stable, so ties stay in insertion (first-seen) order.
    ordered_keys = sorted(score_by_content, key=score_by_content.get, reverse=True)
    return [doc_by_content[content_key] for content_key in ordered_keys]

In [14]:
from langchain_core.runnables import RunnablePassthrough

# End-to-end RAG-fusion retrieval:
#   1. wrap the chain input under the "question" key used by the prompt,
#   2. generate a list of search queries,
#   3. run merger_retriever once per generated query (.map()),
#   4. fuse the per-query result lists with reciprocal rank fusion.
# Because step 1 does the wrapping, the chain input should be the raw
# question text rather than a {"question": ...} dict.
rag_fusion_chain = (
    {"question": RunnablePassthrough()}
    | query_gen_chain
    | merger_retriever.map()
    | reciprocal_rank_fusion
)

In [15]:
question = 'How does BM25 work with sparse retrievers?'

# rag_fusion_chain already wraps its input as {"question": ...}, so pass the
# raw string; passing a dict would nest it and put the dict's repr into the
# prompt instead of the question text.
results = rag_fusion_chain.invoke(question)
print_docs(results)

**DOCUMENT 1**

Sparse retrievers like TF-IDF and BM25 rely on exact keyword matching and term frequency-inverse document frequency (TF-IDF) logic.



**DOCUMENT 2**

Hybrid search combines the results of sparse and dense retrievers to leverage both keyword accuracy and semantic context.



**DOCUMENT 3**

BM25 is a ranking function used by search engines to estimate the relevance of documents to a given search query.



**DOCUMENT 4**

The Retriever-Augmented Generation (RAG) architecture combines retrieval systems with generative models to reduce hallucinations.



**DOCUMENT 5**

Reranking is a post-retrieval step where a secondary model re-evaluates the relevance of the top-k retrieved documents.



**DOCUMENT 6**

Dense retrievers use vector embeddings to represent semantic meaning, often using models like Sentence-BERT or OpenAI's text-embedding-3.



**DOCUMENT 7**

Vector databases like Pinecone, Milvus, and Weaviate are commonly used to store and query high-dimensional embeddings.



**DOCUMENT 8**

RecursiveCharacterTextSplitter is a popular LangChain tool for breaking down long documents into smaller, manageable chunks.



**DOCUMENT 9**

LangChain is a framework for developing applications powered by large language models (LLMs).

