! pip install langchain --q
! pip install chromadb --q
BM25 (Best Matching 25) is a ranking function used by search engines to estimate the relevance of documents to a given search query.
! pip install rank_bm25 --q     # BM25 is a refinement of the TF-IDF weighting scheme
! pip install sentence_transformers lark --quiet # for creating embeddings

In [1]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import Chroma


In [2]:
# Toy corpus for the retrieval demo: "apple" appears both as a fruit and as
# a brand name, so keyword and embedding retrievers can disagree on it.
doc_list = [
    # fruit sentences
    "I like apples",
    "I like oranges",
    "Apples and oranges are fruits",
    # Apple-product sentences
    "I am still figuring out how to use Apple's macbook",
    "I don't have Apple's iPhone",
    "Macbook is very smooth and swift",
]


BM25 Retriever (a keyword-based retriever)

In [3]:
# Build the keyword (BM25) retriever directly from the raw strings and cap
# it at the two best-scoring documents per query.
# NOTE(review): the default tokenizer appears to be a plain, case-sensitive
# whitespace split with no stemming. The recorded results for "Apple" and
# "orange" below are identical, which suggests neither query matched any
# token. Consider passing a lowercasing/normalising preprocess_func if the
# installed LangChain version supports it -- confirm against the docs.
bm25_retriever = BM25Retriever.from_texts(doc_list, k=2)


In [4]:
# Keyword lookup for "Apple" against the BM25 retriever; the bare expression
# is what makes the notebook display the returned documents.
bm25_retriever.get_relevant_documents("Apple")


[Document(page_content='Macbook is very smooth and swift'),
 Document(page_content="I don't have Apple's iPhone")]

In [5]:
# Keyword lookup for "orange". NOTE(review): the recorded output below is the
# same as for the "Apple" query and contains no orange-related sentence,
# presumably because "orange" never matches the token "oranges" -- confirm.
bm25_retriever.get_relevant_documents("orange")


[Document(page_content='Macbook is very smooth and swift'),
 Document(page_content="I don't have Apple's iPhone")]

BGE Embedding

In [6]:
import torch  # already installed as a dependency of sentence_transformers
from langchain.embeddings import HuggingFaceBgeEmbeddings

# BGE embedding model identifier passed to HuggingFaceBgeEmbeddings.
model_name = "BAAI/bge-base-en-v1.5"
# Use the GPU when one is visible, otherwise fall back to the CPU so this
# cell still runs on CPU-only machines instead of failing at model load.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)


In [7]:
from langchain.vectorstores import Chroma

# Embed every string in doc_list with the BGE model and index the vectors in
# a Chroma collection. No persist_directory is passed here, so no on-disk
# path is configured (presumably an in-memory store -- confirm in Chroma docs).
db = Chroma.from_texts(texts=doc_list, embedding=embeddings)


In [8]:
# Expose the Chroma store through the retriever interface, limited to the
# top 2 hits per query (k=2).
db_retriever = db.as_retriever(search_kwargs={"k": 2})

# "orange" query against the embedding-backed retriever; the bare expression
# displays the returned documents in the notebook.
db_retriever.get_relevant_documents("orange")


[Document(page_content='I like oranges'),
 Document(page_content='Apples and oranges are fruits')]

In [9]:
# Hybrid retriever: combines the keyword (BM25) and embedding (Chroma)
# retrievers. Weights pair positionally with `retrievers`:
# 0.4 -> bm25_retriever, 0.6 -> db_retriever.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, db_retriever],
    weights=[0.4, 0.6],
)


In [10]:
# Ask the hybrid retriever a natural-language question. The recorded result
# has 3 documents, more than the k=2 requested from the Chroma retriever, so
# the two underlying result lists are evidently merged rather than truncated.
docs = ensemble_retriever.get_relevant_documents("How is macbook?")
docs  # bare expression so the notebook displays the result


[Document(page_content='Macbook is very smooth and swift'),
 Document(page_content="I am still figuring out how to use Apple's macbook"),
 Document(page_content="I don't have Apple's iPhone")]

In [11]:
# Single-keyword query through the hybrid retriever; rebinds `docs` to the
# new result list.
docs = ensemble_retriever.get_relevant_documents("iPhone")
docs  # bare expression so the notebook displays the result


[Document(page_content="I don't have Apple's iPhone"),
 Document(page_content='I like apples'),
 Document(page_content='Macbook is very smooth and swift')]

In [12]:
# Same "orange" query as the individual retriever cells. In the recorded
# output the first two documents equal the Chroma result and the last two
# equal the BM25 result for "orange" shown earlier.
docs = ensemble_retriever.get_relevant_documents("orange")
docs  # bare expression so the notebook displays the result


[Document(page_content='I like oranges'),
 Document(page_content='Apples and oranges are fruits'),
 Document(page_content='Macbook is very smooth and swift'),
 Document(page_content="I don't have Apple's iPhone")]