#### Tuesday, December 19, 2023

[Advanced RAG 03 - Hybrid Search BM25 & Ensembles](https://www.youtube.com/watch?v=lYxGYXjfrNI&t=5s)

https://colab.research.google.com/drive/1lsT1V_U1Gq-jv09wv0ok5QHdyRjJyNxm?usp=sharing

This uses OpenAI: $4.64 / $38.00

This does not use the local GPU.

This all runs.

In [None]:
!pip -q install langchain huggingface_hub openai google-search-results tiktoken chromadb rank_bm25 faiss-cpu

17.6/17.6 MB 73.6 MB/s eta 0:00:00

In [1]:
import os
from getpass import getpass

# Ask for the key interactively (input is not echoed) and publish it through
# the process environment under the name OPENAI_API_KEY.
OPENAI_API_KEY = getpass("Enter your API key: ")
os.environ.update(OPENAI_API_KEY=OPENAI_API_KEY)

In [3]:
# Print the installed langchain package metadata (version, location, dependencies)
# so the environment this notebook ran in is recorded alongside the results.
!pip show langchain

Name: langchain
Version: 0.0.350
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: None
Author-email: None
License: MIT
Location: /usr/local/lib/python3.8/dist-packages
Requires: PyYAML, numpy, SQLAlchemy, requests, tenacity, langchain-community, dataclasses-json, langchain-core, jsonpatch, aiohttp, langsmith, pydantic, async-timeout
Required-by: 


# Hybrid Search

## BM25 Retriever - Sparse retriever

In [4]:
# Retrievers: BM25 (sparse) and the ensemble wrapper that combines retrievers.
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document

# Vector stores for the dense side of the hybrid search.
from langchain.vectorstores import Chroma, FAISS

# Embedding model class used to vectorise texts for the vector store.
from langchain.embeddings.openai import OpenAIEmbeddings

# One shared embedding instance, constructed with default settings.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment — confirm.
embedding = OpenAIEmbeddings()


In [5]:
# Toy corpus for comparing retrievers: five short sentences in which "apple"
# appears both as a fruit and as a company name.
doc_list = [
    "I like apples",                  # fruit, plural
    "I like oranges",
    "Apples and oranges are fruits",  # capitalised plural
    "I like computers by Apple",      # company name
    "I love fruit juice",
]

In [6]:
# Build the sparse (BM25) retriever over the toy corpus and cap it at the
# top 2 hits per query. The dense FAISS retriever is created in its own cell.
bm25_retriever = BM25Retriever.from_texts(doc_list, k=2)

In [7]:
# Single-keyword query against the sparse retriever (k=2, so two documents return).
# NOTE(review): in the recorded output neither "I like apples" nor "Apples and
# oranges are fruits" comes back, which suggests matching is on exact tokens —
# confirm against the retriever's tokenizer.
bm25_retriever.get_relevant_documents("Apple")

[Document(page_content='I like computers by Apple'),
 Document(page_content='I love fruit juice')]

In [8]:
# Multi-word query against the sparse retriever; compare with the dense
# retriever's result for "A green fruit" in the FAISS section.
# NOTE(review): the recorded output omits "Apples and oranges are fruits",
# presumably because only the exact token "fruit" matches — confirm.
bm25_retriever.get_relevant_documents("a green fruit")

[Document(page_content='I love fruit juice'),
 Document(page_content='I like computers by Apple')]

In [9]:
# Inspect the retriever's configuration (vectorizer, indexed docs, k).
# A bare expression displays the model repr directly; `.dict` without a call
# only showed the bound-method object wrapped around that same repr.
bm25_retriever

<bound method BaseModel.dict of BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7fc144046b20>, docs=[Document(page_content='I like apples'), Document(page_content='I like oranges'), Document(page_content='Apples and oranges are fruits'), Document(page_content='I like computers by Apple'), Document(page_content='I love fruit juice')], k=2)>

## Embeddings - Dense retriever (FAISS)

In [10]:
# Embed the toy corpus into a FAISS vector store using the shared embedding model.
faiss_vectorstore = FAISS.from_texts(texts=doc_list, embedding=embedding)

# Expose the store as a retriever limited to the top 2 matches, mirroring the
# BM25 retriever's k.
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs=dict(k=2))

In [11]:
# Query the dense (embedding-based) retriever. In the recorded output it returns
# the fruit-related sentences even though none contains the literal word "green".
faiss_retriever.get_relevant_documents("A green fruit")

[Document(page_content='Apples and oranges are fruits'),
 Document(page_content='I like apples')]

## Ensemble Retriever

In [12]:
# Hybrid search: wrap the sparse (BM25) and dense (FAISS) retrievers in a single
# ensemble, giving each the same weight of 0.5.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)

In [13]:
# Run the hybrid retriever on the query used for the individual retrievers.
# The recorded output holds four documents: the two BM25 hits together with the
# two FAISS hits shown in the earlier cells.
docs = ensemble_retriever.get_relevant_documents("A green fruit")
docs  # bare expression so the notebook displays the result list

[Document(page_content='Apples and oranges are fruits'),
 Document(page_content='I love fruit juice'),
 Document(page_content='I like computers by Apple'),
 Document(page_content='I like apples')]

In [14]:
# Second hybrid query, mixing a brand name with a product word.
# NOTE(review): `docs` is reassigned here, so the previous query's results are
# no longer available once this cell has run.
docs = ensemble_retriever.get_relevant_documents("Apple Phones")
docs  # bare expression so the notebook displays the result list

[Document(page_content='I like computers by Apple'),
 Document(page_content='I like apples'),
 Document(page_content='I love fruit juice')]