In [6]:
from langchain_community.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker

# Sentence-transformers MiniLM model wrapped as a LangChain embedding object
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [20]:
## Load every .txt file in the data directory as a UTF-8 document ##
DATA_DIR = 'data'    # directory scanned by the loader (relative to the working directory)
TXT_GLOB = '*.txt'   # filename pattern of the files to load

dir_loader = DirectoryLoader(
    path=DATA_DIR,
    glob=TXT_GLOB,
    loader_cls=TextLoader,
    loader_kwargs={'encoding':'utf-8'}
)
docs = dir_loader.load()
print(f"Number of docs loaded:{len(docs)}")

# Without this guard an empty result surfaces as a bare IndexError on docs[0] below,
# which hides the real cause (wrong working directory or no matching files).
if not docs:
    raise FileNotFoundError(
        f"No files matching {TXT_GLOB!r} were loaded from {DATA_DIR!r}; "
        "check the notebook's working directory and the data folder."
    )

# Preview the first 100 characters of the first document as a sanity check
print(f"Doc Page content: {docs[0].page_content[0:100]}")

## Split the loaded documents with the recursive character splitter ##
# Separators are given in priority order: newline first, then a single space.
chunker = RecursiveCharacterTextSplitter(
    separators=["\n", " "],
    chunk_overlap=20,
    chunk_size=100,
)
doc_chunks = chunker.split_documents(docs)

# Report how many chunks were produced and preview the first one
print(f"Number of chunks loaded: {len(doc_chunks)}")
print(f"Chunks Page content: {doc_chunks[0].page_content[0:100]}")


Number of docs loaded:2
Doc Page content: LangChain is a framework for building applications with LLMs.
Langchain provides modular abstraction
Number of chunks loaded: 25
Chunks Page content: LangChain is a framework for building applications with LLMs.


In [25]:
## Hybrid retrieval: dense (FAISS) + sparse (BM25), combined by a weighted ensemble ##
TOP_K = 3                                  # hits requested from each underlying retriever
DENSE_WEIGHT, SPARSE_WEIGHT = 0.80, 0.20   # relative weight of each retriever in the ensemble

## create the FAISS vector store ##
vector_store = FAISS.from_documents(
    documents=doc_chunks,
    embedding=embedding_model
)
dense_retriever = vector_store.as_retriever(search_kwargs={'k': TOP_K})

### create the BM25 retriever ###
# k is passed at construction (forwarded to the retriever's `k` field) rather than
# assigned afterwards, so both retrievers take their top-k from the same constant.
# NOTE: the variable names keep their original spelling because later cells reference them.
sparse_retreiver = BM25Retriever.from_documents(doc_chunks, k=TOP_K)

## Ensemble retriever (weighted) ##
# Weights are positional: the first applies to the dense retriever, the second to BM25.
combined_retreiver = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retreiver],
    weights=[DENSE_WEIGHT, SPARSE_WEIGHT]
)

In [23]:
# Sample question sent through the combined (FAISS + BM25) retriever
query = "Is Mumbai a good city to live in?"
results = combined_retreiver.invoke(query)

# Bare last expression so the notebook displays the retrieved documents
results

[Document(id='7ab02adc-6f09-4405-a3ce-aab84226c7a7', metadata={'source': 'data\\transformer_india_mumbai.txt'}, page_content='Despite challenges like overcrowding and high living costs, Mumbai remains a land of opportunities'),
 Document(id='a34994cf-64e8-4bf8-9768-c69826a901b0', metadata={'source': 'data\\transformer_india_mumbai.txt'}, page_content='Mumbai, India’s financial capital, is a city that never sleeps.'),
 Document(id='ccbc3760-f261-4675-9b83-51ffcee0fc02', metadata={'source': 'data\\transformer_india_mumbai.txt'}, page_content='Its diverse culture, fast-paced lifestyle, and entrepreneurial spirit make Mumbai the beating heart'),
 Document(metadata={'source': 'data\\transformer_india_mumbai.txt'}, page_content='to lead in the digital era.')]