In [1]:
import os
from dotenv import load_dotenv
# Load variables from a .env file (python-dotenv) into the process environment.
load_dotenv()

# os.getenv returns None for an unset variable, and assigning None into
# os.environ raises TypeError (values must be str). Only re-export HF_TOKEN
# when it is actually present so a missing token does not crash this cell.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

# Key handed to the Pinecone client in a later cell; None if the variable is unset.
pinecone_api_key = os.getenv("PINECONE_API_KEY")

In [3]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import ServerlessSpec,Pinecone
# Name of the Pinecone index used by every later cell.
index_name = "hybrid-search-langchain"

# Pass the key through `api_key`, the client's constructor parameter, so the
# value read from the environment above is actually used instead of relying on
# the client's implicit PINECONE_API_KEY environment fallback.
pc = Pinecone(api_key=pinecone_api_key)

# Create the serverless index only when it does not exist yet, which keeps the
# cell safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Dense vector size; must match the embedding model's output size
        # (NOTE(review): confirm against the model configured below).
        dimension=384,
        # Pinecone documents dotproduct as the metric for sparse-dense (hybrid) queries.
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [4]:
# Handle to the index named above; a later cell passes it to the hybrid retriever.
index = pc.Index(index_name)
# Bare expression so the notebook displays the handle's repr.
index

<pinecone.data.index.Index at 0x105e5f9d0>

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Dense embedding model handed to the hybrid retriever in a later cell.
# NOTE(review): the index is created with dimension=384, so this model's output
# size has to match that value — confirm if the model name is changed.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [5]:
from pinecone_text.sparse import BM25Encoder

# `default` is a static factory on BM25Encoder, so call it on the class rather
# than constructing a throwaway encoder instance first. The returned encoder is
# re-fitted on the local corpus in a later cell.
bm25 = BM25Encoder.default()


In [9]:
# Small demo corpus: six short passages. The same list is used to fit the BM25
# encoder here and is added to the retriever in a later cell.
sentences = [
"The langchain_huggingface library simplifies natural language processing tasks. It integrates Hugging Face's transformers with Langchain's functionality. This combination enables efficient embedding, classification, and generation. Developers can leverage pre-trained models for various applications. The library supports multiple models and architectures. Its flexibility accelerates NLP development.",
"Hugging Face's transformers revolutionized NLP capabilities. These models learn contextual relationships within text. Pre-trained models like BERT and MiniLM facilitate downstream tasks. Fine-tuning enables adaptation to specific domains. Transformers' attention mechanism improves upon traditional recurrent neural networks. This architecture boosts performance in language understanding and generation.",
"NLP applications require high-quality embeddings. Sentence transformers provide dense vector representations. These embeddings capture semantic meaning and context. They enable semantic search, clustering, and text classification. Efficient computation and compact representations make them suitable for large-scale applications. Sentence transformers empower developers to build robust NLP systems.",
"The 'all-MiniLM-L6-v2' model excels at sentence embeddings. Its compact size ensures efficient computation. This model's performance rivals larger counterparts. Its suitability for various NLP tasks makes it a popular choice. Developers leverage this model for semantic search, text classification, and clustering. Its reliability enhances downstream applications.",
"Langchain_huggingface streamlines embedding generation. It abstracts underlying complexities, allowing developers to focus on application logic. This library supports batch processing and multiple input formats. Its flexibility accommodates diverse use cases. By integrating Hugging Face's transformers, langchain_huggingface accelerates NLP development. Efficient embedding generation empowers innovative applications.",
"Developers can explore alternative models within langchain_huggingface. Each model offers unique strengths and trade-offs. 'bert-base-nli-mean-tokens' and 'distilbert-base-nli-mean-tokens' provide additional options. These models cater to specific requirements. By selecting the optimal model, developers can fine-tune their NLP applications for enhanced performance and accuracy."
]

# Fit the BM25 encoder on the demo corpus (must happen before dumping).
bm25.fit(sentences)

# Persist the fitted parameters to disk; a later cell loads this same file.
bm25.dump("bm25_val.json")

100%|██████████| 6/6 [00:00<00:00, 164.26it/s]


In [11]:
# Fresh encoder populated from the parameters file written by the fit/dump cell;
# this is the sparse encoder the retriever uses.
bm25_encoder = BM25Encoder().load("bm25_val.json")

In [12]:
# Assemble the hybrid retriever from its three collaborators:
#   - the Pinecone index handle (storage / query backend),
#   - the dense embedding model,
#   - the BM25 encoder loaded from disk (sparse side).
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=embedding,
    sparse_encoder=bm25_encoder,
)

In [13]:
# Add the demo corpus to the retriever.
# NOTE(review): presumably this encodes each sentence with both encoders and
# upserts the vectors into the Pinecone index — confirm against the retriever docs.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


In [14]:
# Run a sample query; the displayed result is a list of Documents, each carrying
# a 'score' entry in its metadata alongside the matched passage text.
retriever.invoke("primary function of the langchain_huggingface library")

[Document(metadata={'score': 0.5075863}, page_content="The langchain_huggingface library simplifies natural language processing tasks. It integrates Hugging Face's transformers with Langchain's functionality. This combination enables efficient embedding, classification, and generation. Developers can leverage pre-trained models for various applications. The library supports multiple models and architectures. Its flexibility accelerates NLP development."),
 Document(metadata={'score': 0.372083366}, page_content="Langchain_huggingface streamlines embedding generation. It abstracts underlying complexities, allowing developers to focus on application logic. This library supports batch processing and multiple input formats. Its flexibility accommodates diverse use cases. By integrating Hugging Face's transformers, langchain_huggingface accelerates NLP development. Efficient embedding generation empowers innovative applications."),
 Document(metadata={'score': 0.305434108}, page_content="Dev