## Setup and Import Libraries

In [13]:
import os
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from pinecone_text.sparse import BM25Encoder
from dotenv import load_dotenv

In [9]:
# Load variables from a local .env file into the process environment.
# The call is left as the cell's last expression so its boolean result is shown.
load_dotenv()

True

In [10]:
# os.getenv() reads from os.environ, so writing its result straight back is a
# no-op when the key exists and raises an opaque
# "TypeError: str expected, not NoneType" when it does not. Check explicitly
# instead and report exactly which keys are absent.
required_env_vars = ("PINECONE_API_KEY", "HUGGINGFACE_API_KEY")
missing_env_vars = [name for name in required_env_vars if name not in os.environ]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in your .env file or shell before running this notebook."
    )

## Creating Index

In [6]:
# Name of the Pinecone index used throughout this notebook.
index_name = "hybird-search-langchain"

# Client handle; constructed without explicit arguments.
pc = Pinecone()

# Create the index only on the first run; later runs reuse the existing one.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # dimensionality of the dense embedding model
        metric="dotproduct",  # sparse values are supported only with dotproduct
        spec=serverless_spec,
    )

In [7]:
# Obtain a handle to the index by name, then echo it as the cell output.
index = pc.Index(name=index_name)

index

<pinecone.data.index.Index at 0x24be9d71610>

## HuggingFace Embeddings

In [12]:
# Dense encoder for the hybrid search, selected by model name.
dense_model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=dense_model_name)

## Sparse Encoder (BM25)

In [14]:
# Start from the library's default BM25 encoder. `default` is called on the
# class directly, so no throwaway instance is built first.
bm25_encoder = BM25Encoder.default()

## Creating Sentences

In [15]:
# (winner, year) pairs, newest first; each pair is rendered into one sentence.
world_cup_winners = [("Argentina", 2022), ("France", 2018), ("Germany", 2014)]
sentences = [
    f"{team} won the {year} Fifa World Cup"
    for team, year in world_cup_winners
]

In [16]:
# Fit the BM25 encoder on the example sentences. The recorded output shows the
# call evaluates to a BM25Encoder, which is what the cell echoes.
bm25_encoder.fit(sentences)

100%|██████████| 3/3 [00:00<00:00, 82.14it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x24be9d5c8d0>

### Store Values to JSON File

In [18]:
# Write the encoder's values to a JSON file (path is relative to the working
# directory) so they can be loaded again later.
bm25_encoder.dump('bm25_values.json')

### Loading BM25 Encoder

In [19]:
# Build a fresh encoder and populate it from the JSON file; the value returned
# by load() is what the rest of the notebook uses.
blank_encoder = BM25Encoder()
bm25_encoder = blank_encoder.load("bm25_values.json")

## Creating Retriever

In [20]:
# Wire the dense embeddings, the BM25 sparse encoder and the Pinecone index
# handle into a single hybrid-search retriever.
retriever_kwargs = {
    "embeddings": embeddings,
    "sparse_encoder": bm25_encoder,
    "index": index,
}
retriever = PineconeHybridSearchRetriever(**retriever_kwargs)

# Echo the retriever so its configuration is shown as the cell output.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x0000024BC8387C10>, index=<pinecone.data.index.Index object at 0x0000024BE9D71610>)

## Adding Text

In [21]:
# Texts to index: one sentence per (year, winner) pair, newest first.
sentence_template = "{winner} won the {year} Fifa World Cup"
sentences = [
    sentence_template.format(winner=winner, year=year)
    for year, winner in ((2022, "Argentina"), (2018, "France"), (2014, "Germany"))
]

In [22]:
# Add the sentences through the retriever, which was configured with the dense
# embeddings, the BM25 encoder and the Pinecone index handle.
retriever.add_texts(texts=sentences)

100%|██████████| 1/1 [00:01<00:00,  1.74s/it]


## Asking Questions

In [23]:
# Query containing an explicit year that also appears in one indexed sentence.
question_2014 = "Who won the 2014 Fifa World Cup"
retriever.invoke(question_2014)

[Document(metadata={'score': 0.655304551}, page_content='Germany won the 2014 Fifa World Cup'),
 Document(metadata={'score': 0.279299527}, page_content='Argentina won the 2022 Fifa World Cup'),
 Document(metadata={'score': 0.271083981}, page_content='France won the 2018 Fifa World Cup')]

In [24]:
# Query with no year in it. In the recorded output the 2014 sentence ranks
# first, ahead of 2018 and 2022, so the ranking does not reflect which
# tournament is actually the most recent.
question_most_recent = "Who won the most recent Fifa World Cup"
retriever.invoke(question_most_recent)

[Document(metadata={'score': 0.35877642}, page_content='Germany won the 2014 Fifa World Cup'),
 Document(metadata={'score': 0.32757172}, page_content='France won the 2018 Fifa World Cup'),
 Document(metadata={'score': 0.305027872}, page_content='Argentina won the 2022 Fifa World Cup')]