In [3]:
import os
import re
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone_text.sparse import BM25Encoder
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec

# Preprocess text: normalize whitespace, remove special characters, etc.
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.strip()

# Set up Pinecone
api_key = "pcsk_...."  # Replace with your Pinecone API key
index_name = "hybrid-search-langchain-pinecone"
pc = Pinecone(api_key=api_key)

# Recreate index if it exists to ensure a clean state
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
    name=index_name,
    dimension=384,
    metric='dotproduct',
    spec=ServerlessSpec(cloud='aws', region='us-east-1')
)
index = pc.Index(index_name)

# Set up HuggingFace embeddings
os.environ['HF_TOKEN'] = "hf_....."  # Replace with your Hugging Face token
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Set up BM25 sparse encoder
bm25_encoder = BM25Encoder().default()

# Load text files from a folder
folder_path = "New folder/"  # Replace with the path to your folder
texts = []

for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if os.path.isfile(file_path) and file_name.endswith(".txt"):
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()
            processed_text = preprocess_text(raw_text)
            if processed_text:  # Ensure non-empty text
                texts.append(processed_text)

# Fit the BM25 encoder on the processed texts
bm25_encoder.fit(texts)
bm25_encoder.dump('bm25_values.json')  # Save for reproducibility
bm25_encoder = BM25Encoder().load("bm25_values.json")

# Validate sparse encoding and filter valid texts
valid_texts = []
for text in texts:
    sparse_vector = bm25_encoder.encode_documents(text)
    if sparse_vector["indices"]:  # Check for valid sparse representation
        valid_texts.append(text)
    else:
        print(f"Skipping invalid text: {text}")

# Create the PineconeHybridSearchRetriever
retriever = PineconeHybridSearchRetriever(embeddings=embeddings, sparse_encoder=bm25_encoder, index=index)

# Add valid texts to the retriever
retriever.add_texts(valid_texts)

# Query the retriever
query = "What genre was her novel?"
result = retriever.invoke(query)
print(result)


100% [........................................................................] 65406227 / 65406227

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[]


In [7]:
query = "Sports"
result = retriever.invoke(query)
print(result)

[Document(metadata={'score': 0.601547}, page_content='Sports 1 Football is the most popular sport in the world 2 The Olympics showcase talent from various countries 3 Cricket is widely followed in countries like India Australia and England 4 Serena Williams is considered one of the greatest tennis players of all time'), Document(metadata={'score': 0.021081198}, page_content='Novels 1 Pride and Prejudice by Jane Austen is a classic romantic novel 2 George Orwells 1984 explores the dangers of totalitarianism 3 JK Rowlings Harry Potter series is beloved by readers of all ages 4 The Great Gatsby by F Scott Fitzgerald captures the essence of the Jazz Age')]


In [9]:
query = "What books are mentioned"
result = retriever.invoke(query)
print(result)

[Document(metadata={'score': 0.259504795}, page_content='Novels 1 Pride and Prejudice by Jane Austen is a classic romantic novel 2 George Orwells 1984 explores the dangers of totalitarianism 3 JK Rowlings Harry Potter series is beloved by readers of all ages 4 The Great Gatsby by F Scott Fitzgerald captures the essence of the Jazz Age'), Document(metadata={'score': 0.0529573895}, page_content='Sports 1 Football is the most popular sport in the world 2 The Olympics showcase talent from various countries 3 Cricket is widely followed in countries like India Australia and England 4 Serena Williams is considered one of the greatest tennis players of all time')]
