In [1]:
import numpy as np
from typing import List, Dict, Tuple, Optional
import faiss
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

class HybridSearchRetriever:
    """
    Retriever over a single in-memory document list that offers:
    1. Dense Vector Search (semantic similarity via sentence embeddings + FAISS)
    2. Sparse Keyword Search (BM25)
    3. Hybrid Search (two ways of combining both signals)

    Call ``index_documents`` once before any search method. Every search
    method returns ``(document_index, score, document_text)`` tuples sorted
    from best to worst, where ``score`` is always a plain Python ``float``.
    """

    # Shared error text for every "search before index" situation.
    _NOT_INDEXED_MSG = "No documents indexed; call index_documents() first."

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the retriever with embedding model.

        Args:
            model_name: Sentence transformer model for embeddings
        """
        self.embedding_model = SentenceTransformer(model_name)
        self.documents = []
        self.document_embeddings = None
        self.bm25 = None
        self.index = None
        self.tokenized_corpus = None

    def preprocess_text(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it on whitespace (BM25 tokenization)."""
        return text.lower().split()

    @staticmethod
    def _min_max_normalize(values) -> np.ndarray:
        """
        Rescale ``values`` linearly to the [0, 1] range.

        An empty input yields an empty array. When all values are equal there
        is no spread to rescale, so every entry becomes the neutral 0.5.
        """
        arr = np.asarray(values, dtype=float)
        if arr.size == 0:
            return arr
        low, high = arr.min(), arr.max()
        if high == low:
            return np.full(arr.shape, 0.5)
        return (arr - low) / (high - low)

    def index_documents(self, documents: List[str]):
        """
        Index documents for both vector and keyword search.

        Args:
            documents: List of text documents to index

        Raises:
            TypeError: If ``documents`` is a single string instead of a
                collection of strings.
            ValueError: If ``documents`` is empty.
        """
        if isinstance(documents, str):
            raise TypeError("documents must be a list of strings, not a single string")
        # Take a private copy so later mutation of the caller's list cannot
        # desynchronize the stored texts from the indexes built below.
        documents = list(documents)
        if not documents:
            # An empty corpus has no embedding dimension to build an index
            # from, so reject it up front with a clear message.
            raise ValueError("documents must contain at least one document")

        # 1. Create embeddings for dense vector search
        print("Creating embeddings for vector search...")
        embeddings = self.embedding_model.encode(
            documents,
            convert_to_numpy=True,
            show_progress_bar=False
        )
        # FAISS works on contiguous float32 matrices.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        # Create FAISS index for efficient similarity search
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)

        # 2. Prepare BM25 for sparse keyword search
        print("Preparing BM25 for keyword search...")
        tokenized_corpus = [self.preprocess_text(doc) for doc in documents]
        bm25 = BM25Okapi(tokenized_corpus)

        # Commit state only once every step has succeeded, so a failure
        # half-way through never leaves the retriever partially indexed.
        self.documents = documents
        self.document_embeddings = embeddings
        self.index = index
        self.tokenized_corpus = tokenized_corpus
        self.bm25 = bm25

    def dense_vector_search(self,
                           query: str,
                           k: int = 5) -> List[Tuple[int, float, str]]:
        """
        Perform semantic search using vector embeddings.

        Args:
            query: Search query
            k: Number of results to return (values <= 0 yield an empty list;
               values larger than the corpus are capped)

        Returns:
            List of (index, score, document) tuples, where
            score = 1 / (1 + d) and d is the distance reported by the FAISS
            index (for ``IndexFlatL2`` that is the squared L2 distance).

        Raises:
            RuntimeError: If no documents have been indexed yet.
        """
        if self.index is None:
            raise RuntimeError(self._NOT_INDEXED_MSG)

        k = min(int(k), len(self.documents))
        if k <= 0:
            return []

        query_embedding = self.embedding_model.encode(
            [query],
            convert_to_numpy=True
        )
        query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)

        distances, indices = self.index.search(query_embedding, k)

        # FAISS pads missing neighbours with index -1; skip those.
        return [
            (int(idx), float(1.0 / (1.0 + distance)), self.documents[int(idx)])
            for idx, distance in zip(indices[0], distances[0])
            if idx != -1
        ]

    def sparse_keyword_search(self,
                             query: str,
                             k: int = 5) -> List[Tuple[int, float, str]]:
        """
        Perform keyword-based search using BM25.

        Args:
            query: Search query
            k: Number of results to return (values <= 0 yield an empty list)

        Returns:
            List of (index, score, document) tuples. Only documents with a
            strictly positive BM25 score are included, so fewer than ``k``
            results may come back.

        Raises:
            RuntimeError: If no documents have been indexed yet.
        """
        if self.bm25 is None:
            raise RuntimeError(self._NOT_INDEXED_MSG)
        if k <= 0:
            return []

        scores = np.asarray(
            self.bm25.get_scores(self.preprocess_text(query)), dtype=float
        )

        # Stable sort on the negated scores: best first, ties broken by the
        # lower document index so the ordering is deterministic.
        ranked = np.argsort(-scores, kind="stable")[:k]

        return [
            (int(idx), float(scores[idx]), self.documents[int(idx)])
            for idx in ranked
            if scores[idx] > 0
        ]

    def hybrid_search_early_fusion(self,
                                  query: str,
                                  k: int = 5,
                                  alpha: float = 0.5) -> List[Tuple[int, float, str]]:
        """
        Hybrid search that fuses normalized scores before the final ranking.

        Both retrievers are asked for ``2 * k`` candidates. Each score list is
        min-max normalized to [0, 1] and the two are blended as
        ``alpha * vector + (1 - alpha) * bm25``. A document missing from one
        candidate list contributes 0 for that retriever.

        Args:
            query: Search query
            k: Number of results to return (values <= 0 yield an empty list)
            alpha: Weight for vector search (1-alpha for BM25)
                   alpha=1: pure vector search, alpha=0: pure BM25

        Returns:
            List of (index, score, document) tuples

        Raises:
            ValueError: If ``alpha`` is outside [0, 1].
            RuntimeError: If no documents have been indexed yet.
        """
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha!r}")
        if k <= 0:
            return []

        vector_results = self.dense_vector_search(query, k * 2)
        bm25_results = self.sparse_keyword_search(query, k * 2)

        def normalized_by_index(results):
            """Map document index -> min-max normalized score."""
            doc_ids = [idx for idx, _, _ in results]
            scaled = self._min_max_normalize([score for _, score, _ in results])
            return dict(zip(doc_ids, scaled.tolist()))

        norm_vector = normalized_by_index(vector_results)
        norm_bm25 = normalized_by_index(bm25_results)

        combined_scores = {
            idx: alpha * norm_vector.get(idx, 0.0)
                 + (1 - alpha) * norm_bm25.get(idx, 0.0)
            for idx in norm_vector.keys() | norm_bm25.keys()
        }

        # Best score first; equal scores fall back to the document index so
        # the result does not depend on set iteration order.
        ranked = sorted(combined_scores.items(),
                        key=lambda item: (-item[1], item[0]))[:k]

        return [(idx, float(score), self.documents[idx]) for idx, score in ranked]

    def hybrid_search_reranking(self,
                               query: str,
                               k: int = 5,
                               first_stage_k: int = 20) -> List[Tuple[int, float, str]]:
        """
        Hybrid search using re-ranking approach (BM25 first, then re-rank with vectors).

        Stage 1 takes up to ``first_stage_k`` BM25 hits. Stage 2 computes the
        cosine similarity between the query embedding and each candidate's
        stored embedding, min-max normalizes both score lists, and averages
        them with equal weight.

        Args:
            query: Search query
            k: Number of results to return (values <= 0 yield an empty list)
            first_stage_k: Number of initial BM25 results to re-rank

        Returns:
            List of (index, score, document) tuples; empty when BM25 finds
            no matching document.

        Raises:
            RuntimeError: If no documents have been indexed yet.
        """
        # Stage 1: Get initial results from BM25
        bm25_results = self.sparse_keyword_search(query, first_stage_k)
        if not bm25_results or k <= 0:
            return []
        if self.document_embeddings is None:
            raise RuntimeError(self._NOT_INDEXED_MSG)

        # Stage 2: Re-rank using vector similarity
        candidate_ids = [idx for idx, _, _ in bm25_results]

        query_vector = np.asarray(
            self.embedding_model.encode([query], convert_to_numpy=True),
            dtype=float,
        ).reshape(-1)
        candidate_vectors = np.asarray(
            self.document_embeddings[candidate_ids], dtype=float
        )

        # Cosine similarity = dot product divided by the product of norms.
        # Norms are floored at a tiny value so an all-zero embedding yields
        # similarity 0 instead of a division by zero.
        tiny = 1e-12
        query_len = max(float(np.linalg.norm(query_vector)), tiny)
        candidate_lens = np.maximum(np.linalg.norm(candidate_vectors, axis=1), tiny)
        similarities = (candidate_vectors @ query_vector) / (candidate_lens * query_len)

        # Normalize both score lists with the same helper used by early fusion
        bm25_norm = self._min_max_normalize([score for _, score, _ in bm25_results])
        similarities_norm = self._min_max_normalize(similarities)

        # Combine with equal weight (can be adjusted)
        combined_scores = 0.5 * bm25_norm + 0.5 * similarities_norm

        # Best first; ties keep the BM25 order of the candidates.
        top_positions = np.argsort(-combined_scores, kind="stable")[:k]

        return [
            (
                candidate_ids[pos],
                float(combined_scores[pos]),
                self.documents[candidate_ids[pos]],
            )
            for pos in top_positions
        ]

    def display_results(self,
                       results: List[Tuple[int, float, str]],
                       title: str):
        """Print ``results`` under a ``title`` banner, truncating long documents to 100 characters."""
        print(f"\n{'='*60}")
        print(f"{title}")
        print(f"{'='*60}")
        for i, (idx, score, doc) in enumerate(results, 1):
            print(f"\nResult #{i} (Score: {score:.4f}, Index: {idx})")
            print(f"Document: {doc[:100]}..." if len(doc) > 100 else f"Document: {doc}")
        print()

# Example Usage
def main():
    """Index a small mixed-topic corpus and print the results of all four
    retrieval strategies for a handful of example queries."""
    corpus = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Artificial intelligence transforms modern healthcare.",
        "Dogs are loyal pets that enjoy playing fetch.",
        "Natural language processing enables computers to understand human language.",
        "Cats are independent animals that sleep most of the day.",
        "Deep learning uses neural networks with multiple layers.",
        "Python is a popular programming language for data science.",
        "Foxes are wild animals found in forests and urban areas.",
        "Healthcare AI applications improve patient diagnosis accuracy.",
        "Pet ownership has positive effects on mental health.",
        "Transformers architecture revolutionized natural language processing.",
        "Data scientists use Python for machine learning projects.",
        "Wildlife conservation efforts protect endangered species.",
        "Neural networks mimic the human brain's structure."
    ]

    searcher = HybridSearchRetriever()
    searcher.index_documents(corpus)

    # Each strategy is a (banner title, search callable) pair; they run in
    # this order for every query.
    strategies = (
        ("DENSE VECTOR SEARCH RESULTS",
         lambda q: searcher.dense_vector_search(q, k=3)),
        ("SPARSE KEYWORD SEARCH (BM25) RESULTS",
         lambda q: searcher.sparse_keyword_search(q, k=3)),
        ("HYBRID SEARCH - EARLY FUSION (α=0.5)",
         lambda q: searcher.hybrid_search_early_fusion(q, k=3, alpha=0.5)),
        ("HYBRID SEARCH - RE-RANKING",
         lambda q: searcher.hybrid_search_reranking(q, k=3)),
    )

    banner = '#' * 60
    for text in (
        "artificial intelligence in medicine",
        "pets and animals",
        "machine learning with Python",
    ):
        print(f"\n{banner}")
        print(f"QUERY: '{text}'")
        print(f"{banner}")

        for heading, run_search in strategies:
            hits = run_search(text)
            searcher.display_results(hits, heading)

def compare_techniques():
    """
    Print a per-document table of dense-vector and BM25 scores for one query.

    A small healthcare-themed corpus is indexed, both retrievers are asked for
    as many results as there are documents, and every document that either
    retriever returned is listed in index order with both scores. A document
    a retriever did not return (BM25 omits non-positive scores) shows 0.0000
    in that retriever's column.
    """
    # More focused documents
    tech_docs = [
        "Artificial intelligence helps doctors diagnose diseases faster.",
        "Machine learning algorithms analyze medical images for tumors.",
        "AI in healthcare improves patient outcomes and reduces costs.",
        "Doctors use AI tools for early cancer detection.",
        "Healthcare technology includes telemedicine and AI diagnostics.",
        "Medical AI must comply with strict privacy regulations.",
        "Deep learning models can predict patient readmission rates.",
        "Natural language processing extracts information from medical records.",
        "Robotic surgery assisted by AI increases precision.",
        "AI-powered chatbots provide basic medical advice to patients."
    ]

    retriever = HybridSearchRetriever()
    retriever.index_documents(tech_docs)

    query = "AI doctor diagnosis medical"

    print("\n" + "="*80)
    print("COMPARISON OF RETRIEVAL TECHNIQUES")
    print("="*80)
    print(f"Query: '{query}'")
    print(f"Document count: {len(tech_docs)}")

    # Get all results
    vector_all = retriever.dense_vector_search(query, k=len(tech_docs))
    bm25_all = retriever.sparse_keyword_search(query, k=len(tech_docs))

    # The table lists every returned document by its index, so the heading
    # and first column are labelled accordingly (not "Top 5" / "Rank").
    print("\nPer-Document Score Comparison:")
    print("\n{:<5} {:<40} {:<10} {:<10}".format(
        "Index", "Document Snippet", "Vector", "BM25"))
    print("-"*80)

    # Index -> score lookups; a missing entry means that retriever did not
    # return the document.
    vector_scores = {idx: score for idx, score, _ in vector_all}
    bm25_scores = {idx: score for idx, score, _ in bm25_all}

    snippet_width = 40
    # Sorted so the rows appear in a deterministic, readable order.
    for idx in sorted(vector_scores.keys() | bm25_scores.keys()):
        doc = tech_docs[idx]
        # Only truncate (and mark with an ellipsis) when the text is too
        # long, and keep the result within the column width so the score
        # columns stay aligned.
        if len(doc) > snippet_width:
            doc_snippet = doc[:snippet_width - 3] + "..."
        else:
            doc_snippet = doc

        print("{:<5} {:<40} {:<10.4f} {:<10.4f}".format(
            idx,
            doc_snippet,
            vector_scores.get(idx, 0),
            bm25_scores.get(idx, 0)
        ))

if __name__ == "__main__":
    # Script entry point: the multi-query walkthrough runs first, followed by
    # the side-by-side score comparison.
    for demo in (main, compare_techniques):
        demo()

  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 114.73it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Creating embeddings for vector search...
Preparing BM25 for keyword search...

############################################################
QUERY: 'artificial intelligence in medicine'
############################################################

DENSE VECTOR SEARCH RESULTS

Result #1 (Score: 0.6154, Index: 2)
Document: Artificial intelligence transforms modern healthcare.

Result #2 (Score: 0.5323, Index: 9)
Document: Healthcare AI applications improve patient diagnosis accuracy.

Result #3 (Score: 0.4651, Index: 1)
Document: Machine learning is a subset of artificial intelligence.


SPARSE KEYWORD SEARCH (BM25) RESULTS

Result #1 (Score: 4.7309, Index: 2)
Document: Artificial intelligence transforms modern healthcare.

Result #2 (Score: 2.0219, Index: 8)
Document: Foxes are wild animals found in forests and urban areas.

Result #3 (Score: 1.6736, Index: 1)
Document: Machine learning is a subset of artificial intelligence.


HYBRID SEARCH - EARLY FUSION (α=0.5)

Result #1 (Score: 1.00

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 105.29it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Creating embeddings for vector search...
Preparing BM25 for keyword search...

COMPARISON OF RETRIEVAL TECHNIQUES
Query: 'AI doctor diagnosis medical'
Document count: 10

Top 5 Results Comparison:

Rank  Document Snippet                         Vector     BM25      
--------------------------------------------------------------------------------
0     Artificial intelligence helps doctors di... 0.6569     0.0000    
1     Machine learning algorithms analyze medi... 0.4216     0.3635    
2     AI in healthcare improves patient outcom... 0.5735     0.0000    
3     Doctors use AI tools for early cancer de... 0.5594     0.0000    
4     Healthcare technology includes telemedic... 0.5548     0.0000    
5     Medical AI must comply with strict priva... 0.5145     0.3635    
6     Deep learning models can predict patient... 0.4184     0.0000    
7     Natural language processing extracts inf... 0.4624     0.3635    
8     Robotic surgery assisted by AI increases... 0.4912     0.0000    
9   