# Compare Vector Search vs Hybrid Search

This notebook demonstrates the differences between pure vector (dense) search and hybrid search that combines dense and sparse vectors.

## Setup

In [None]:
import os
import sys
import json
import requests
import numpy as np
from dotenv import load_dotenv
from typing import List, Dict, Tuple
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import time

# Add parent directory to path. This has to run before the import below:
# `pgvector_rag` is presumably a module in the parent directory (TODO confirm),
# so the import only resolves once '..' is on sys.path.
sys.path.append('..')
from pgvector_rag import PGVectorRAG

# Load environment variables from the .env file one directory up; later cells
# read them through os.getenv.
load_dotenv('../.env')

# Set up plotting: seaborn "whitegrid" style and a default 10x6 figure size
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Initialize connections
# Database settings are read from the environment. The fallback values are
# demo defaults only; set the DB_* variables (DB_PASSWORD in particular) for
# anything that is not a throwaway environment.
conn_params = {
    "host": os.getenv('DB_HOST', 'postgres-pgvector.pgvector.svc.cluster.local'),
    "port": int(os.getenv('DB_PORT', '5432')),
    "database": os.getenv('DB_NAME', 'vectordb'),
    "user": os.getenv('DB_USER', 'vectoruser'),
    "password": os.getenv('DB_PASSWORD', 'vectorpass')
}

# API configurations
# Normalise the embedding base URL so it ends in exactly one "/v1". Trailing
# slashes are stripped first; otherwise "http://host/v1/" would fail the
# endswith() check and become "http://host/v1//v1".
NOMIC_URL = os.getenv('NOMIC_EMBED_URL')
if NOMIC_URL:
    NOMIC_URL = NOMIC_URL.rstrip('/')
    if not NOMIC_URL.endswith('/v1'):
        NOMIC_URL = f"{NOMIC_URL}/v1"
NOMIC_API_KEY = os.getenv('NOMIC_EMBED_API_KEY')
NOMIC_MODEL = os.getenv('NOMIC_EMBED_MODEL_NAME')

# Surface missing embedding settings now instead of as an obscure request
# error in a later cell. This only warns, so the database part stays usable.
_missing_embed_settings = [
    name
    for name in ('NOMIC_EMBED_URL', 'NOMIC_EMBED_API_KEY', 'NOMIC_EMBED_MODEL_NAME')
    if not os.getenv(name)
]
if _missing_embed_settings:
    print(f"WARNING: embedding settings not set: {', '.join(_missing_embed_settings)}")

# Initialize RAG client
rag = PGVectorRAG(conn_params)
PROJECT_ID = os.getenv('PROJECT_ID', 'demo_project')

print("Setup complete")

In [None]:
# Helper functions
def get_embedding(text: str, timeout: float = 30.0) -> np.ndarray:
    """Get the dense embedding for ``text`` from the embeddings endpoint.

    Sends a POST to ``{NOMIC_URL}/embeddings`` with the configured model name
    and bearer token, and returns the first embedding in the response.

    Args:
        text: The text to embed.
        timeout: Seconds to wait for the HTTP request. Without a timeout,
            ``requests`` waits indefinitely and a stalled endpoint would hang
            the notebook kernel.

    Returns:
        The embedding found at ``data[0].embedding`` in the JSON response,
        as a NumPy array.

    Raises:
        Exception: If the endpoint answers with a status code other than 200.
        requests.exceptions.RequestException: On connection problems or when
            ``timeout`` is exceeded.
    """
    response = requests.post(
        f"{NOMIC_URL}/embeddings",
        headers={
            'Authorization': f"Bearer {NOMIC_API_KEY}",
            'Content-Type': 'application/json'
        },
        json={
            'model': NOMIC_MODEL,
            'input': text
        },
        timeout=timeout
    )

    if response.status_code != 200:
        raise Exception(f"Error getting embedding: {response.status_code}")

    data = response.json()
    return np.array(data['data'][0]['embedding'])

def get_sparse_embedding(text: str, vocab_size: int = 30000) -> Dict[int, float]:
    """
    Generate sparse embedding (mock implementation).
    In production, use SPLADE, BM25, or similar.

    The text is lower-cased and split on whitespace. Each distinct word is
    mapped to a bucket in ``[0, vocab_size)`` and weighted by its relative
    frequency in the text.

    Token IDs come from CRC32 rather than the built-in ``hash()``: string
    hashes are salted per interpreter process, so IDs produced with ``hash()``
    in one kernel session would not match vectors stored by another session.

    Args:
        text: Input text.
        vocab_size: Number of hash buckets (token-ID space).

    Returns:
        Mapping of token ID to weight; empty for empty or whitespace-only text.
        Words that collide in the same bucket have their weights summed.
    """
    import zlib  # local import keeps this helper self-contained

    words = text.lower().split()
    if not words:
        return {}

    word_counts = Counter(words)

    # Convert to sparse vector format
    sparse_vec: Dict[int, float] = {}
    for word, count in word_counts.items():
        # Stable hash to get a reproducible "token ID"
        token_id = zlib.crc32(word.encode('utf-8')) % vocab_size
        # TF-like weight (simplified): share of the text made up by this word
        weight = count * (1.0 / len(words))
        # Accumulate so a bucket collision does not silently drop a word
        sparse_vec[token_id] = sparse_vec.get(token_id, 0.0) + weight

    return sparse_vec

## Prepare Test Data

In [None]:
# Sample corpus for the comparison, written as (text, topic) pairs and expanded
# into the {"text": ..., "topic": ...} records that the insertion cell reads.
_test_document_rows = (
    (
        "PGVector is a PostgreSQL extension for vector similarity search. It supports multiple distance metrics including L2, inner product, and cosine distance.",
        "database",
    ),
    (
        "Vector databases enable semantic search by storing and querying high-dimensional embeddings generated by machine learning models.",
        "ml",
    ),
    (
        "The cosine similarity metric measures the angle between two vectors, making it useful for comparing document embeddings regardless of their magnitude.",
        "math",
    ),
    (
        "PostgreSQL provides robust support for JSON data types and full-text search capabilities, making it versatile for modern applications.",
        "database",
    ),
    (
        "Hybrid search combines dense vector search with sparse retrieval methods like BM25 to improve search relevance and recall.",
        "search",
    ),
)

test_documents = [
    {"text": row_text, "topic": row_topic}
    for row_text, row_topic in _test_document_rows
]

print(f"Preparing {len(test_documents)} test documents...")

In [None]:
# Add the test documents to the database, one chunk per document.
# NOTE(review): a fresh document id is generated on every run and this cell
# performs no "already present" check itself. Whether re-running it stores
# duplicate chunks depends on add_document_chunk -- TODO confirm.
import uuid

test_doc_id = str(uuid.uuid4())

for chunk_position, document in enumerate(test_documents):
    body = document['text']

    # Dense vector from the embedding service, sparse vector from the local helper
    dense_vector = get_embedding(body)
    sparse_vector = get_sparse_embedding(body)

    chunk_id = rag.add_document_chunk(
        project_id=PROJECT_ID,
        document_id=test_doc_id,
        document_name="comparison_test_doc",
        chunk_text=body,
        chunk_index=chunk_position,
        dense_embedding=dense_vector,
        sparse_embedding=sparse_vector,
        topic=document['topic'],
        metadata={"test_type": "comparison"},
    )

print("Test documents added successfully")

## Compare Search Methods

In [None]:
# Test queries with different characteristics, written as
# (query, type, description) rows and expanded into dicts with those keys.
_test_query_fields = ("query", "type", "description")

test_queries = [
    dict(zip(_test_query_fields, row))
    for row in (
        (
            "What is PGVector and PostgreSQL?",
            "exact_match",
            "Query with exact keyword matches",
        ),
        (
            "How do I store embeddings in a database for ML applications?",
            "semantic",
            "Semantic query without exact matches",
        ),
        (
            "cosine distance calculation vectors",
            "keyword_heavy",
            "Keyword-heavy query",
        ),
        (
            "What are the benefits of combining different search methods?",
            "conceptual",
            "Conceptual query about hybrid approaches",
        ),
    )
]

In [None]:
def compare_search_results(query: str, limit: int = 5) -> Dict:
    """
    Compare dense-only search with hybrid search for a single query.

    The query is embedded once (dense and sparse) and the same embeddings are
    passed to both searches, so only the search calls themselves are timed.

    Args:
        query: Query text.
        limit: Maximum number of results requested from each search method.

    Returns:
        Dict with keys 'query', 'dense_results', 'hybrid_results',
        'dense_time' and 'hybrid_time' (elapsed seconds per search call).
    """
    # Get embeddings (outside the timed sections)
    dense_emb = get_embedding(query)
    sparse_emb = get_sparse_embedding(query)

    # Dense-only search. perf_counter() is a monotonic, high-resolution clock
    # meant for measuring intervals; time.time() can jump with system clock
    # adjustments and may be too coarse for sub-millisecond calls.
    start_time = time.perf_counter()
    dense_results = rag.dense_search(
        project_id=PROJECT_ID,
        query_embedding=dense_emb,
        limit=limit
    )
    dense_time = time.perf_counter() - start_time

    # Hybrid search
    start_time = time.perf_counter()
    hybrid_results = rag.hybrid_search(
        project_id=PROJECT_ID,
        query_dense=dense_emb,
        query_sparse=sparse_emb,
        limit=limit
    )
    hybrid_time = time.perf_counter() - start_time

    return {
        'query': query,
        'dense_results': dense_results,
        'hybrid_results': hybrid_results,
        'dense_time': dense_time,
        'hybrid_time': hybrid_time
    }

In [None]:
# Run both search methods for every test query, keep the raw results for the
# analysis cells below, and print the top three hits of each method.
comparison_results = []
banner = '=' * 80

for test_query in test_queries:
    print(f"\n{banner}")
    print(f"Query: {test_query['query']}")
    print(f"Type: {test_query['type']} - {test_query['description']}")
    print(banner)

    results = compare_search_results(test_query['query'])
    # Tag the result with its query type so later cells can group by it
    results['query_type'] = test_query['type']
    comparison_results.append(results)

    top_dense = results['dense_results'][:3]
    top_hybrid = results['hybrid_results'][:3]

    print("\nDENSE SEARCH RESULTS:")
    for position, hit in enumerate(top_dense, start=1):
        print(f"{position}. [{hit['distance']:.4f}] {hit['chunk_text'][:100]}...")

    print("\nHYBRID SEARCH RESULTS:")
    for position, hit in enumerate(top_hybrid, start=1):
        print(f"{position}. [RRF: {hit['rrf_score']:.4f}] {hit['chunk_text'][:100]}...")

    print(f"\nTiming: Dense={results['dense_time']:.3f}s, Hybrid={results['hybrid_time']:.3f}s")

## Analyze Differences

In [None]:
# Calculate overlap between search methods
def calculate_overlap(dense_results: List, hybrid_results: List) -> float:
    """Return the Jaccard overlap between two result lists.

    Results are compared by their 'id' field. The value is a fraction between
    0.0 and 1.0 (shared ids divided by distinct ids across both lists), not a
    0-100 percentage; format it with ``:.1%`` for display.

    Args:
        dense_results: Results from dense search; each item must have an 'id'.
        hybrid_results: Results from hybrid search; each item must have an 'id'.

    Returns:
        |dense ∩ hybrid| / |dense ∪ hybrid|, or 0.0 when both lists are empty.
    """
    dense_ids = {r['id'] for r in dense_results}
    hybrid_ids = {r['id'] for r in hybrid_results}

    combined_ids = dense_ids | hybrid_ids
    if not combined_ids:
        # Nothing to compare; return a float to match the annotated type
        return 0.0

    return len(dense_ids & hybrid_ids) / len(combined_ids)

# Analyze results: one record per test query holding the overlap between its
# dense and hybrid result sets.
overlap_data = [
    {
        'query_type': result['query_type'],
        'overlap': calculate_overlap(result['dense_results'], result['hybrid_results']),
        'query': result['query'],
    }
    for result in comparison_results
]

# Create DataFrame
df_overlap = pd.DataFrame(overlap_data)
print("Result Overlap Analysis:")
print(df_overlap)

In [None]:
# Visualize overlap
from matplotlib.ticker import PercentFormatter

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(df_overlap['query_type'], df_overlap['overlap'])
ax.set_ylabel('Overlap Percentage')
ax.set_xlabel('Query Type')
ax.set_title('Result Overlap: Dense vs Hybrid Search')

# The overlap values are 0-1 fractions. Format the ticks as percentages so the
# axis matches its label, and leave headroom above 1.0 so the value label of a
# 100% bar stays inside the axes instead of running into the title.
ax.set_ylim(0, 1.1)
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))

# Add value labels on bars
for bar, overlap in zip(bars, df_overlap['overlap']):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
            f'{overlap:.2%}', ha='center', va='bottom')

fig.tight_layout()
plt.show()

In [None]:
# Compare search times: reshape the per-query timings into long format
# (one row per query and method) so seaborn can group the bars by method.
_timing_fields = (('Dense', 'dense_time'), ('Hybrid', 'hybrid_time'))

timing_data = [
    {
        'query_type': result['query_type'],
        'method': method_label,
        'time': result[time_key],
    }
    for result in comparison_results
    for method_label, time_key in _timing_fields
]

df_timing = pd.DataFrame(timing_data)

# Plot timing comparison
plt.figure(figsize=(10, 6))
sns.barplot(x='query_type', y='time', hue='method', data=df_timing)
plt.title('Search Performance: Dense vs Hybrid')
plt.xlabel('Query Type')
plt.ylabel('Search Time (seconds)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Deep Dive: Ranking Differences

In [None]:
# Analyze ranking differences for a specific query
analysis_query = "How do I store embeddings in a database for ML applications?"
analysis_limit = 10

print(f"Detailed Analysis for: '{analysis_query}'\n")

results = compare_search_results(analysis_query, limit=analysis_limit)

# Create ranking comparison: document id -> 1-based position in each result list
dense_ranking = {r['id']: rank for rank, r in enumerate(results['dense_results'], start=1)}
hybrid_ranking = {r['id']: rank for rank, r in enumerate(results['hybrid_results'], start=1)}

# Collect one text preview per document in a single pass (first occurrence
# wins, dense results first) instead of rescanning both lists for every id.
text_previews = {}
for r in results['dense_results'] + results['hybrid_results']:
    text_previews.setdefault(r['id'], r['chunk_text'][:100])

# Find documents that appear in either result list
all_ids = set(dense_ranking) | set(hybrid_ranking)

# A missing rank is stored as None (NaN in the frame), not as a string: mixing
# ints with 'Not in top 10' in one column makes sort_values raise TypeError as
# soon as the two result sets differ.
ranking_comparison = [
    {
        'doc_id': doc_id,
        'text_preview': text_previews.get(doc_id),
        'dense_rank': dense_ranking.get(doc_id),
        'hybrid_rank': hybrid_ranking.get(doc_id),
    }
    for doc_id in all_ids
]

rank_columns = ['dense_rank', 'hybrid_rank']
# Explicit columns keep the frame well-formed even when there are no results
df_ranking = pd.DataFrame(ranking_comparison, columns=['doc_id', 'text_preview'] + rank_columns)
for column in rank_columns:
    df_ranking[column] = pd.to_numeric(df_ranking[column])

# Sort by hybrid rank (ties and missing ranks broken by dense rank); documents
# missing from the hybrid results go last.
df_ranking = df_ranking.sort_values(['hybrid_rank', 'dense_rank'], na_position='last')

# Display ranking comparison, with missing ranks replaced by a readable label
missing_label = f'Not in top {analysis_limit}'
df_ranking_display = df_ranking.copy()
for column in rank_columns:
    df_ranking_display[column] = df_ranking_display[column].map(
        lambda rank: missing_label if pd.isna(rank) else int(rank)
    )
print(df_ranking_display.to_string(index=False))

## Key Insights

In [None]:
# Summarize key differences
print("KEY INSIGHTS:\n")

print("1. OVERLAP ANALYSIS:")
avg_overlap = df_overlap['overlap'].mean()
print(f"   - Average overlap between methods: {avg_overlap:.1%}")
print(f"   - Lowest overlap for '{df_overlap.loc[df_overlap['overlap'].idxmin(), 'query_type']}' queries")
print(f"   - Highest overlap for '{df_overlap.loc[df_overlap['overlap'].idxmax(), 'query_type']}' queries")

print("\n2. PERFORMANCE:")
avg_dense_time = df_timing.loc[df_timing['method'] == 'Dense', 'time'].mean()
avg_hybrid_time = df_timing.loc[df_timing['method'] == 'Hybrid', 'time'].mean()
print(f"   - Average dense search time: {avg_dense_time:.3f}s")
print(f"   - Average hybrid search time: {avg_hybrid_time:.3f}s")

# Report the direction that was actually measured instead of always saying
# "slower", and skip the ratio when a timing is zero or missing (the division
# would otherwise print inf or nan).
if avg_dense_time > 0 and avg_hybrid_time > 0:
    slowdown = avg_hybrid_time / avg_dense_time
    if slowdown >= 1:
        print(f"   - Hybrid is {slowdown:.1f}x slower")
    else:
        print(f"   - Hybrid is {1 / slowdown:.1f}x faster")
else:
    print("   - Timings too small or missing to compute a speed ratio")

print("\n3. WHEN TO USE EACH METHOD:")
print("   - Dense Search: Best for semantic similarity, conceptual queries")
print("   - Hybrid Search: Best for keyword-heavy queries, exact matches, comprehensive recall")

## Interactive Comparison Tool

In [None]:
def interactive_compare(query: str, limit: int = 5):
    """
    Interactive tool to compare search methods.

    Runs dense and hybrid search for ``query`` and prints the two result lists
    side by side, followed by the search timings and the result overlap.

    Args:
        query: Query text to search for.
        limit: Number of results requested from each method; also the number
            of table rows printed.
    """
    column_width = 60

    def format_cell(prefix: str, text: str) -> str:
        """Fit 'prefix + text' on one line of at most ``column_width`` chars."""
        # Collapse newlines/tabs so a multi-line chunk cannot break the table
        cell = f"{prefix}{' '.join(text.split())}"
        if len(cell) > column_width:
            cell = cell[:column_width - 3] + "..."
        return cell

    results = compare_search_results(query, limit=limit)
    dense_hits = results['dense_results']
    hybrid_hits = results['hybrid_results']

    # Create side-by-side comparison; the rule spans both columns and the " | "
    rule = "=" * (2 * column_width + 3)
    print(f"\nQuery: '{query}'\n")
    print(rule)
    print(f"{'DENSE SEARCH':<{column_width}} | {'HYBRID SEARCH':<{column_width}}")
    print(rule)

    for i in range(limit):
        # Dense result (blank cell when this method returned fewer rows)
        dense_text = ""
        if i < len(dense_hits):
            d = dense_hits[i]
            dense_text = format_cell(f"{i+1}. [{d['distance']:.3f}] ", d['chunk_text'])

        # Hybrid result
        hybrid_text = ""
        if i < len(hybrid_hits):
            h = hybrid_hits[i]
            hybrid_text = format_cell(f"{i+1}. [RRF:{h['rrf_score']:.3f}] ", h['chunk_text'])

        print(f"{dense_text:<{column_width}} | {hybrid_text:<{column_width}}")

    print("\nMetrics:")
    print(f"Search time: Dense={results['dense_time']:.3f}s, Hybrid={results['hybrid_time']:.3f}s")

    overlap = calculate_overlap(dense_hits, hybrid_hits)
    print(f"Result overlap: {overlap:.1%}")

In [None]:
# Try your own queries: change the string below and re-run this cell to print
# the dense and hybrid results side by side.
interactive_compare("What are vector databases?")

In [None]:
# Another example: a keyword-style query made of terms that appear verbatim in
# the PostgreSQL test document.
interactive_compare("PostgreSQL JSON full-text search")

## Cleanup

In [None]:
# Close database connection. Run this cell last: the search cells above all go
# through `rag` and will presumably fail once it is closed (TODO confirm).
rag.close()
print("Database connection closed")