# In [48]:
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional, Union
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Verify that the NLTK tokenizer model and stopword list are installed.
# If either lookup fails, both packages are downloaded.
try:
    for _resource in ('tokenizers/punkt', 'corpora/stopwords'):
        nltk.data.find(_resource)
except LookupError:
    for _package in ('punkt', 'stopwords'):
        nltk.download(_package)

class HybridSearchManager:
    """Manages hybrid search using Qdrant Query API."""
    
    def __init__(self, collection_name: str = "documentation"):
        """Create the Qdrant client, the dense encoder and the sparse-text helpers.

        Args:
            collection_name: Name of the Qdrant collection this manager works on.
        """
        # In-memory database for testing
        self.client = QdrantClient(":memory:")
        self.collection_name = collection_name

        # Dense side: sentence encoder plus the embedding width it reports
        encoder = SentenceTransformer('BAAI/bge-large-en-v1.5')
        self.dense_model = encoder
        self.vector_size = encoder.get_sentence_embedding_dimension()

        # Local bookkeeping: indexed texts and point id -> corpus position
        self.corpus, self.id_to_idx = [], {}

        # Sparse side: stopword filter, stemmer and a not-yet-fitted TF-IDF model
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.tfidf_vectorizer = None
    
    def preprocess_text(self, text: str) -> List[str]:
        """Return the stemmed tokens of *text*.

        The text is lower-cased and tokenized; tokens that are not purely
        alphanumeric or that are stopwords are dropped, the rest are stemmed.
        """
        stems = []
        for word in word_tokenize(text.lower()):
            if not word.isalnum() or word in self.stop_words:
                continue
            stems.append(self.stemmer.stem(word))
        return stems
    
    def initialize_collection(self):
        """Recreate the collection with "dense" and "sparse" named vectors.

        Any existing collection of the same name is dropped first, and the
        local corpus bookkeeping is cleared afterwards.
        """
        # Best-effort removal of a previous collection; any error is ignored.
        try:
            self.client.delete_collection(collection_name=self.collection_name)
        except Exception:
            pass

        dense_params = models.VectorParams(
            size=self.vector_size,
            distance=models.Distance.COSINE,
        )
        # The TF-IDF vector is stored as an ordinary fixed-size vector
        # (10000 slots) and compared with COSINE instead of Dot.
        sparse_params = models.VectorParams(
            size=10000,
            distance=models.Distance.COSINE,
        )
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config={"dense": dense_params, "sparse": sparse_params},
        )

        # Start with empty bookkeeping for the fresh collection
        self.corpus, self.id_to_idx = [], {}
    
    def create_sparse_vector_as_dense(self, text: str) -> List[float]:
        """Create a sparse vector as dense format for compatibility."""
        if self.tfidf_vectorizer is None:
            raise ValueError("TF-IDF vectorizer not initialized. Call compute_tfidf_vectorizer first.")
        
        # Preprocess the text
        preprocessed_text = ' '.join(self.preprocess_text(text))
        
        # Transform the text using the pre-trained TF-IDF vectorizer
        vector = self.tfidf_vectorizer.transform([preprocessed_text])
        
        # Convert to dense list with sparse values
        sparse_vector = [0.0] * self.tfidf_vectorizer.max_features
        for idx, value in zip(vector.indices, vector.data):
            sparse_vector[idx] = value
        
        # Normalize the vector for cosine similarity
        norm = np.linalg.norm(sparse_vector)
        if norm > 0:
            sparse_vector = [v / norm for v in sparse_vector]
        
        return sparse_vector
    
    def compute_tfidf_vectorizer(self, texts: List[str]):
        """Fit a new TF-IDF vectorizer (at most 10000 features) on *texts*.

        Each text goes through ``preprocess_text`` first; the fitted
        vectorizer replaces ``self.tfidf_vectorizer``.
        """
        documents = []
        for raw_text in texts:
            documents.append(' '.join(self.preprocess_text(raw_text)))

        self.tfidf_vectorizer = vectorizer = TfidfVectorizer(max_features=10000)
        vectorizer.fit(documents)
    
    def upsert_batch(self, id_offset: int, texts: List[str], payloads: List[Dict[str, Any]]):
        """Add documents to Qdrant with both dense and sparse vectors."""
        # Save texts in corpus
        for idx, text in enumerate(texts):
            corpus_idx = len(self.corpus)
            self.corpus.append(text)
            self.id_to_idx[idx + id_offset] = corpus_idx
        
        # Compute TF-IDF vectorizer if not already computed
        if self.tfidf_vectorizer is None:
            self.compute_tfidf_vectorizer(texts)
        
        # Compute dense embeddings
        dense_embeddings = self.dense_model.encode(texts, normalize_embeddings=True)
        
        # Prepare points for upserting
        points = []
        for idx, (text, payload, dense_embedding) in enumerate(zip(texts, payloads, dense_embeddings)):
            # Create sparse vector as dense format
            sparse_vector = self.create_sparse_vector_as_dense(text)
            
            # Create point with both vectors
            points.append(models.PointStruct(
                id=idx + id_offset,
                vector={
                    "dense": dense_embedding.tolist(),
                    "sparse": sparse_vector
                },
                payload=payload
            ))
        
        # Upsert points
        self.client.upsert(
            collection_name=self.collection_name,
            points=points
        )
    
    def hybrid_search(self, query: str, limit: int = 6, hybrid_method: str = "fusion") -> List[Dict[str, Any]]:
        """
        Perform hybrid search using Qdrant's Query API.
        
        Args:
            query: The search query
            limit: Number of results to return
            hybrid_method: Method to use for hybrid search.
                Options:
                - "fusion": Combine dense and sparse using RRF fusion
                - "dense_rerank": First get sparse candidates, then rerank with dense
                - "sparse_rerank": First get dense candidates, then rerank with sparse
        
        Returns:
            List of search results
        """
        dense_vector = self.dense_model.encode(query, normalize_embeddings=True).tolist()
        sparse_vector = self.create_sparse_vector_as_dense(query)
        
        if hybrid_method == "fusion":
            # For fusion, get results from both vectors separately and combine them
            try:
                dense_results = self.client.search(
                    collection_name=self.collection_name,
                    query_vector=("dense", dense_vector),
                    limit=limit * 2
                )
            except Exception as e:
                print(f"Error in dense search: {e}")
                dense_results = []
            
            try:
                sparse_results = self.client.search(
                    collection_name=self.collection_name,
                    query_vector=("sparse", sparse_vector),
                    limit=limit * 2
                )
            except Exception as e:
                print(f"Error in sparse search: {e}")
                sparse_results = []
            
            # Simple RRF fusion (Reciprocal Rank Fusion)
            dense_scores = {hit.id: 1.0/(1.0 + idx) for idx, hit in enumerate(dense_results)}
            sparse_scores = {hit.id: 1.0/(1.0 + idx) for idx, hit in enumerate(sparse_results)}
            
            # Combine all document IDs
            all_ids = set(dense_scores.keys()) | set(sparse_scores.keys())
            
            if not all_ids:
                return []
            
            # Calculate combined scores
            combined_scores = []
            for doc_id in all_ids:
                dense_score = dense_scores.get(doc_id, 0.0)
                sparse_score = sparse_scores.get(doc_id, 0.0)
                combined_score = dense_score + sparse_score
                combined_scores.append((doc_id, combined_score))
            
            # Sort by combined score
            combined_scores.sort(key=lambda x: x[1], reverse=True)
            
            # Get top results
            top_ids = [doc_id for doc_id, _ in combined_scores[:limit]]
            
            if not top_ids:
                return []
            
            # Retrieve points
            try:
                results = self.client.retrieve(
                    collection_name=self.collection_name,
                    ids=top_ids
                )
            except Exception as e:
                print(f"Error retrieving points: {e}")
                return []
            
            # Map to common format with score
            search_results = []
            for point in results:
                score = next((score for id, score in combined_scores if id == point.id), 0.0)
                search_results.append({
                    'text': point.payload.get('text', ''),
                    'id': point.payload.get('id', point.id),
                    'score': score
                })
            
            # Sort by score
            search_results.sort(key=lambda x: x['score'], reverse=True)
        
        elif hybrid_method == "dense_rerank":
            # First get sparse candidates
            try:
                sparse_results = self.client.search(
                    collection_name=self.collection_name,
                    query_vector=("sparse", sparse_vector),
                    limit=limit * 3
                )
            except Exception as e:
                print(f"Error in sparse search: {e}")
                return []
            
            sparse_ids = [hit.id for hit in sparse_results]
            
            if not sparse_ids:
                return []
            
            # Retrieve full points for candidates
            try:
                candidate_points = self.client.retrieve(
                    collection_name=self.collection_name,
                    ids=sparse_ids
                )
            except Exception as e:
                print(f"Error retrieving points: {e}")
                return []
            
            # Manual scoring with dense vector
            dense_scores = []
            for point in candidate_points:
                # Skip points without the required vector
                if not hasattr(point, 'vector') or point.vector is None or "dense" not in point.vector:
                    continue
                    
                try:
                    # Calculate cosine similarity manually
                    dense_vec = np.array(point.vector["dense"])
                    if dense_vec is None:
                        continue
                        
                    dense_query_vec = np.array(dense_vector)
                    
                    # Normalize if needed
                    norm_dense = np.linalg.norm(dense_vec)
                    if norm_dense > 0:
                        dense_vec = dense_vec / norm_dense
                        
                    norm_query = np.linalg.norm(dense_query_vec)
                    if norm_query > 0:
                        dense_query_vec = dense_query_vec / norm_query
                    
                    # Calculate cosine similarity
                    similarity = np.dot(dense_vec, dense_query_vec)
                    
                    dense_scores.append((point, similarity))
                except Exception as e:
                    print(f"Error calculating similarity: {e}")
                    continue
            
            # Sort by dense score
            dense_scores.sort(key=lambda x: x[1], reverse=True)
            
            # Format results
            search_results = [
                {
                    'text': point.payload.get('text', ''),
                    'id': point.payload.get('id', point.id),
                    'score': score
                }
                for point, score in dense_scores[:limit]
            ]
        
        elif hybrid_method == "sparse_rerank":
            # Dense first
            try:
                dense_results = self.client.search(
                    collection_name=self.collection_name,
                    query_vector=("dense", dense_vector),
                    limit=limit * 3
                )
            except Exception as e:
                print(f"Error in dense search: {e}")
                return []
            
            dense_ids = [hit.id for hit in dense_results]
            
            if not dense_ids:
                return []
            
            # Retrieve full points for candidates
            try:
                candidate_points = self.client.retrieve(
                    collection_name=self.collection_name,
                    ids=dense_ids
                )
            except Exception as e:
                print(f"Error retrieving points: {e}")
                return []
            
            # Manual scoring with sparse vector
            sparse_scores = []
            for point in candidate_points:
                # Skip points without the required vector
                if not hasattr(point, 'vector') or point.vector is None or "sparse" not in point.vector:
                    continue
                    
                try:
                    # Calculate cosine similarity manually
                    sparse_vec = np.array(point.vector["sparse"])
                    if sparse_vec is None:
                        continue
                        
                    sparse_query_vec = np.array(sparse_vector)
                    
                    # Normalize if needed
                    norm_sparse = np.linalg.norm(sparse_vec)
                    if norm_sparse > 0:
                        sparse_vec = sparse_vec / norm_sparse
                        
                    norm_query = np.linalg.norm(sparse_query_vec)
                    if norm_query > 0:
                        sparse_query_vec = sparse_query_vec / norm_query
                    
                    # Calculate cosine similarity
                    similarity = np.dot(sparse_vec, sparse_query_vec)
                    
                    sparse_scores.append((point, similarity))
                except Exception as e:
                    print(f"Error calculating similarity: {e}")
                    continue
            
            # Sort by sparse score
            sparse_scores.sort(key=lambda x: x[1], reverse=True)
            
            # Format results
            search_results = [
                {
                    'text': point.payload.get('text', ''),
                    'id': point.payload.get('id', point.id),
                    'score': score
                }
                for point, score in sparse_scores[:limit]
            ]
        
        else:
            raise ValueError(f"Invalid hybrid method: {hybrid_method}")
        
        return search_results
    
    def weighted_hybrid_search(self, query: str, dense_weight: float = 0.5, 
                              sparse_weight: float = 0.5, limit: int = 6) -> List[Dict[str, Any]]:
        """
        Perform weighted hybrid search using a custom scoring function.
        
        Args:
            query: The search query
            dense_weight: Weight for dense vector scores (0.0 to 1.0)
            sparse_weight: Weight for sparse vector scores (0.0 to 1.0)
            limit: Number of results to return
            
        Returns:
            List of search results with custom-weighted scores
        """
        if not (0 <= dense_weight <= 1) or not (0 <= sparse_weight <= 1):
            raise ValueError("Weights must be between 0 and 1")
        
        # Normalize weights to sum to 1.0
        total_weight = dense_weight + sparse_weight
        if total_weight == 0:
            raise ValueError("At least one weight must be positive")
        
        dense_weight = dense_weight / total_weight
        sparse_weight = sparse_weight / total_weight
        
        # Encode query
        dense_vector = self.dense_model.encode(query, normalize_embeddings=True).tolist()
        sparse_vector = self.create_sparse_vector_as_dense(query)
        
        # Get candidates using both vectors for better coverage
        candidate_limit = limit * 3
        
        # Retrieve dense candidates
        dense_results = self.client.search(
            collection_name=self.collection_name,
            query_vector=("dense", dense_vector),
            limit=candidate_limit
        )
        
        # Retrieve sparse candidates
        sparse_results = self.client.search(
            collection_name=self.collection_name,
            query_vector=("sparse", sparse_vector),
            limit=candidate_limit
        )
        
        # Combine candidate IDs
        dense_scores = {hit.id: hit.score for hit in dense_results}
        sparse_scores = {hit.id: hit.score for hit in sparse_results}
        all_ids = set(dense_scores.keys()) | set(sparse_scores.keys())
        
        # Score candidates using weighted formula
        combined_scores = []
        for doc_id in all_ids:
            dense_score = dense_scores.get(doc_id, 0.0)
            sparse_score = sparse_scores.get(doc_id, 0.0)
            
            # Weighted combination
            combined_score = (dense_weight * dense_score) + (sparse_weight * sparse_score)
            
            combined_scores.append((doc_id, combined_score))
        
        # Sort by combined score and get top results
        combined_scores.sort(key=lambda x: x[1], reverse=True)
        top_ids = [doc_id for doc_id, _ in combined_scores[:limit]]
        
        if not top_ids:
            return []
        
        # Fetch full results for top IDs
        points = self.client.retrieve(
            collection_name=self.collection_name,
            ids=top_ids
        )
        
        # Format results
        results = []
        for point in points:
            score = next((score for id, score in combined_scores if id == point.id), 0.0)
            results.append({
                'text': point.payload.get('text', ''),
                'id': point.payload.get('id', point.id),
                'score': score
            })
        
        # Sort by score
        results.sort(key=lambda x: x['score'], reverse=True)
        
        return results

class MetricsCalculator:
    """Class for calculating search quality metrics."""

    @staticmethod
    def calculate_recall_at_k(relevant_ids: List[int], retrieved_ids: List[int], k: int) -> float:
        """Calculate Recall@k for a single query.

        Args:
            relevant_ids: Ids of the relevant documents (duplicates are ignored).
            retrieved_ids: Retrieved ids, best match first.
            k: Cut-off rank; values <= 0 yield 0.0.

        Returns:
            Fraction of distinct relevant ids found in the first k retrieved
            ids, or 0.0 when there are no relevant ids.
        """
        # De-duplicate so numerator and denominator count the same thing
        relevant = set(relevant_ids)
        if not relevant or k <= 0:
            return 0.0

        return len(relevant.intersection(retrieved_ids[:k])) / len(relevant)

    @staticmethod
    def calculate_mrr_at_k(relevant_ids: List[int], retrieved_ids: List[int], k: int) -> float:
        """Calculate the reciprocal rank (the per-query term of MRR@k).

        Args:
            relevant_ids: Ids of the relevant documents.
            retrieved_ids: Retrieved ids, best match first.
            k: Cut-off rank; values <= 0 yield 0.0.

        Returns:
            1 / rank of the first relevant id within the first k retrieved
            ids, or 0.0 when none is found.
        """
        relevant = set(relevant_ids)
        if not relevant or k <= 0:
            return 0.0

        for rank, doc_id in enumerate(retrieved_ids[:k], start=1):
            if doc_id in relevant:
                return 1.0 / rank
        return 0.0

    @staticmethod
    def compute_average_metrics(metrics_list: List[Dict[str, float]]) -> Dict[str, float]:
        """Calculate average metrics across all queries.

        A metric missing from one query's dict counts as 0.0 for that query,
        so every average is taken over ``len(metrics_list)`` queries.

        Returns:
            Mapping of metric name to its average; empty for an empty input.
        """
        if not metrics_list:
            return {}

        # Collect metric names in first-seen order
        all_keys = {}
        for metrics in metrics_list:
            all_keys.update(dict.fromkeys(metrics))

        query_count = len(metrics_list)
        return {
            key: sum(metrics.get(key, 0.0) for metrics in metrics_list) / query_count
            for key in all_keys
        }

class HybridSearchEvaluator:
    """Evaluates hybrid search performance."""
    
    def __init__(self, data_path: str = 'qdrant_documentation_dataset.csv'):
        """Create the search manager and metrics helper, then load the dataset.

        Args:
            data_path: Path of the CSV file handed to ``load_data``.
        """
        # Collaborators
        self.hybrid_search = HybridSearchManager()
        self.metrics_calculator = MetricsCalculator()

        # Evaluation state, filled in by load_data() / initialize_database()
        self.is_initialized = False
        self.section_id_map = {}
        self.df = None

        self.load_data(data_path)
    
    def load_data(self, file_path: str):
        """Load data from CSV file."""
        self.df = pd.read_csv(file_path)
        print(f"Loaded {len(self.df)} records from dataset.")
    
    def initialize_database(self):
        """Initialize the database with documents from the dataset."""
        if self.is_initialized:
            return
        
        if self.df is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        # Get unique chunks of content
        sections = self.df['section_content'].unique()
        print(f"Found {len(sections)} unique chunks for indexing.")
        
        # Create mapping of section_content -> id
        for idx, section in enumerate(sections):
            self.section_id_map[section] = idx
        
        # Initialize Qdrant collection
        self.hybrid_search.initialize_collection()
        
        # Compute embeddings and add to Qdrant
        batch_size = 50  # Smaller batch size to avoid memory issues
        for i in range(0, len(sections), batch_size):
            batch = sections[i:i + batch_size]
            
            # Prepare payloads
            payloads = [
                {
                    'text': section,
                    'id': self.section_id_map[section]
                }
                for section in batch
            ]
            
            # Add to Qdrant
            print(f"Processing batch {i//batch_size + 1}/{(len(sections) - 1)//batch_size + 1}")
            self.hybrid_search.upsert_batch(i, batch, payloads)
        
        self.is_initialized = True
        print("Database successfully initialized.")
    
    def evaluate_hybrid_search(self, hybrid_methods: List[str] = ["fusion", "dense_rerank", "sparse_rerank"],
                              k_values: List[int] = [1, 4, 6]) -> Dict[str, Dict[str, float]]:
        """Run every dataset question through each hybrid method and average the metrics.

        The database is initialized first if needed. For each question the
        single relevant document is the row's ``section_content``; Recall@k
        and MRR@k are computed for every k in *k_values*.

        Args:
            hybrid_methods: Method names passed to ``hybrid_search``.
            k_values: Cut-off ranks; the largest one is used as search limit.

        Returns:
            Mapping of method name to its averaged metrics.
        """
        if not self.is_initialized:
            self.initialize_database()

        calculator = self.metrics_calculator
        results = {}

        for method in hybrid_methods:
            print(f"\n===== Evaluating hybrid method: {method} =====")

            per_query_metrics = []
            progress = tqdm(self.df.iterrows(), total=len(self.df),
                            desc=f"Evaluating {method}")

            for _, row in progress:
                question = row['question']
                relevant_id = self.section_id_map[row['section_content']]

                hits = self.hybrid_search.hybrid_search(
                    question,
                    limit=max(k_values),
                    hybrid_method=method
                )

                if hits:
                    retrieved_ids = [hit['id'] for hit in hits]
                    query_metrics = {}
                    for k in k_values:
                        query_metrics[f'Recall@{k}'] = calculator.calculate_recall_at_k(
                            [relevant_id], retrieved_ids, k)
                        query_metrics[f'MRR@{k}'] = calculator.calculate_mrr_at_k(
                            [relevant_id], retrieved_ids, k)
                else:
                    # No results at all: every metric is 0 for this query
                    query_metrics = {f'Recall@{k}': 0.0 for k in k_values}
                    query_metrics.update({f'MRR@{k}': 0.0 for k in k_values})

                per_query_metrics.append(query_metrics)

            # Average over all queries, report, and store under the method name
            average_metrics = calculator.compute_average_metrics(per_query_metrics)
            for metric_name, value in sorted(average_metrics.items()):
                print(f"{metric_name}: {value:.4f}")
            results[method] = average_metrics

        return results
    
    def compare_with_baseline(self, results: Dict[str, Dict[str, float]]):
        """Compare hybrid search methods with baseline."""
        # Baseline metrics from earlier experiments
        baseline = {
            'MRR@1': 0.6250,
            'MRR@4': 0.7790,
            'MRR@6': 0.7836,
            'Recall@1': 0.7104,
            'Recall@4': 0.8780,
            'Recall@6': 0.9024
        }
        
        # Add baseline to results
        results['Best from previous experiments'] = baseline
        
        # Print comparison table
        print("\n===== Comparison with baseline =====")
        
        # Get all metrics from results
        metrics = set()
        for method_metrics in results.values():
            metrics.update(method_metrics.keys())
        
        # For consistent ordering
        metrics = sorted(metrics)
        methods = sorted([m for m in results.keys() if m != 'Best from previous experiments']) + ['Best from previous experiments']
        
        # Print header
        header = "Metric     | " + " | ".join(f"{method:<16}" for method in methods)
        print(header)
        print("-" * len(header))
        
        # Prioritize MRR and Recall metrics
        metrics_order = []
        for prefix in ['MRR@', 'Recall@']:
            for metric in metrics:
                if metric.startswith(prefix):
                    metrics_order.append(metric)
        
        for metric in metrics_order:
            baseline_value = baseline.get(metric, 0.0)
            row = f"{metric:<10} | "
            
            for method in methods:
                value = results[method].get(metric, 0.0)
                
                # Calculate percentage difference from baseline
                if baseline_value > 0:
                    pct_diff = ((value - baseline_value) / baseline_value) * 100
                    if method == 'Best from previous experiments':
                        direction = "↓"  # Just a neutral indicator for baseline
                        pct_diff = 0.0  # No difference from itself
                    else:
                        direction = "↑" if pct_diff > 0 else "↓"
                    
                    row += f"{value:.4f} {direction} {abs(pct_diff):.1f}% | "
                else:
                    row += f"{value:.4f} | "
            
            print(row.rstrip(" | "))
        
        # Find best method for each metric
        print("\n===== Best Methods =====")
        for metric in metrics_order:
            best_method = None
            best_value = -1
            for method in results:
                value = results[method].get(metric, 0.0)
                if value > best_value:
                    best_value = value
                    best_method = method
            
            # Calculate improvement over baseline
            baseline_value = baseline.get(metric, 0.0)
            if baseline_value > 0:
                pct_diff = ((best_value - baseline_value) / baseline_value) * 100
                print(f"{metric}: {best_method} ({best_value:.4f}, {'+' if pct_diff > 0 else ''}{pct_diff:.1f}%)")
            else:
                print(f"{metric}: {best_method} ({best_value:.4f})")
    
    def find_optimal_weights(self, k_values: List[int] = [1, 4, 6], 
                          weight_steps: int = 5) -> Dict[str, Any]:
        """
        Perform grid search to find optimal weights for hybrid search.

        Every (dense, sparse) weight pair on the grid except (0, 0) is
        evaluated on the same fixed sample of at most 50 questions. The best
        configuration is the one with the highest MRR@4; the first evaluated
        configuration wins ties, including the case where MRR@4 is 0 or not
        computed because 4 is not in *k_values*.

        Args:
            k_values: List of k values for evaluation metrics
            weight_steps: Number of steps for grid search (from 0 to 1)

        Returns:
            Dictionary with grid search results ("results", keyed by
            (dense_weight, sparse_weight) tuples) and best configuration
            ("best", with "weights", "metrics" and "best_mrr")
        """
        if not self.is_initialized:
            self.initialize_database()

        print("\n===== Finding optimal weights for hybrid search =====")

        # Generate weight combinations
        weights = np.linspace(0, 1, weight_steps + 1)

        results = {}
        best_config = {
            "weights": None,
            "metrics": None,
            "best_mrr": 0.0
        }

        # For tracking progress; the (0, 0) pair is skipped, hence the -1
        total_configs = len(weights) * len(weights) - 1
        config_count = 0

        # Process a sample of questions for efficiency. It is drawn once:
        # the fixed random state makes it identical for every configuration.
        sample_size = min(50, len(self.df))
        sample_df = self.df.sample(sample_size, random_state=42)

        for dense_weight in weights:
            for sparse_weight in weights:
                # Skip if both weights are 0
                if dense_weight == 0 and sparse_weight == 0:
                    continue

                config_count += 1
                print(f"\nTesting weights {config_count}/{total_configs}: "
                      f"Dense={dense_weight:.2f}, Sparse={sparse_weight:.2f}")

                # Plain floats so the result keys are ordinary tuples of float
                weight_config = (float(dense_weight), float(sparse_weight))
                all_metrics = []

                for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), 
                                   desc="Evaluating weights"):
                    question = row['question']
                    relevant_id = self.section_id_map[row['section_content']]

                    # Get search results with current weights
                    search_results = self.hybrid_search.weighted_hybrid_search(
                        question,
                        dense_weight=weight_config[0],
                        sparse_weight=weight_config[1],
                        limit=max(k_values)
                    )

                    if not search_results:
                        # Empty results, count as 0 for all metrics
                        query_metrics = {f'Recall@{k}': 0.0 for k in k_values}
                        query_metrics.update({f'MRR@{k}': 0.0 for k in k_values})
                    else:
                        retrieved_ids = [result['id'] for result in search_results]

                        query_metrics = {}
                        for k in k_values:
                            query_metrics[f'Recall@{k}'] = self.metrics_calculator.calculate_recall_at_k(
                                [relevant_id], retrieved_ids, k)
                            query_metrics[f'MRR@{k}'] = self.metrics_calculator.calculate_mrr_at_k(
                                [relevant_id], retrieved_ids, k)

                    all_metrics.append(query_metrics)

                # Calculate average metrics
                average_metrics = self.metrics_calculator.compute_average_metrics(all_metrics)
                results[weight_config] = average_metrics

                # Print current results
                for metric_name, value in sorted(average_metrics.items()):
                    print(f"{metric_name}: {value:.4f}")

                # Update best configuration based on MRR@4. The first
                # configuration is always accepted so that a best entry exists
                # even when no configuration scores above 0.
                mrr4 = average_metrics.get('MRR@4', 0.0)
                if best_config['weights'] is None or mrr4 > best_config['best_mrr']:
                    best_config['best_mrr'] = mrr4
                    best_config['weights'] = weight_config
                    best_config['metrics'] = average_metrics

        # Print the best configuration
        print("\n===== Best Weight Configuration =====")
        if best_config['weights'] is None:
            # Only possible when the grid holds nothing but the (0, 0) pair
            print("No weight configuration was evaluated.")
        else:
            print(f"Dense Weight: {best_config['weights'][0]:.2f}")
            print(f"Sparse Weight: {best_config['weights'][1]:.2f}")
            print("Metrics:")
            for metric_name, value in sorted(best_config['metrics'].items()):
                print(f"{metric_name}: {value:.4f}")

        return {"results": results, "best": best_config}

# In [49]:
def weighted_hybrid_search(self, query: str, dense_weight: float = 0.5, 
                          sparse_weight: float = 0.5, limit: int = 6) -> List[Dict[str, Any]]:
    """
    Perform weighted hybrid search using a custom scoring function.

    Candidates are fetched independently from the "dense" and "sparse" named
    vectors, their scores are linearly combined with the (normalized) weights,
    and the payloads of the best ``limit`` candidates are retrieved.

    Search and retrieval errors are reported with ``print`` and treated as
    "no results" (best-effort), they are not re-raised.

    Args:
        query: The search query
        dense_weight: Weight for dense vector scores (0.0 to 1.0)
        sparse_weight: Weight for sparse vector scores (0.0 to 1.0)
        limit: Number of results to return

    Returns:
        List of dicts with keys 'text', 'id' and 'score', sorted by
        descending combined score. Empty when nothing could be retrieved.

    Raises:
        ValueError: If a weight is outside [0, 1] or both weights are 0.
    """
    if not (0 <= dense_weight <= 1) or not (0 <= sparse_weight <= 1):
        raise ValueError("Weights must be between 0 and 1")

    # Normalize weights to sum to 1.0
    total_weight = dense_weight + sparse_weight
    if total_weight == 0:
        raise ValueError("At least one weight must be positive")

    dense_weight = dense_weight / total_weight
    sparse_weight = sparse_weight / total_weight

    # Encode query
    dense_vector = self.dense_model.encode(query, normalize_embeddings=True).tolist()
    sparse_vector = self.create_sparse_vector_as_dense(query)

    # Over-fetch from each vector space so the union has enough candidates
    candidate_limit = limit * 3

    def _candidate_scores(vector_name, vector):
        """Return {point id: score} for one named vector; {} if the search fails."""
        try:
            hits = self.client.search(
                collection_name=self.collection_name,
                query_vector=(vector_name, vector),
                limit=candidate_limit
            )
        except Exception as e:
            print(f"Error in {vector_name} search: {e}")
            return {}
        return {hit.id: hit.score for hit in hits}

    dense_scores = _candidate_scores("dense", dense_vector)
    sparse_scores = _candidate_scores("sparse", sparse_vector)

    # A candidate missing from one result list contributes 0.0 for that side.
    # NOTE(review): raw scores are combined without rescaling - confirm the
    # dense and sparse scores are on comparable scales.
    combined_scores = {
        doc_id: (dense_weight * dense_scores.get(doc_id, 0.0))
        + (sparse_weight * sparse_scores.get(doc_id, 0.0))
        for doc_id in dense_scores.keys() | sparse_scores.keys()
    }
    if not combined_scores:
        return []

    # Rank by combined score and keep the top results
    top_ids = sorted(combined_scores, key=combined_scores.get, reverse=True)[:limit]
    if not top_ids:
        return []

    # Fetch full results for top IDs in a single call
    try:
        points = self.client.retrieve(
            collection_name=self.collection_name,
            ids=top_ids
        )
    except Exception as e:
        print(f"Error retrieving points: {e}")
        return []

    # Format results; the score is an O(1) dict lookup per point
    results = []
    for point in points:
        payload = point.payload or {}  # tolerate points stored without payload
        results.append({
            'text': payload.get('text', ''),
            'id': payload.get('id', point.id),
            'score': combined_scores.get(point.id, 0.0)
        })

    # retrieve() is not relied upon to preserve the requested order
    results.sort(key=lambda item: item['score'], reverse=True)

    return results

In [50]:
def weighted_hybrid_search(self, query: str, dense_weight: float = 0.5, 
                          sparse_weight: float = 0.5, limit: int = 6) -> List[Dict[str, Any]]:
    """
    Perform weighted hybrid search using a custom scoring function.

    Candidates are fetched independently from the "dense" and "sparse" named
    vectors, their scores are linearly combined with the (normalized) weights,
    and the payloads of the best ``limit`` candidates are retrieved in one
    batched call. Errors raised by the client propagate to the caller.

    Args:
        query: The search query
        dense_weight: Weight for dense vector scores (0.0 to 1.0)
        sparse_weight: Weight for sparse vector scores (0.0 to 1.0)
        limit: Number of results to return

    Returns:
        List of dicts with keys 'text', 'id' and 'score', ordered by
        descending combined score. Ranked ids that can no longer be
        retrieved are skipped.

    Raises:
        ValueError: If a weight is outside [0, 1] or both weights are 0.
    """
    if not (0 <= dense_weight <= 1) or not (0 <= sparse_weight <= 1):
        raise ValueError("Weights must be between 0 and 1")

    # Normalize weights to sum to 1.0
    total_weight = dense_weight + sparse_weight
    if total_weight == 0:
        raise ValueError("At least one weight must be positive")

    dense_weight = dense_weight / total_weight
    sparse_weight = sparse_weight / total_weight

    # Encode query
    dense_vector = self.dense_model.encode(query, normalize_embeddings=True).tolist()
    sparse_vector = self.create_sparse_vector(query)

    # Over-fetch from each vector space so the union has enough candidates
    candidate_limit = limit * 3

    # Retrieve dense candidates
    dense_results = self.client.search(
        collection_name=self.collection_name,
        query_vector=("dense", dense_vector),
        limit=candidate_limit
    )

    # Retrieve sparse candidates
    sparse_results = self.client.search(
        collection_name=self.collection_name,
        query_vector=("sparse", sparse_vector),
        limit=candidate_limit
    )

    dense_scores = {hit.id: hit.score for hit in dense_results}
    sparse_scores = {hit.id: hit.score for hit in sparse_results}

    # A candidate missing from one result list contributes 0.0 for that side.
    # NOTE(review): raw scores are combined without rescaling - confirm the
    # dense and sparse scores are on comparable scales.
    combined_scores = {
        doc_id: (dense_weight * dense_scores.get(doc_id, 0.0))
        + (sparse_weight * sparse_scores.get(doc_id, 0.0))
        for doc_id in dense_scores.keys() | sparse_scores.keys()
    }

    # Rank by combined score and keep the top results
    top_ids = sorted(combined_scores, key=combined_scores.get, reverse=True)[:limit]
    if not top_ids:
        return []

    # One batched retrieve instead of one round-trip per result
    points = self.client.retrieve(
        collection_name=self.collection_name,
        ids=top_ids
    )
    points_by_id = {point.id: point for point in points}

    # Emit results in rank order; retrieve() is not relied upon to keep it
    results = []
    for doc_id in top_ids:
        point = points_by_id.get(doc_id)
        if point is None:
            # The id was ranked but could not be fetched; skip instead of crashing
            continue
        payload = point.payload or {}  # tolerate points stored without payload
        results.append({
            'text': payload.get('text', ''),
            'id': payload.get('id', point.id),
            'score': combined_scores[doc_id]
        })

    return results

In [51]:
def find_optimal_weights(self, k_values: List[int] = [1, 4, 6], 
                        weight_steps: int = 5) -> Dict[str, Any]:
    """
    Perform grid search to find optimal weights for hybrid search.

    Every (dense, sparse) pair on an evenly spaced grid over [0, 1] is
    evaluated, except (0, 0). All configurations are scored on the same
    random sample of at most 50 rows of ``self.df`` so that their metrics
    are comparable.

    Args:
        k_values: List of k values for evaluation metrics
        weight_steps: Number of steps for grid search (from 0 to 1)

    Returns:
        Dictionary with:
          - "results": {(dense_weight, sparse_weight): averaged metrics}
          - "best": {"weights", "metrics", "best_mrr"} of the winning
            configuration. Selection uses MRR@4 when 4 is in ``k_values``,
            otherwise MRR at the largest k. "weights" and "metrics" are
            None when no configuration was evaluated.
    """
    if not self.is_initialized:
        self.initialize_database()

    print("\n===== Finding optimal weights for hybrid search =====")

    # Generate weight combinations; (0, 0) is excluded because at least one
    # weight must be positive. Plain floats keep the result keys free of
    # NumPy scalar types.
    # NOTE: weighted_hybrid_search normalizes the weights, so pairs with the
    # same ratio (e.g. 0.2/0.2 and 0.4/0.4) rank identically.
    weights = np.linspace(0, 1, weight_steps + 1)
    weight_configs = [
        (float(dense_weight), float(sparse_weight))
        for dense_weight in weights
        for sparse_weight in weights
        if dense_weight > 0 or sparse_weight > 0
    ]
    total_configs = len(weight_configs)  # counts only configs actually run

    # Metric used to pick the winner; fall back when MRR@4 is not computed
    selection_metric = 'MRR@4' if 4 in k_values else f'MRR@{max(k_values)}'

    results = {}
    best_config = {
        "weights": None,
        "metrics": None,
        "best_mrr": 0.0
    }

    # Process a sample of questions for efficiency. Drawn once so that every
    # configuration is judged on the same questions.
    sample_size = min(50, len(self.df))
    sample_df = self.df.sample(sample_size)

    for config_count, weight_config in enumerate(weight_configs, start=1):
        dense_weight, sparse_weight = weight_config
        print(f"\nTesting weights {config_count}/{total_configs}: "
              f"Dense={dense_weight:.2f}, Sparse={sparse_weight:.2f}")

        all_metrics = []

        for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df), 
                           desc="Evaluating weights"):
            question = row['question']
            relevant_section = row['section_content']
            relevant_id = self.section_id_map[relevant_section]

            # Get search results with current weights
            search_results = self.hybrid_search.weighted_hybrid_search(
                question,
                dense_weight=dense_weight,
                sparse_weight=sparse_weight,
                limit=max(k_values)
            )
            retrieved_ids = [result['id'] for result in search_results]

            # Calculate metrics for current query
            query_metrics = {}
            for k in k_values:
                query_metrics[f'Recall@{k}'] = self.metrics_calculator.calculate_recall_at_k(
                    [relevant_id], retrieved_ids, k)
                query_metrics[f'MRR@{k}'] = self.metrics_calculator.calculate_mrr_at_k(
                    [relevant_id], retrieved_ids, k)

            all_metrics.append(query_metrics)

        # Calculate average metrics
        average_metrics = self.metrics_calculator.compute_average_metrics(all_metrics)
        results[weight_config] = average_metrics

        # Print current results
        for metric_name, value in sorted(average_metrics.items()):
            print(f"{metric_name}: {value:.4f}")

        # The first evaluated config always becomes the provisional best, so
        # a winner exists even when every score is 0.0; later configs must
        # be strictly better to replace it.
        score = average_metrics.get(selection_metric, 0.0)
        if best_config['weights'] is None or score > best_config['best_mrr']:
            best_config['best_mrr'] = score
            best_config['weights'] = weight_config
            best_config['metrics'] = average_metrics

    # Print the best configuration
    print("\n===== Best Weight Configuration =====")
    if best_config['weights'] is None:
        print("No weight configuration was evaluated.")
    else:
        print(f"Dense Weight: {best_config['weights'][0]:.2f}")
        print(f"Sparse Weight: {best_config['weights'][1]:.2f}")
        print("Metrics:")
        for metric_name, value in sorted(best_config['metrics'].items()):
            print(f"{metric_name}: {value:.4f}")

    return {"results": results, "best": best_config}

In [52]:
def find_optimal_weights(self, k_values: List[int] = [1, 4, 6], 
                        weight_steps: int = 5) -> Dict[str, Any]:
    """
    Perform grid search to find optimal weights for hybrid search.

    Every (dense, sparse) pair on an evenly spaced grid over [0, 1] is
    evaluated, except (0, 0). All configurations are scored on the same
    reproducible sample (random_state=42) of at most 50 rows of ``self.df``.
    Queries that return no results count as 0.0 for every metric.

    Args:
        k_values: List of k values for evaluation metrics
        weight_steps: Number of steps for grid search (from 0 to 1)

    Returns:
        Dictionary with:
          - "results": {(dense_weight, sparse_weight): averaged metrics}
          - "best": {"weights", "metrics", "best_mrr"} of the winning
            configuration. Selection uses MRR@4 when 4 is in ``k_values``,
            otherwise MRR at the largest k. "weights" and "metrics" are
            None when no configuration was evaluated.
    """
    if not self.is_initialized:
        self.initialize_database()

    print("\n===== Finding optimal weights for hybrid search =====")

    # Generate weight combinations; (0, 0) is excluded because at least one
    # weight must be positive. Converted to plain floats explicitly.
    # NOTE: weighted_hybrid_search normalizes the weights, so pairs with the
    # same ratio (e.g. 0.2/0.2 and 0.4/0.4) rank identically.
    weights = np.linspace(0, 1, weight_steps + 1)
    weight_configs = [
        (float(dense_weight), float(sparse_weight))
        for dense_weight in weights
        for sparse_weight in weights
        if dense_weight > 0 or sparse_weight > 0
    ]
    total_configs = len(weight_configs)  # counts only configs actually run

    # Metric used to pick the winner; fall back when MRR@4 is not computed
    selection_metric = 'MRR@4' if 4 in k_values else f'MRR@{max(k_values)}'

    results = {}
    best_config = {
        "weights": None,
        "metrics": None,
        "best_mrr": 0.0
    }

    # Process a sample of questions for efficiency. The seed is fixed, so the
    # sample is the same for every configuration and is drawn only once.
    sample_size = min(50, len(self.df))
    sample_df = self.df.sample(sample_size, random_state=42)

    for config_count, weight_config in enumerate(weight_configs, start=1):
        dense_weight, sparse_weight = weight_config
        print(f"\nTesting weights {config_count}/{total_configs}: "
              f"Dense={dense_weight:.2f}, Sparse={sparse_weight:.2f}")

        all_metrics = []

        for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df), 
                           desc="Evaluating weights"):
            question = row['question']
            relevant_section = row['section_content']
            relevant_id = self.section_id_map[relevant_section]

            # Get search results with current weights
            search_results = self.hybrid_search.weighted_hybrid_search(
                question,
                dense_weight=dense_weight,
                sparse_weight=sparse_weight,
                limit=max(k_values)
            )

            if not search_results:
                # Empty results, count as 0 for all metrics
                query_metrics = {f'Recall@{k}': 0.0 for k in k_values}
                query_metrics.update({f'MRR@{k}': 0.0 for k in k_values})
            else:
                retrieved_ids = [result['id'] for result in search_results]

                # Calculate metrics for current query
                query_metrics = {}
                for k in k_values:
                    query_metrics[f'Recall@{k}'] = self.metrics_calculator.calculate_recall_at_k(
                        [relevant_id], retrieved_ids, k)
                    query_metrics[f'MRR@{k}'] = self.metrics_calculator.calculate_mrr_at_k(
                        [relevant_id], retrieved_ids, k)

            all_metrics.append(query_metrics)

        # Calculate average metrics
        average_metrics = self.metrics_calculator.compute_average_metrics(all_metrics)
        results[weight_config] = average_metrics

        # Print current results
        for metric_name, value in sorted(average_metrics.items()):
            print(f"{metric_name}: {value:.4f}")

        # The first evaluated config always becomes the provisional best, so
        # a winner exists even when every score is 0.0; later configs must
        # be strictly better to replace it.
        score = average_metrics.get(selection_metric, 0.0)
        if best_config['weights'] is None or score > best_config['best_mrr']:
            best_config['best_mrr'] = score
            best_config['weights'] = weight_config
            best_config['metrics'] = average_metrics

    # Print the best configuration
    print("\n===== Best Weight Configuration =====")
    if best_config['weights'] is None:
        print("No weight configuration was evaluated.")
    else:
        print(f"Dense Weight: {best_config['weights'][0]:.2f}")
        print(f"Sparse Weight: {best_config['weights'][1]:.2f}")
        print("Metrics:")
        for metric_name, value in sorted(best_config['metrics'].items()):
            print(f"{metric_name}: {value:.4f}")

    return {"results": results, "best": best_config}

In [53]:
def main():
    """Run hybrid search evaluation experiments.

    Steps: evaluate the built-in hybrid methods, compare them with the
    baseline, grid-search the dense/sparse weights, re-evaluate the best
    weights on the full dataset, and save everything to
    "hybrid_search_results.csv". The weighted re-evaluation is skipped when
    the grid search reports no best weights.
    """
    print("===== Hybrid Search Evaluation =====")
    evaluator = HybridSearchEvaluator('qdrant_documentation_dataset.csv')

    # Single source of truth for the cut-offs used by every evaluation below
    k_values = [1, 4, 6]

    # Evaluate standard hybrid search methods
    hybrid_results = evaluator.evaluate_hybrid_search(
        hybrid_methods=["fusion", "dense_rerank", "sparse_rerank"],
        k_values=list(k_values)
    )

    # Compare with baseline
    evaluator.compare_with_baseline(hybrid_results)

    # Find optimal weights with grid search
    grid_results = evaluator.find_optimal_weights(
        k_values=list(k_values),
        weight_steps=5
    )

    best_weights = grid_results["best"]["weights"]
    if best_weights is None:
        # Nothing to unpack; keep the results gathered so far
        print("\nNo best weight configuration found; skipping weighted evaluation.")
    else:
        # Evaluate with the best weights
        best_dense_weight, best_sparse_weight = best_weights
        print("\n===== Evaluating with best weights =====")

        all_metrics = []

        # Process all questions with the best weights
        for idx, row in tqdm(evaluator.df.iterrows(), total=len(evaluator.df), 
                           desc="Evaluating best weights"):
            question = row['question']
            relevant_section = row['section_content']
            relevant_id = evaluator.section_id_map[relevant_section]

            # Get search results with optimal weights
            search_results = evaluator.hybrid_search.weighted_hybrid_search(
                question,
                dense_weight=best_dense_weight,
                sparse_weight=best_sparse_weight,
                limit=max(k_values)
            )

            if not search_results:
                # Empty results, count as 0 for all metrics
                query_metrics = {f'Recall@{k}': 0.0 for k in k_values}
                query_metrics.update({f'MRR@{k}': 0.0 for k in k_values})
            else:
                retrieved_ids = [result['id'] for result in search_results]

                # Calculate metrics for current query
                query_metrics = {}
                for k in k_values:
                    query_metrics[f'Recall@{k}'] = evaluator.metrics_calculator.calculate_recall_at_k(
                        [relevant_id], retrieved_ids, k)
                    query_metrics[f'MRR@{k}'] = evaluator.metrics_calculator.calculate_mrr_at_k(
                        [relevant_id], retrieved_ids, k)

            all_metrics.append(query_metrics)

        # Calculate average metrics for best weights
        best_weights_metrics = evaluator.metrics_calculator.compute_average_metrics(all_metrics)

        # Add to comparison
        hybrid_results['weighted_optimal'] = best_weights_metrics

        # Compare all results
        evaluator.compare_with_baseline(hybrid_results)

    # Save results to CSV
    save_results_to_csv(hybrid_results, grid_results, "hybrid_search_results.csv")

# Run the full evaluation only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

===== Hybrid Search Evaluation =====
Loaded 328 records from dataset.
Found 121 unique chunks for indexing.
Processing batch 1/3
Processing batch 2/3
Processing batch 3/3
Database successfully initialized.

===== Evaluating hybrid method: fusion =====


  query = query / np.linalg.norm(query)
Evaluating fusion: 100%|██████████| 328/328 [01:24<00:00,  3.90it/s]


MRR@1: 0.6463
MRR@4: 0.7508
MRR@6: 0.7581
Recall@1: 0.6463
Recall@4: 0.8811
Recall@6: 0.9207

===== Evaluating hybrid method: dense_rerank =====


Evaluating dense_rerank: 100%|██████████| 328/328 [00:35<00:00,  9.35it/s]


MRR@1: 0.0000
MRR@4: 0.0000
MRR@6: 0.0000
Recall@1: 0.0000
Recall@4: 0.0000
Recall@6: 0.0000

===== Evaluating hybrid method: sparse_rerank =====


Evaluating sparse_rerank: 100%|██████████| 328/328 [00:33<00:00,  9.66it/s]


MRR@1: 0.0000
MRR@4: 0.0000
MRR@6: 0.0000
Recall@1: 0.0000
Recall@4: 0.0000
Recall@6: 0.0000

===== Comparison with baseline =====
Metric     | dense_rerank     | fusion           | sparse_rerank    | Best from previous experiments
----------------------------------------------------------------------------------------------------
MRR@1      | 0.0000 ↓ 100.0% | 0.6463 ↑ 3.4% | 0.0000 ↓ 100.0% | 0.6250 ↓ 0.0%
MRR@4      | 0.0000 ↓ 100.0% | 0.7508 ↓ 3.6% | 0.0000 ↓ 100.0% | 0.7790 ↓ 0.0%
MRR@6      | 0.0000 ↓ 100.0% | 0.7581 ↓ 3.3% | 0.0000 ↓ 100.0% | 0.7836 ↓ 0.0%
Recall@1   | 0.0000 ↓ 100.0% | 0.6463 ↓ 9.0% | 0.0000 ↓ 100.0% | 0.7104 ↓ 0.0%
Recall@4   | 0.0000 ↓ 100.0% | 0.8811 ↑ 0.4% | 0.0000 ↓ 100.0% | 0.8780 ↓ 0.0%
Recall@6   | 0.0000 ↓ 100.0% | 0.9207 ↑ 2.0% | 0.0000 ↓ 100.0% | 0.9024 ↓ 0.0%

===== Best Methods =====
MRR@1: fusion (0.6463, +3.4%)
MRR@4: Best from previous experiments (0.7790, 0.0%)
MRR@6: Best from previous experiments (0.7836, 0.0%)
Recall@1: Best from previous ex

Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.79it/s]


MRR@1: 0.5200
MRR@4: 0.5950
MRR@6: 0.6023
Recall@1: 0.5200
Recall@4: 0.7200
Recall@6: 0.7600

Testing weights 2/36: Dense=0.00, Sparse=0.40


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.13it/s]


MRR@1: 0.5200
MRR@4: 0.5950
MRR@6: 0.6023
Recall@1: 0.5200
Recall@4: 0.7200
Recall@6: 0.7600

Testing weights 3/36: Dense=0.00, Sparse=0.60


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.42it/s]


MRR@1: 0.5200
MRR@4: 0.5950
MRR@6: 0.6023
Recall@1: 0.5200
Recall@4: 0.7200
Recall@6: 0.7600

Testing weights 4/36: Dense=0.00, Sparse=0.80


Evaluating weights: 100%|██████████| 50/50 [00:04<00:00, 10.33it/s]


MRR@1: 0.5200
MRR@4: 0.5950
MRR@6: 0.6023
Recall@1: 0.5200
Recall@4: 0.7200
Recall@6: 0.7600

Testing weights 5/36: Dense=0.00, Sparse=1.00


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.81it/s]


MRR@1: 0.5200
MRR@4: 0.5950
MRR@6: 0.6023
Recall@1: 0.5200
Recall@4: 0.7200
Recall@6: 0.7600

Testing weights 6/36: Dense=0.20, Sparse=0.00


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.96it/s]


MRR@1: 0.6000
MRR@4: 0.6867
MRR@6: 0.6907
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 7/36: Dense=0.20, Sparse=0.20


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.53it/s]


MRR@1: 0.6400
MRR@4: 0.7050
MRR@6: 0.7090
Recall@1: 0.6400
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 8/36: Dense=0.20, Sparse=0.40


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.64it/s]


MRR@1: 0.5800
MRR@4: 0.6650
MRR@6: 0.6683
Recall@1: 0.5800
Recall@4: 0.7800
Recall@6: 0.8000

Testing weights 9/36: Dense=0.20, Sparse=0.60


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.78it/s]


MRR@1: 0.5200
MRR@4: 0.6350
MRR@6: 0.6423
Recall@1: 0.5200
Recall@4: 0.7800
Recall@6: 0.8200

Testing weights 10/36: Dense=0.20, Sparse=0.80


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.56it/s]


MRR@1: 0.5400
MRR@4: 0.6433
MRR@6: 0.6473
Recall@1: 0.5400
Recall@4: 0.7800
Recall@6: 0.8000

Testing weights 11/36: Dense=0.20, Sparse=1.00


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.13it/s]


MRR@1: 0.5200
MRR@4: 0.6317
MRR@6: 0.6357
Recall@1: 0.5200
Recall@4: 0.7800
Recall@6: 0.8000

Testing weights 12/36: Dense=0.40, Sparse=0.00


Evaluating weights: 100%|██████████| 50/50 [00:04<00:00, 10.15it/s]


MRR@1: 0.6000
MRR@4: 0.6867
MRR@6: 0.6907
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 13/36: Dense=0.40, Sparse=0.20


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.71it/s]


MRR@1: 0.6200
MRR@4: 0.7050
MRR@6: 0.7090
Recall@1: 0.6200
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 14/36: Dense=0.40, Sparse=0.40


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.29it/s]


MRR@1: 0.6400
MRR@4: 0.7050
MRR@6: 0.7090
Recall@1: 0.6400
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 15/36: Dense=0.40, Sparse=0.60


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.40it/s]


MRR@1: 0.5800
MRR@4: 0.6667
MRR@6: 0.6733
Recall@1: 0.5800
Recall@4: 0.7800
Recall@6: 0.8200

Testing weights 16/36: Dense=0.40, Sparse=0.80


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.98it/s]


MRR@1: 0.5800
MRR@4: 0.6650
MRR@6: 0.6683
Recall@1: 0.5800
Recall@4: 0.7800
Recall@6: 0.8000

Testing weights 17/36: Dense=0.40, Sparse=1.00


Evaluating weights: 100%|██████████| 50/50 [00:04<00:00, 10.27it/s]


MRR@1: 0.5600
MRR@4: 0.6550
MRR@6: 0.6623
Recall@1: 0.5600
Recall@4: 0.7800
Recall@6: 0.8200

Testing weights 18/36: Dense=0.60, Sparse=0.00


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.76it/s]


MRR@1: 0.6000
MRR@4: 0.6867
MRR@6: 0.6907
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 19/36: Dense=0.60, Sparse=0.20


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.32it/s]


MRR@1: 0.6600
MRR@4: 0.7350
MRR@6: 0.7350
Recall@1: 0.6600
Recall@4: 0.8200
Recall@6: 0.8200

Testing weights 20/36: Dense=0.60, Sparse=0.40


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.34it/s]


MRR@1: 0.6000
MRR@4: 0.6883
MRR@6: 0.6923
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 21/36: Dense=0.60, Sparse=0.60


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.67it/s]


MRR@1: 0.6400
MRR@4: 0.7050
MRR@6: 0.7090
Recall@1: 0.6400
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 22/36: Dense=0.60, Sparse=0.80


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.59it/s]


MRR@1: 0.6000
MRR@4: 0.6767
MRR@6: 0.6840
Recall@1: 0.6000
Recall@4: 0.7800
Recall@6: 0.8200

Testing weights 23/36: Dense=0.60, Sparse=1.00


Evaluating weights: 100%|██████████| 50/50 [00:04<00:00, 10.29it/s]


MRR@1: 0.5800
MRR@4: 0.6667
MRR@6: 0.6700
Recall@1: 0.5800
Recall@4: 0.7800
Recall@6: 0.8000

Testing weights 24/36: Dense=0.80, Sparse=0.00


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.57it/s]


MRR@1: 0.6000
MRR@4: 0.6867
MRR@6: 0.6907
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 25/36: Dense=0.80, Sparse=0.20


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.95it/s]


MRR@1: 0.6600
MRR@4: 0.7383
MRR@6: 0.7417
Recall@1: 0.6600
Recall@4: 0.8400
Recall@6: 0.8600

Testing weights 26/36: Dense=0.80, Sparse=0.40


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.87it/s]


MRR@1: 0.6200
MRR@4: 0.7050
MRR@6: 0.7090
Recall@1: 0.6200
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 27/36: Dense=0.80, Sparse=0.60


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.03it/s]


MRR@1: 0.6000
MRR@4: 0.6883
MRR@6: 0.6923
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 28/36: Dense=0.80, Sparse=0.80


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.12it/s]


MRR@1: 0.6400
MRR@4: 0.7050
MRR@6: 0.7090
Recall@1: 0.6400
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 29/36: Dense=0.80, Sparse=1.00


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.27it/s]


MRR@1: 0.6200
MRR@4: 0.6900
MRR@6: 0.6973
Recall@1: 0.6200
Recall@4: 0.7800
Recall@6: 0.8200

Testing weights 30/36: Dense=1.00, Sparse=0.00


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.52it/s]


MRR@1: 0.6000
MRR@4: 0.6867
MRR@6: 0.6907
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 31/36: Dense=1.00, Sparse=0.20


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.80it/s]


MRR@1: 0.6400
MRR@4: 0.7250
MRR@6: 0.7290
Recall@1: 0.6400
Recall@4: 0.8400
Recall@6: 0.8600

Testing weights 32/36: Dense=1.00, Sparse=0.40


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.15it/s]


MRR@1: 0.6400
MRR@4: 0.7250
MRR@6: 0.7250
Recall@1: 0.6400
Recall@4: 0.8200
Recall@6: 0.8200

Testing weights 33/36: Dense=1.00, Sparse=0.60


Evaluating weights: 100%|██████████| 50/50 [00:04<00:00, 10.13it/s]


MRR@1: 0.6000
MRR@4: 0.6950
MRR@6: 0.6990
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 34/36: Dense=1.00, Sparse=0.80


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  9.24it/s]


MRR@1: 0.6000
MRR@4: 0.6883
MRR@6: 0.6923
Recall@1: 0.6000
Recall@4: 0.8000
Recall@6: 0.8200

Testing weights 35/36: Dense=1.00, Sparse=1.00


Evaluating weights: 100%|██████████| 50/50 [00:05<00:00,  8.93it/s]


MRR@1: 0.6400
MRR@4: 0.7050
MRR@6: 0.7090
Recall@1: 0.6400
Recall@4: 0.8000
Recall@6: 0.8200

===== Best Weight Configuration =====
Dense Weight: 0.80
Sparse Weight: 0.20
Metrics:
MRR@1: 0.6600
MRR@4: 0.7383
MRR@6: 0.7417
Recall@1: 0.6600
Recall@4: 0.8400
Recall@6: 0.8600

===== Evaluating with best weights =====


Evaluating best weights: 100%|██████████| 328/328 [00:39<00:00,  8.35it/s]


===== Comparison with baseline =====
Metric     | dense_rerank     | fusion           | sparse_rerank    | weighted_optimal | Best from previous experiments
-----------------------------------------------------------------------------------------------------------------------
MRR@1      | 0.0000 ↓ 100.0% | 0.6463 ↑ 3.4% | 0.0000 ↓ 100.0% | 0.7134 ↑ 14.1% | 0.6250 ↓ 0.0%
MRR@4      | 0.0000 ↓ 100.0% | 0.7508 ↓ 3.6% | 0.0000 ↓ 100.0% | 0.7932 ↑ 1.8% | 0.7790 ↓ 0.0%
MRR@6      | 0.0000 ↓ 100.0% | 0.7581 ↓ 3.3% | 0.0000 ↓ 100.0% | 0.8000 ↑ 2.1% | 0.7836 ↓ 0.0%
Recall@1   | 0.0000 ↓ 100.0% | 0.6463 ↓ 9.0% | 0.0000 ↓ 100.0% | 0.7134 ↑ 0.4% | 0.7104 ↓ 0.0%
Recall@4   | 0.0000 ↓ 100.0% | 0.8811 ↑ 0.4% | 0.0000 ↓ 100.0% | 0.8994 ↑ 2.4% | 0.8780 ↓ 0.0%
Recall@6   | 0.0000 ↓ 100.0% | 0.9207 ↑ 2.0% | 0.0000 ↓ 100.0% | 0.9360 ↑ 3.7% | 0.9024 ↓ 0.0%

===== Best Methods =====
MRR@1: weighted_optimal (0.7134, +14.1%)
MRR@4: weighted_optimal (0.7932, +1.8%)
MRR@6: weighted_optimal (0.8000, +2.1%)
Reca


