In [None]:
!pip install torch faiss-cpu sentence-transformers rank_bm25 numpy pandas tqdm matplotlib transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25, faiss-cpu
Successfully installed faiss-cpu-1.12.0 rank_bm25-0.2.2


In [None]:
import os
import time
import json
import math
import random
import tempfile
import argparse
from collections import defaultdict
from functools import partial
from statistics import mean

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Retrieval & modeling libs
from sentence_transformers import SentenceTransformer
import faiss
from rank_bm25 import BM25Okapi
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set a professional plotting style
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# ------------------------------
# Utility: dataset generation
# ------------------------------
def make_limit_dataset(num_docs, vocab_size, doc_length, num_queries, seed=42):
    """
    Build a synthetic dataset with unique relevant documents for queries.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Check for cached dataset
    cache_path = f"dataset_cache_{num_docs}_{vocab_size}_{doc_length}_{num_queries}_{seed}.json"
    if os.path.exists(cache_path):
        print(f"Loading dataset from cache: {cache_path}")
        with open(cache_path, 'r') as f:
            data = json.load(f)
        return data['docs'], data['queries'], data['q_to_relevant_doc']

    print(f"Generating new dataset with {num_docs} docs and {num_queries} queries...")
    vocab = [f"w{i}" for i in range(vocab_size)]
    docs = []
    signature_pool = [f"S{i}" for i in range(int(num_docs * 1.2))]
    random.shuffle(signature_pool)

    for doc_id in range(num_docs):
        tokens = random.choices(vocab, k=max(1, doc_length - 3))
        sign_count = 1 if random.random() < 0.7 else 2
        signature = " ".join(signature_pool[doc_id:doc_id + sign_count])
        tokens += signature.split()
        random.shuffle(tokens)
        docs.append(" ".join(tokens))

    queries = []
    q_to_relevant = []
    for doc_id in range(num_queries):
        doc_tokens = docs[doc_id].split()
        signature_tokens = [t for t in doc_tokens if t.startswith("S")]
        if not signature_tokens:
            signature_tokens = [f"S{doc_id}"]
        distractors = random.sample(vocab, k=10)
        q = " ".join(signature_tokens + distractors[:5])
        queries.append(q)
        q_to_relevant.append(doc_id)

    # Save to cache
    with open(cache_path, 'w') as f:
        json.dump({'docs': docs, 'queries': queries, 'q_to_relevant_doc': q_to_relevant}, f)

    return docs, queries, q_to_relevant

# ------------------------------
# Evaluation metrics
# ------------------------------
def recall_at_k(retrieved_ids, relevant_id, k):
    topk = retrieved_ids[:k]
    return 1.0 if relevant_id in topk else 0.0

def precision_at_k(retrieved_ids, relevant_id, k):
    return 1.0 if relevant_id in retrieved_ids[:k] else 0.0

def mean_reciprocal_rank(retrieved_ids, relevant_id):
    for i, doc_id in enumerate(retrieved_ids):
        if doc_id == relevant_id:
            return 1.0 / (i + 1)
    return 0.0

def average_precision(retrieved_ids, relevant_id):
    if relevant_id not in retrieved_ids:
        return 0.0

    hits = 0
    sum_precisions = 0
    for i, doc_id in enumerate(retrieved_ids):
        if doc_id == relevant_id:
            hits += 1
            sum_precisions += hits / (i + 1.0)
    return sum_precisions / 1.0

# ------------------------------
# Abstract Retriever Classes
# ------------------------------
class BaseRetriever:
    def build_index(self, docs):
        raise NotImplementedError
    def query(self, queries, top_k):
        raise NotImplementedError

class DenseRetriever(BaseRetriever):
    def __init__(self, model_name, device, cache_dir="./embedding_cache"):
        self.model_name = model_name
        self.device = device
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)
        print(f"[DenseRetriever] loading model {model_name} on {self.device}")
        self.model = SentenceTransformer(model_name, device=self.device)
        self.index = None
        self.doc_embeddings = None
        self.doc_count = 0

    def build_index(self, docs, index_factory="IVF1024,Flat", use_gpu=False):
        cache_file = os.path.join(self.cache_dir, f"dense_{len(docs)}.npy")
        if os.path.exists(cache_file):
            print(f"[DenseRetriever] Loading embeddings from cache: {cache_file}")
            self.doc_embeddings = np.load(cache_file)
        else:
            print("[DenseRetriever] encoding docs...")
            self.doc_embeddings = np.array(self.model.encode(docs, show_progress_bar=True, batch_size=256))
            np.save(cache_file, self.doc_embeddings)

        self.doc_count = len(docs)
        d = self.doc_embeddings.shape[1]
        print(f"[DenseRetriever] embeddings shape: {self.doc_embeddings.shape}")

        t0 = time.time()
        print(f"[DenseRetriever] building faiss index - {index_factory}")
        self.index = faiss.index_factory(d, index_factory)
        if use_gpu and faiss.get_num_gpus() > 0:
            res = faiss.StandardGpuResources()
            self.index = faiss.index_cpu_to_gpu(res, 0, self.index)

        if not self.index.is_trained:
            print("[DenseRetriever] training faiss index ...")
            self.index.train(self.doc_embeddings)
        self.index.add(self.doc_embeddings)
        print(f"[DenseRetriever] added {self.index.ntotal} vectors")

        indexing_time = time.time() - t0
        index_size_mb = self.index.ntotal * self.doc_embeddings.itemsize * self.doc_embeddings.shape[1] / (1024 * 1024)
        return indexing_time, index_size_mb

    def query(self, queries, top_k=100):
        q_emb = np.array(self.model.encode(queries, show_progress_bar=False))
        D, I = self.index.search(q_emb, top_k)
        return I.tolist(), D.tolist()

class MultiVectorRetriever(BaseRetriever):
    def __init__(self, model_name, device, chunk_size, cache_dir="./embedding_cache"):
        self.chunk_size = chunk_size
        self.model = SentenceTransformer(model_name, device=device)
        self.index = None
        self.vector_to_doc = []
        self.doc_count = 0
        self.vectors = None
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

    def _chunk_doc(self, doc):
        tokens = doc.split()
        chunks = []
        for i in range(0, len(tokens), self.chunk_size):
            chunks.append(" ".join(tokens[i:i + self.chunk_size]))
        return chunks if chunks else [doc]

    def build_index(self, docs, index_factory="IVF1024,Flat"):
        cache_file = os.path.join(self.cache_dir, f"multivector_{len(docs)}.npy")
        map_cache_file = os.path.join(self.cache_dir, f"multivector_map_{len(docs)}.npy")

        if os.path.exists(cache_file) and os.path.exists(map_cache_file):
            print(f"[MultiVector] Loading embeddings from cache: {cache_file}")
            self.vectors = np.load(cache_file)
            self.vector_to_doc = np.load(map_cache_file).tolist()
        else:
            print("[MultiVector] creating chunks and embeddings...")
            all_chunks = []
            self.vector_to_doc = []
            for doc_id, doc in enumerate(docs):
                chunks = self._chunk_doc(doc)
                for ch in chunks:
                    all_chunks.append(ch)
                    self.vector_to_doc.append(doc_id)

            self.vectors = np.array(self.model.encode(all_chunks, show_progress_bar=True, batch_size=256))
            np.save(cache_file, self.vectors)
            np.save(map_cache_file, np.array(self.vector_to_doc))

        d = self.vectors.shape[1]
        t0 = time.time()
        self.index = faiss.index_factory(d, index_factory)
        if not self.index.is_trained:
            self.index.train(self.vectors)
        self.index.add(self.vectors)
        self.doc_count = len(docs)
        print(f"[MultiVector] added {self.index.ntotal} vectors")

        indexing_time = time.time() - t0
        index_size_mb = self.index.ntotal * self.vectors.itemsize * self.vectors.shape[1] / (1024 * 1024)
        return indexing_time, index_size_mb

    def query(self, queries, top_k=200):
        q_emb = np.array(self.model.encode(queries, show_progress_bar=False))
        D, I = self.index.search(q_emb, top_k)
        results = []
        for row in I:
            doc_scores = {}
            for vid in row:
                if vid < 0:
                    continue
                doc_id = self.vector_to_doc[vid]
                doc_scores.setdefault(doc_id, 0)
                doc_scores[doc_id] += 1
            ranked_docs = [doc for doc, _ in sorted(doc_scores.items(), key=lambda x: -x[1])]
            results.append(ranked_docs)
        return results

class BM25Retriever(BaseRetriever):
    def __init__(self):
        self.bm25 = None
        self.docs_tokenized = None

    def build_index(self, docs):
        t0 = time.time()
        tokenized = [doc.split() for doc in docs]
        self.docs_tokenized = tokenized
        self.bm25 = BM25Okapi(tokenized)
        indexing_time = time.time() - t0
        avg_doc_len = np.mean([len(doc) for doc in tokenized])
        index_size_mb = len(docs) * avg_doc_len * 4 / (1024 * 1024)
        return indexing_time, index_size_mb

    def query(self, queries, top_k=100):
        all_ids = []
        for q in tqdm(queries, desc="BM25 Querying"):
            q_tok = q.split()
            scores = self.bm25.get_scores(q_tok)
            top_idxs = np.argsort(scores)[::-1][:top_k]
            all_ids.append(top_idxs.tolist())
        return all_ids

# ------------------------------
# Cross-encoder re-ranker
# ------------------------------
class CrossEncoderReranker:
    def __init__(self, model, device):
        self.device = device
        print(f"[CrossEncoder] loading {model} on {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForSequenceClassification.from_pretrained(model).to(self.device)

    def score_pairs(self, queries, candidate_texts, batch_size=32):
        results = []
        for q, cands in tqdm(zip(queries, candidate_texts), total=len(queries), desc="Reranking"):
            pairs = [(q, cand) for cand in cands]
            scores = []
            for i in range(0, len(pairs), batch_size):
                batch = pairs[i:i+batch_size]
                texts_a = [p[0] for p in batch]
                texts_b = [p[1] for p in batch]
                encoded = self.tokenizer(texts_a, texts_b, padding=True, truncation=True, return_tensors="pt").to(self.device)
                with torch.no_grad():
                    out = self.model(**encoded)
                    logits = out.logits.squeeze(-1).cpu().numpy()
                    scores.extend(logits.tolist())
            ranked = [idx for idx, _ in sorted(enumerate(scores), key=lambda x: -x[1])]
            results.append(ranked)
        return results

# ------------------------------
# Main Experimental Framework
# ------------------------------
def run_experiments(config, device):
    os.makedirs(config.save_dir, exist_ok=True)
    rows = []

    device_name = "GPU" if device == "cuda" else "CPU"
    print(f"Running experiments on {device_name}")

    for corpus_size in config.corpus_sizes:
        print(f"\n=== Running corpus size {corpus_size} ===")
        docs, all_queries, q_to_relevant = make_limit_dataset(
            num_docs=corpus_size,
            vocab_size=config.vocab_size,
            doc_length=config.doc_length,
            num_queries=config.num_queries
        )

        queries = all_queries[:config.num_queries]
        q_to_relevant = q_to_relevant[:config.num_queries]

        # BM25 Experiment
        bm25 = BM25Retriever()
        index_time, index_size = bm25.build_index(docs)
        retrieved_ids = bm25.query(queries, top_k=max(config.topk_list))
        stats = evaluate_retrieval("bm25", retrieved_ids, docs, queries, q_to_relevant, config.topk_list)
        stats.update({"corpus_size": corpus_size, "index_time_sec": index_time, "index_size_mb": index_size, "device": device_name})
        rows.append(stats)
        print("BM25 done:", stats)

        # Dense Retrieval Experiment
        dense = DenseRetriever(model_name=config.model_name, device=device)
        if corpus_size >= 100000:
            idx_factory = "IVF4096,Flat"
        elif corpus_size >= 50000:
            idx_factory = "IVF1024,Flat"
        else:
            idx_factory = "Flat"
        index_time, index_size = dense.build_index(docs, index_factory=idx_factory, use_gpu=(device=="cuda"))
        retrieved_ids_dense, _ = dense.query(queries, top_k=max(config.topk_list))
        stats = evaluate_retrieval("dense_faiss", retrieved_ids_dense, docs, queries, q_to_relevant, config.topk_list)
        stats.update({"corpus_size": corpus_size, "embedding_dim": dense.doc_embeddings.shape[1], "index_factory": idx_factory, "index_time_sec": index_time, "index_size_mb": index_size, "device": device_name})
        rows.append(stats)
        print("Dense done:", stats)

        # Multi-vector Experiment
        mvr = MultiVectorRetriever(model_name=config.model_name, device=device, chunk_size=config.chunk_size)
        index_time, index_size = mvr.build_index(docs, index_factory=idx_factory)
        retrieved_ids_mvr = mvr.query(queries, top_k=max(config.topk_list))
        stats = evaluate_retrieval("multi_vector", retrieved_ids_mvr, docs, queries, q_to_relevant, config.topk_list)
        stats.update({"corpus_size": corpus_size, "chunk_size": config.chunk_size, "index_time_sec": index_time, "index_size_mb": index_size, "device": device_name})
        rows.append(stats)
        print("Multi-vector done:", stats)

        # Dense with Cross-encoder Reranking Experiment
        if config.use_cross_encoder:
            cross_reranker = CrossEncoderReranker(model=config.cross_model, device=device)
            candidate_ids, _ = dense.query(queries, top_k=100)

            candidate_texts = [[docs[cid] for cid in row if isinstance(cid, int)] for row in candidate_ids]

            start_rerank = time.time()
            reranked_positions = cross_reranker.score_pairs(queries, candidate_texts)
            rerank_lat = (time.time() - start_rerank) / len(queries)

            mapping = {docs[i]: i for i in range(len(docs))}
            reranked_doc_ids = []
            for original_cands, ranks in zip(candidate_texts, reranked_positions):
                ordered = [mapping[original_cands[i]] for i in ranks]
                reranked_doc_ids.append(ordered)

            stats = evaluate_retrieval("dense_cross_rerank", reranked_doc_ids, docs, queries, q_to_relevant, config.topk_list)
            stats.update({"corpus_size": corpus_size, "avg_rerank_latency_sec": rerank_lat, "device": device_name})
            rows.append(stats)
            print("Cross-encoder done:", stats)

        df = pd.DataFrame(rows)
        csv_path = os.path.join(config.save_dir, "results_intermediate.csv")
        df.to_csv(csv_path, index=False)
        print(f"[Main] intermediate results saved to {csv_path}")

    final_csv = os.path.join(config.save_dir, "results_final.csv")
    pd.DataFrame(rows).to_csv(final_csv, index=False)
    print(f"[Main] final results saved to {final_csv}")
    return rows

def evaluate_retrieval(method_name, retrieved_ids, docs, queries, q_to_relevant, topk_list):
    """Evaluates a single retrieval method."""
    qlat = 0.0 # This needs to be calculated in the query() method.
    stats = {"method": method_name, "num_queries": len(queries)}

    recall = {k: [] for k in topk_list}
    precision = {k: [] for k in topk_list}
    mrr_list = []
    ap_list = []

    for q_idx, rlist in enumerate(retrieved_ids):
        rel = q_to_relevant[q_idx]
        if not rlist: # Handle empty results
            for k in topk_list:
                recall[k].append(0.0)
                precision[k].append(0.0)
            mrr_list.append(0.0)
            ap_list.append(0.0)
            continue

        mrr_list.append(mean_reciprocal_rank(rlist, rel))
        ap_list.append(average_precision(rlist, rel))
        for k in topk_list:
            recall[k].append(recall_at_k(rlist, rel, k))
            precision[k].append(precision_at_k(rlist, rel, k))

    for k in topk_list:
        stats[f"recall@{k}"] = float(np.mean(recall[k]))
        stats[f"precision@{k}"] = float(np.mean(precision[k]))
    stats["mrr"] = float(np.mean(mrr_list))
    stats["map"] = float(np.mean(ap_list))

    return stats


def plot_results(csv_path, save_dir):
    df = pd.read_csv(csv_path)
    os.makedirs(save_dir, exist_ok=True)

    # Plotting Recall@10
    pivot = df.pivot_table(index="corpus_size", columns="method", values="recall@10")
    pivot.plot(marker="o", title="Recall@10 vs Corpus Size")
    plt.xlabel("Corpus Size")
    plt.ylabel("Recall@10")
    plt.grid(True)
    plt.savefig(os.path.join(save_dir, "recall_at_10.png"))
    plt.close()

    # Plotting MRR
    pivot_mrr = df.pivot_table(index="corpus_size", columns="method", values="mrr")
    pivot_mrr.plot(marker="o", title="MRR vs Corpus Size")
    plt.xlabel("Corpus Size")
    plt.ylabel("MRR")
    plt.grid(True)
    plt.savefig(os.path.join(save_dir, "mrr.png"))
    plt.close()

    # Plotting MAP
    pivot_map = df.pivot_table(index="corpus_size", columns="method", values="map")
    pivot_map.plot(marker="o", title="MAP vs Corpus Size")
    plt.xlabel("Corpus Size")
    plt.ylabel("MAP")
    plt.grid(True)
    plt.savefig(os.path.join(save_dir, "map.png"))
    plt.close()

    # Plotting Indexing Time
    pivot_idx_time = df.pivot_table(index="corpus_size", columns="method", values="index_time_sec")
    pivot_idx_time.plot(marker="o", title="Indexing Time (s)")
    plt.xlabel("Corpus Size")
    plt.ylabel("Seconds")
    plt.grid(True)
    plt.savefig(os.path.join(save_dir, "indexing_time.png"))
    plt.close()

    # Plotting Index Size
    pivot_idx_size = df.pivot_table(index="corpus_size", columns="method", values="index_size_mb")
    pivot_idx_size.plot(marker="o", title="Index Size (MB)")
    plt.xlabel("Corpus Size")
    plt.ylabel("Megabytes")
    plt.grid(True)
    plt.savefig(os.path.join(save_dir, "index_size.png"))
    plt.close()

    print(f"[plot_results] saved plots to {save_dir}")

def parse_args():
    parser = argparse.ArgumentParser(description="Run a set of retrieval experiments on a synthetic dataset.")
    parser.add_argument("--corpus_sizes", nargs="+", type=int, default=[1000, 5000, 20000],
                        help="Corpus sizes to run experiments on. e.g., --corpus_sizes 1000 5000")
    parser.add_argument("--num_queries", type=int, default=100,
                        help="Number of queries to use for evaluation.")
    parser.add_argument("--model_name", type=str, default="sentence-transformers/all-MiniLM-L6-v2",
                        help="Sentence-Transformer model to use for dense and multi-vector methods.")
    parser.add_argument("--cross_model", type=str, default="cross-encoder/ms-marco-MiniLM-L-6-v2",
                        help="Cross-encoder model for re-ranking.")
    parser.add_argument("--use_cross_encoder", action="store_true",
                        help="Whether to run the cross-encoder re-ranking experiment.")
    parser.add_argument("--save_dir", type=str, default="./experiment_results_alice",
                        help="Directory to save results and plots.")
    parser.add_argument("--doc_length", type=int, default=32,
                        help="Length of each synthetic document.")
    parser.add_argument("--vocab_size", type=int, default=20000,
                        help="Size of the synthetic vocabulary.")
    parser.add_argument("--chunk_size", type=int, default=8,
                        help="Chunk size for the multi-vector method.")
    parser.add_argument("--topk_list", nargs="+", type=int, default=[1, 2, 5, 10, 100],
                        help="List of k values for recall@k evaluation.")
    return parser.parse_args()

def main():
    # Use a simple object to hold your arguments, mimicking argparse
    class Args:
        def __init__(self):
            self.corpus_sizes = [500, 1000, 5000,10000,50000, 100000]
            self.num_queries = 100
            self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
            self.cross_model = "cross-encoder/ms-marco-MiniLM-L-6-v2"
            self.use_cross_encoder = True
            self.save_dir = "./experiment_results_alice"
            self.doc_length = 32
            self.vocab_size = 20000
            self.chunk_size = 8
            self.topk_list = [1, 2, 5, 10, 100]

    args = Args()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rows = run_experiments(args, device)
    plot_results(os.path.join(args.save_dir, "results_final.csv"), args.save_dir)
    print("Done. Results saved under:", args.save_dir)

if __name__ == "__main__":
    main()


Running experiments on GPU

=== Running corpus size 500 ===
Generating new dataset with 500 docs and 100 queries...


BM25 Querying:   0%|          | 0/100 [00:00<?, ?it/s]

BM25 done: {'method': 'bm25', 'num_queries': 100, 'recall@1': 0.55, 'precision@1': 0.55, 'recall@2': 0.74, 'precision@2': 0.74, 'recall@5': 1.0, 'precision@5': 1.0, 'recall@10': 1.0, 'precision@10': 1.0, 'recall@100': 1.0, 'precision@100': 1.0, 'mrr': 0.7191666666666666, 'map': 0.7191666666666666, 'corpus_size': 500, 'index_time_sec': 0.009723186492919922, 'index_size_mb': np.float64(0.057811737060546875), 'device': 'GPU'}
[DenseRetriever] loading model sentence-transformers/all-MiniLM-L6-v2 on cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[DenseRetriever] encoding docs...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[DenseRetriever] embeddings shape: (500, 384)
[DenseRetriever] building faiss index - Flat
[DenseRetriever] added 500 vectors
Dense done: {'method': 'dense_faiss', 'num_queries': 100, 'recall@1': 0.01, 'precision@1': 0.01, 'recall@2': 0.02, 'precision@2': 0.02, 'recall@5': 0.04, 'precision@5': 0.04, 'recall@10': 0.06, 'precision@10': 0.06, 'recall@100': 0.35, 'precision@100': 0.35, 'mrr': 0.032538946291533505, 'map': 0.032538946291533505, 'corpus_size': 500, 'embedding_dim': 384, 'index_factory': 'Flat', 'index_time_sec': 0.0004248619079589844, 'index_size_mb': 0.732421875, 'device': 'GPU'}
[MultiVector] creating chunks and embeddings...


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

[MultiVector] added 2000 vectors
Multi-vector done: {'method': 'multi_vector', 'num_queries': 100, 'recall@1': 0.02, 'precision@1': 0.02, 'recall@2': 0.04, 'precision@2': 0.04, 'recall@5': 0.19, 'precision@5': 0.19, 'recall@10': 0.31, 'precision@10': 0.31, 'recall@100': 0.7, 'precision@100': 0.7, 'mrr': 0.09591523943945049, 'map': 0.09591523943945049, 'corpus_size': 500, 'chunk_size': 8, 'index_time_sec': 0.0006067752838134766, 'index_size_mb': 2.9296875, 'device': 'GPU'}
[CrossEncoder] loading cross-encoder/ms-marco-MiniLM-L-6-v2 on cuda


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Reranking:   0%|          | 0/100 [00:00<?, ?it/s]

Cross-encoder done: {'method': 'dense_cross_rerank', 'num_queries': 100, 'recall@1': 0.06, 'precision@1': 0.06, 'recall@2': 0.06, 'precision@2': 0.06, 'recall@5': 0.09, 'precision@5': 0.09, 'recall@10': 0.13, 'precision@10': 0.13, 'recall@100': 0.35, 'precision@100': 0.35, 'mrr': 0.079017373099663, 'map': 0.079017373099663, 'corpus_size': 500, 'avg_rerank_latency_sec': 0.13699365854263307, 'device': 'GPU'}
[Main] intermediate results saved to ./experiment_results_alice/results_intermediate.csv

=== Running corpus size 1000 ===
Generating new dataset with 1000 docs and 100 queries...


BM25 Querying:   0%|          | 0/100 [00:00<?, ?it/s]

BM25 done: {'method': 'bm25', 'num_queries': 100, 'recall@1': 0.5, 'precision@1': 0.5, 'recall@2': 0.75, 'precision@2': 0.75, 'recall@5': 0.98, 'precision@5': 0.98, 'recall@10': 1.0, 'precision@10': 1.0, 'recall@100': 1.0, 'precision@100': 1.0, 'mrr': 0.6936666666666667, 'map': 0.6936666666666667, 'corpus_size': 1000, 'index_time_sec': 0.018627643585205078, 'index_size_mb': np.float64(0.11550521850585938), 'device': 'GPU'}
[DenseRetriever] loading model sentence-transformers/all-MiniLM-L6-v2 on cuda
[DenseRetriever] encoding docs...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

[DenseRetriever] embeddings shape: (1000, 384)
[DenseRetriever] building faiss index - Flat
[DenseRetriever] added 1000 vectors
Dense done: {'method': 'dense_faiss', 'num_queries': 100, 'recall@1': 0.02, 'precision@1': 0.02, 'recall@2': 0.03, 'precision@2': 0.03, 'recall@5': 0.05, 'precision@5': 0.05, 'recall@10': 0.05, 'precision@10': 0.05, 'recall@100': 0.13, 'precision@100': 0.13, 'mrr': 0.03474001875317665, 'map': 0.03474001875317665, 'corpus_size': 1000, 'embedding_dim': 384, 'index_factory': 'Flat', 'index_time_sec': 0.0006158351898193359, 'index_size_mb': 1.46484375, 'device': 'GPU'}
[MultiVector] creating chunks and embeddings...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

[MultiVector] added 4000 vectors
Multi-vector done: {'method': 'multi_vector', 'num_queries': 100, 'recall@1': 0.03, 'precision@1': 0.03, 'recall@2': 0.1, 'precision@2': 0.1, 'recall@5': 0.18, 'precision@5': 0.18, 'recall@10': 0.23, 'precision@10': 0.23, 'recall@100': 0.44, 'precision@100': 0.44, 'mrr': 0.09960208544225754, 'map': 0.09960208544225754, 'corpus_size': 1000, 'chunk_size': 8, 'index_time_sec': 0.0013833045959472656, 'index_size_mb': 5.859375, 'device': 'GPU'}
[CrossEncoder] loading cross-encoder/ms-marco-MiniLM-L-6-v2 on cuda


Reranking:   0%|          | 0/100 [00:00<?, ?it/s]

Cross-encoder done: {'method': 'dense_cross_rerank', 'num_queries': 100, 'recall@1': 0.03, 'precision@1': 0.03, 'recall@2': 0.04, 'precision@2': 0.04, 'recall@5': 0.05, 'precision@5': 0.05, 'recall@10': 0.05, 'precision@10': 0.05, 'recall@100': 0.13, 'precision@100': 0.13, 'mrr': 0.04039614156780814, 'map': 0.04039614156780814, 'corpus_size': 1000, 'avg_rerank_latency_sec': 0.17922500848770143, 'device': 'GPU'}
[Main] intermediate results saved to ./experiment_results_alice/results_intermediate.csv

=== Running corpus size 5000 ===
Generating new dataset with 5000 docs and 100 queries...


BM25 Querying:   0%|          | 0/100 [00:00<?, ?it/s]

BM25 done: {'method': 'bm25', 'num_queries': 100, 'recall@1': 0.93, 'precision@1': 0.93, 'recall@2': 0.98, 'precision@2': 0.98, 'recall@5': 1.0, 'precision@5': 1.0, 'recall@10': 1.0, 'precision@10': 1.0, 'recall@100': 1.0, 'precision@100': 1.0, 'mrr': 0.9616666666666666, 'map': 0.9616666666666666, 'corpus_size': 5000, 'index_time_sec': 0.06919598579406738, 'index_size_mb': np.float64(0.5779838562011719), 'device': 'GPU'}
[DenseRetriever] loading model sentence-transformers/all-MiniLM-L6-v2 on cuda
[DenseRetriever] encoding docs...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

[DenseRetriever] embeddings shape: (5000, 384)
[DenseRetriever] building faiss index - Flat
[DenseRetriever] added 5000 vectors
Dense done: {'method': 'dense_faiss', 'num_queries': 100, 'recall@1': 0.0, 'precision@1': 0.0, 'recall@2': 0.0, 'precision@2': 0.0, 'recall@5': 0.01, 'precision@5': 0.01, 'recall@10': 0.02, 'precision@10': 0.02, 'recall@100': 0.05, 'precision@100': 0.05, 'mrr': 0.005360724041136412, 'map': 0.005360724041136412, 'corpus_size': 5000, 'embedding_dim': 384, 'index_factory': 'Flat', 'index_time_sec': 0.0012903213500976562, 'index_size_mb': 7.32421875, 'device': 'GPU'}
[MultiVector] creating chunks and embeddings...


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

[MultiVector] added 20000 vectors
Multi-vector done: {'method': 'multi_vector', 'num_queries': 100, 'recall@1': 0.02, 'precision@1': 0.02, 'recall@2': 0.07, 'precision@2': 0.07, 'recall@5': 0.11, 'precision@5': 0.11, 'recall@10': 0.14, 'precision@10': 0.14, 'recall@100': 0.29, 'precision@100': 0.29, 'mrr': 0.06643031839922188, 'map': 0.06643031839922188, 'corpus_size': 5000, 'chunk_size': 8, 'index_time_sec': 0.02208256721496582, 'index_size_mb': 29.296875, 'device': 'GPU'}
[CrossEncoder] loading cross-encoder/ms-marco-MiniLM-L-6-v2 on cuda


Reranking:   0%|          | 0/100 [00:00<?, ?it/s]

Cross-encoder done: {'method': 'dense_cross_rerank', 'num_queries': 100, 'recall@1': 0.01, 'precision@1': 0.01, 'recall@2': 0.01, 'precision@2': 0.01, 'recall@5': 0.03, 'precision@5': 0.03, 'recall@10': 0.03, 'precision@10': 0.03, 'recall@100': 0.05, 'precision@100': 0.05, 'mrr': 0.015569908814589663, 'map': 0.015569908814589663, 'corpus_size': 5000, 'avg_rerank_latency_sec': 0.14581366300582885, 'device': 'GPU'}
[Main] intermediate results saved to ./experiment_results_alice/results_intermediate.csv

=== Running corpus size 10000 ===
Generating new dataset with 10000 docs and 100 queries...


BM25 Querying:   0%|          | 0/100 [00:00<?, ?it/s]

BM25 done: {'method': 'bm25', 'num_queries': 100, 'recall@1': 0.83, 'precision@1': 0.83, 'recall@2': 0.99, 'precision@2': 0.99, 'recall@5': 1.0, 'precision@5': 1.0, 'recall@10': 1.0, 'precision@10': 1.0, 'recall@100': 1.0, 'precision@100': 1.0, 'mrr': 0.9133333333333333, 'map': 0.9133333333333333, 'corpus_size': 10000, 'index_time_sec': 0.13577628135681152, 'index_size_mb': np.float64(1.1557807922363281), 'device': 'GPU'}
[DenseRetriever] loading model sentence-transformers/all-MiniLM-L6-v2 on cuda
[DenseRetriever] encoding docs...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

[DenseRetriever] embeddings shape: (10000, 384)
[DenseRetriever] building faiss index - Flat
[DenseRetriever] added 10000 vectors
Dense done: {'method': 'dense_faiss', 'num_queries': 100, 'recall@1': 0.0, 'precision@1': 0.0, 'recall@2': 0.01, 'precision@2': 0.01, 'recall@5': 0.02, 'precision@5': 0.02, 'recall@10': 0.02, 'precision@10': 0.02, 'recall@100': 0.06, 'precision@100': 0.06, 'mrr': 0.009931625781625781, 'map': 0.009931625781625781, 'corpus_size': 10000, 'embedding_dim': 384, 'index_factory': 'Flat', 'index_time_sec': 0.004030466079711914, 'index_size_mb': 14.6484375, 'device': 'GPU'}
[MultiVector] creating chunks and embeddings...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

[MultiVector] added 40000 vectors
Multi-vector done: {'method': 'multi_vector', 'num_queries': 100, 'recall@1': 0.02, 'precision@1': 0.02, 'recall@2': 0.04, 'precision@2': 0.04, 'recall@5': 0.05, 'precision@5': 0.05, 'recall@10': 0.1, 'precision@10': 0.1, 'recall@100': 0.22, 'precision@100': 0.22, 'mrr': 0.04336699797116464, 'map': 0.04336699797116464, 'corpus_size': 10000, 'chunk_size': 8, 'index_time_sec': 0.047554731369018555, 'index_size_mb': 58.59375, 'device': 'GPU'}
[CrossEncoder] loading cross-encoder/ms-marco-MiniLM-L-6-v2 on cuda


Reranking:   0%|          | 0/100 [00:00<?, ?it/s]

Cross-encoder done: {'method': 'dense_cross_rerank', 'num_queries': 100, 'recall@1': 0.0, 'precision@1': 0.0, 'recall@2': 0.0, 'precision@2': 0.0, 'recall@5': 0.01, 'precision@5': 0.01, 'recall@10': 0.03, 'precision@10': 0.03, 'recall@100': 0.06, 'precision@100': 0.06, 'mrr': 0.006036647992530345, 'map': 0.006036647992530345, 'corpus_size': 10000, 'avg_rerank_latency_sec': 0.17221628189086913, 'device': 'GPU'}
[Main] intermediate results saved to ./experiment_results_alice/results_intermediate.csv

=== Running corpus size 50000 ===
Generating new dataset with 50000 docs and 100 queries...


BM25 Querying:   0%|          | 0/100 [00:00<?, ?it/s]

BM25 done: {'method': 'bm25', 'num_queries': 100, 'recall@1': 0.58, 'precision@1': 0.58, 'recall@2': 0.81, 'precision@2': 0.81, 'recall@5': 1.0, 'precision@5': 1.0, 'recall@10': 1.0, 'precision@10': 1.0, 'recall@100': 1.0, 'precision@100': 1.0, 'mrr': 0.752, 'map': 0.752, 'corpus_size': 50000, 'index_time_sec': 0.6617116928100586, 'index_size_mb': np.float64(5.779293060302734), 'device': 'GPU'}
[DenseRetriever] loading model sentence-transformers/all-MiniLM-L6-v2 on cuda
[DenseRetriever] encoding docs...


Batches:   0%|          | 0/196 [00:00<?, ?it/s]

[DenseRetriever] embeddings shape: (50000, 384)
[DenseRetriever] building faiss index - IVF1024,Flat
[DenseRetriever] training faiss index ...
[DenseRetriever] added 50000 vectors
Dense done: {'method': 'dense_faiss', 'num_queries': 100, 'recall@1': 0.0, 'precision@1': 0.0, 'recall@2': 0.01, 'precision@2': 0.01, 'recall@5': 0.01, 'precision@5': 0.01, 'recall@10': 0.01, 'precision@10': 0.01, 'recall@100': 0.03, 'precision@100': 0.03, 'mrr': 0.005768115942028985, 'map': 0.005768115942028985, 'corpus_size': 50000, 'embedding_dim': 384, 'index_factory': 'IVF1024,Flat', 'index_time_sec': 5.191736459732056, 'index_size_mb': 73.2421875, 'device': 'GPU'}
[MultiVector] creating chunks and embeddings...


Batches:   0%|          | 0/782 [00:00<?, ?it/s]

[MultiVector] added 200000 vectors
Multi-vector done: {'method': 'multi_vector', 'num_queries': 100, 'recall@1': 0.0, 'precision@1': 0.0, 'recall@2': 0.0, 'precision@2': 0.0, 'recall@5': 0.0, 'precision@5': 0.0, 'recall@10': 0.0, 'precision@10': 0.0, 'recall@100': 0.04, 'precision@100': 0.04, 'mrr': 0.0008327166099281133, 'map': 0.0008327166099281133, 'corpus_size': 50000, 'chunk_size': 8, 'index_time_sec': 16.671342372894287, 'index_size_mb': 292.96875, 'device': 'GPU'}
[CrossEncoder] loading cross-encoder/ms-marco-MiniLM-L-6-v2 on cuda


Reranking:   0%|          | 0/100 [00:00<?, ?it/s]

Cross-encoder done: {'method': 'dense_cross_rerank', 'num_queries': 100, 'recall@1': 0.01, 'precision@1': 0.01, 'recall@2': 0.02, 'precision@2': 0.02, 'recall@5': 0.02, 'precision@5': 0.02, 'recall@10': 0.02, 'precision@10': 0.02, 'recall@100': 0.03, 'precision@100': 0.03, 'mrr': 0.015434782608695652, 'map': 0.015434782608695652, 'corpus_size': 50000, 'avg_rerank_latency_sec': 0.14179737091064454, 'device': 'GPU'}
[Main] intermediate results saved to ./experiment_results_alice/results_intermediate.csv

=== Running corpus size 100000 ===
Generating new dataset with 100000 docs and 100 queries...


BM25 Querying:   0%|          | 0/100 [00:00<?, ?it/s]

BM25 done: {'method': 'bm25', 'num_queries': 100, 'recall@1': 0.28, 'precision@1': 0.28, 'recall@2': 0.45, 'precision@2': 0.45, 'recall@5': 0.93, 'precision@5': 0.93, 'recall@10': 1.0, 'precision@10': 1.0, 'recall@100': 1.0, 'precision@100': 1.0, 'mrr': 0.5127142857142857, 'map': 0.5127142857142857, 'corpus_size': 100000, 'index_time_sec': 2.5277209281921387, 'index_size_mb': np.float64(11.559150695800781), 'device': 'GPU'}
[DenseRetriever] loading model sentence-transformers/all-MiniLM-L6-v2 on cuda
[DenseRetriever] encoding docs...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

[DenseRetriever] embeddings shape: (100000, 384)
[DenseRetriever] building faiss index - IVF4096,Flat
[DenseRetriever] training faiss index ...
[DenseRetriever] added 100000 vectors
Dense done: {'method': 'dense_faiss', 'num_queries': 100, 'recall@1': 0.0, 'precision@1': 0.0, 'recall@2': 0.0, 'precision@2': 0.0, 'recall@5': 0.0, 'precision@5': 0.0, 'recall@10': 0.0, 'precision@10': 0.0, 'recall@100': 0.0, 'precision@100': 0.0, 'mrr': 0.0, 'map': 0.0, 'corpus_size': 100000, 'embedding_dim': 384, 'index_factory': 'IVF4096,Flat', 'index_time_sec': 32.355623960494995, 'index_size_mb': 146.484375, 'device': 'GPU'}
[MultiVector] creating chunks and embeddings...


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

[MultiVector] added 400000 vectors
Multi-vector done: {'method': 'multi_vector', 'num_queries': 100, 'recall@1': 0.0, 'precision@1': 0.0, 'recall@2': 0.0, 'precision@2': 0.0, 'recall@5': 0.0, 'precision@5': 0.0, 'recall@10': 0.0, 'precision@10': 0.0, 'recall@100': 0.05, 'precision@100': 0.05, 'mrr': 0.002155832106490001, 'map': 0.002155832106490001, 'corpus_size': 100000, 'chunk_size': 8, 'index_time_sec': 169.8936150074005, 'index_size_mb': 585.9375, 'device': 'GPU'}
[CrossEncoder] loading cross-encoder/ms-marco-MiniLM-L-6-v2 on cuda


Reranking:   0%|          | 0/100 [00:00<?, ?it/s]

Cross-encoder done: {'method': 'dense_cross_rerank', 'num_queries': 100, 'recall@1': 0.0, 'precision@1': 0.0, 'recall@2': 0.0, 'precision@2': 0.0, 'recall@5': 0.0, 'precision@5': 0.0, 'recall@10': 0.0, 'precision@10': 0.0, 'recall@100': 0.0, 'precision@100': 0.0, 'mrr': 0.0, 'map': 0.0, 'corpus_size': 100000, 'avg_rerank_latency_sec': 0.1452486777305603, 'device': 'GPU'}
[Main] intermediate results saved to ./experiment_results_alice/results_intermediate.csv
[Main] final results saved to ./experiment_results_alice/results_final.csv
[plot_results] saved plots to ./experiment_results_alice
Done. Results saved under: ./experiment_results_alice
