## Dense + Sparse: Hybrid Retrieval
This notebook fuses the results of dense and sparse retrieval to improve ranking quality.

## Load Dataset

In [None]:
# Load the MS MARCO corpus, queries and relevance judgments (qrels)
from datasets import load_dataset

# Only the first 10% of each train split is requested, to keep test runs small.
docs_dataset, queries_dataset, qrels_dataset = (
    load_dataset("sentence-transformers/msmarco", subset, split="train[:10%]")
    for subset in ("corpus", "queries", "labeled-list")
)

# Show the column names of every subset first ...
for heading, dataset in (
    ("corpus labels:", docs_dataset),
    ("queries labels:", queries_dataset),
    ("qrels labels:", qrels_dataset),
):
    print(heading, dataset.column_names)

# ... and then the first record of each, as a quick sanity check of the schema.
for heading, dataset in (
    ("corpus:", docs_dataset),
    ("queries:", queries_dataset),
    ("qrels:", qrels_dataset),
):
    print(heading, dataset[0])

In [None]:
from collections import defaultdict

# Materialise each dataset as a plain Python list up front for faster processing
docs, queries, qrels = (
    list(split) for split in (docs_dataset, queries_dataset, qrels_dataset)
)

# Text normalisation helper
def clean(text: str) -> str:
    """Return *text* without surrounding whitespace and with all letters lowercased."""
    trimmed = text.strip()
    return trimmed.lower()

# Split documents and queries into parallel tuples of IDs and normalised texts
doc_ids, doc_texts = zip(
    *((doc["passage_id"], clean(doc["passage"])) for doc in docs)
)
query_ids, query_texts = zip(
    *((query["query_id"], clean(query["query"])) for query in queries)
)

# Sets give fast membership checks while filtering the judgments below
doc_id_set = set(doc_ids)
query_id_set = set(query_ids)

# Build the nested {query_id: {doc_id: label}} mapping used as qrels for ranx.
# Only positive labels are kept, and only when both the query and the document
# are part of the subsets loaded above.
qrels_dict = defaultdict(dict)
for judgment in qrels:
    qid = judgment["query_id"]
    if qid not in query_id_set:
        continue
    for doc_id, label in zip(judgment["doc_ids"], judgment["labels"]):
        if doc_id in doc_id_set and label > 0:
            qrels_dict[qid][doc_id] = label

## Load Results from Dense and Sparse Retrieval
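Results should be JSON files in the format that `Run.from_file` expects: each query ID maps to a dictionary of document IDs and their retrieval scores.
```
{
    "query_id": {
        "doc_id": 0.9
    }
}
```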

In [None]:
from ranx import Run

# Locations of the JSON result files written by the dense and sparse retrievers
DENSE_RESULTS_PATH = "dense_results.json"
SPARSE_RESULTS_PATH = "sparse_results.json"

# Parse both result files into ranx runs (the sparse file must be available too)
dense_run = Run.from_file(DENSE_RESULTS_PATH)
sparse_run = Run.from_file(SPARSE_RESULTS_PATH)

## Apply Multiple Fusion Strategies + Evaluate

In [None]:
from ranx import fuse, evaluate
import time

# Define fusion strategies and the keyword arguments passed to ranx.fuse for each.
# fuse() takes method-specific settings through its `params` dict rather than
# as top-level keywords, so the wsum weights go there. The weights line up
# positionally with the runs list ([sparse_run, dense_run]) used below.
fusion_strategies = {
    "rrf": {"method": "rrf"},
    "wsum": {"method": "wsum", "params": {"weights": [0.5, 0.5]}},
}

# Store fused runs and timing
fused_runs = {}
timings = {}

# Time each fusion
for strategy, fuse_kwargs in fusion_strategies.items():
    print(f"==== RUN: {strategy.upper()} ====")

    # perf_counter is the clock intended for measuring short intervals;
    # time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    fused_run = fuse([sparse_run, dense_run], **fuse_kwargs)
    end_time = time.perf_counter()

    fused_runs[strategy] = fused_run
    timings[strategy] = end_time - start_time

    # Only the fusion step is timed here, not the dense/sparse retrieval itself.
    # Guard against an empty fused run so the division cannot fail.
    num_queries = len(fused_run.run)
    retrieval_time = timings[strategy] / num_queries if num_queries else 0.0

    print(f"Retrieval time per query: {retrieval_time:.4f} seconds")

In [None]:
# calculate MRR for each fusion strategy
from ranx import Qrels

# Convert the plain nested mapping into a ranx Qrels object before evaluating.
# NOTE(review): ranx appears to convert only exact `dict`/`Qrels` inputs and to
# honour `make_comparable` only for Qrels + Run; `qrels_dict` is a defaultdict,
# so it is wrapped explicitly. Confirm against the installed ranx version.
# IDs are cast to str because runs loaded from JSON always have string keys;
# the cast is a no-op when the dataset already provides strings.
string_keyed_qrels = {}
for qid, judged_docs in qrels_dict.items():
    string_keyed_qrels[str(qid)] = {
        str(doc_id): label for doc_id, label in judged_docs.items()
    }
ranx_qrels = Qrels.from_dict(string_keyed_qrels)

for strategy, fused_run in fused_runs.items():
    # make_comparable aligns the run's query set with the qrels before scoring
    mrr = evaluate(ranx_qrels, fused_run, "mrr", make_comparable=True)
    print(f"MRR for {strategy}: {mrr:.4f}")