# Hybrid search

Now that all the documents are indexed, we can finally start running our searches.

In [1]:
from qdrant_client import QdrantClient, models

# Connect to the Qdrant instance on localhost and count the points in the "scifact" collection.
client = QdrantClient(url="http://localhost:6333")
client.count(collection_name="scifact")

CountResult(count=5183)

In [2]:
from fastembed.embedding import TextEmbedding
from fastembed.sparse.bm25 import Bm25
from fastembed.late_interaction import LateInteractionTextEmbedding

# One model per vector type used below: dense, sparse (BM25) and late interaction.
dense_embedding_model = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
bm25_embedding_model = Bm25(
    model_name="Qdrant/bm25",
)
late_interaction_embedding_model = LateInteractionTextEmbedding(
    model_name="colbert-ir/colbertv2.0",
)



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
# Example free-text question; it is embedded with the dense model and sent to Qdrant in the next code cell.
query_text = "What is the impact of COVID-19 on the environment?"

The new Query API of Qdrant 1.10+ unifies all the operations that might be done on a collection.

In [4]:
# .query_points is the brand new method used from here on;
# before Qdrant 1.10 the .query method was used instead.
# next() takes the single embedding produced for the one query text.
example_query_vector = next(dense_embedding_model.query_embed(query_text))

client.query_points(
    collection_name="scifact",
    query=example_query_vector,
    using="all-MiniLM-L6-v2",
    with_payload=False,
    limit=10,
)

QueryResponse(points=[ScoredPoint(id=13882658, version=1277, score=0.38743758, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=2097256, version=808, score=0.36370263, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=22401061, version=1538, score=0.3489025, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=7485455, version=1049, score=0.33988652, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=11936877, version=1207, score=0.3381256, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=3716075, version=876, score=0.33763093, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=27453479, version=1708, score=0.3360035, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=25953438, version=1671, score=0.3300631, payload=None, vector=None, shard_key=None, order_value=None), ScoredPoint(id=13770184, version=12

## Benchmarking

BeIR SciFact isn't designed for demo purposes, but we can use it as a reference to benchmark how different methods deal with the same task. Let's load the ground truth and evaluate the performance of various search pipelines.

In [5]:
from datasets import load_dataset

# Load the "queries" configuration of BeIR/scifact and show how many queries it has.
queries = load_dataset(
    path="BeIR/scifact",
    name="queries",
    split="queries",
)
len(queries)

1109

In [6]:
# Load the relevance judgements (train split) and show how many entries there are.
query_qrels = load_dataset(
    path="BeIR/scifact-qrels",
    split="train",
)
len(query_qrels)

919

In [7]:
# Peek at the first relevance judgement to see which fields each entry carries.
query_qrels[0]

{'query-id': 0, 'corpus-id': 31715818, 'score': 1}

### Building the ground truth dataset

The ground truth is a dataset of queries paired with their best matches. Each match needs a relevance measure. In the simplest case, that might be just binary information: a document is either relevant to the query or it is not.

In [8]:
# Print only the first relevance judgement; nothing is printed if the dataset is empty.
entry = next(iter(query_qrels), None)
if entry is not None:
    print(entry)

{'query-id': 0, 'corpus-id': 31715818, 'score': 1}


In [9]:
from ranx import Qrels
from collections import defaultdict

# Nested mapping: query id -> {document id -> relevance score}, with all ids stored as strings.
qrels_dict = defaultdict(dict)
for judgement in query_qrels:
    query_key, doc_key = str(judgement["query-id"]), str(judgement["corpus-id"])
    qrels_dict[query_key][doc_key] = judgement["score"]

# Wrap the mapping in a ranx Qrels object and display it.
qrels = Qrels(qrels_dict, name="scifact")
qrels

DictType[unicode_type,DictType[[unichr x 9],int64]<iv=None>]<iv=None>({0: {31715818: 1}, 10: {32587939: 1}, 1000: {16472469: 1}, 1001: {5702790: 1}, 1002: {13639330: 1}, 1003: {14332945: 1, 4319844: 1, 4899981: 1}, 1004: {301838: 1, 2734421: 1, 3952288: 1}, 1005: {301838: 1, 2734421: 1, 3952288: 1}, 1006: {4926049: 1}, 1008: {2547636: 1}, 1009: {1982286: 1}, 1011: {9745001: 1}, 1015: {6277638: 1}, 1016: {6277638: 1}, 1018: {11603066: 1}, 1023: {16927286: 1}, 1025: {32408470: 1}, 1026: {3113630: 1}, 1027: {3113630: 1}, 1028: {13923140: 1, 11899391: 1}, 1030: {6441369: 1}, 1031: {12486491: 1}, 1032: {6836086: 1}, 1033: {6836086: 1}, 1034: {4547102: 1}, 1035: {4547102: 1}, 1036: {4547102: 1}, 1037: {16287725: 1}, 1038: {16287725: 1}, 104: {40164383: 1}, 1040: {25254425: 1, 16626264: 1}, 1042: {17421851: 1}, 1043: {17671145: 1}, 1044: {22500262: 1}, 1045: {22500262: 1}, 1046: {418246: 1, 4324278: 1, 16712164: 1}, 1047: {14706752: 1}, 1048: {12486491: 1}, 105: {36606083: 1}, 1050: {19878070

## Precalculating the embeddings

We'll test various hybrid pipelines, so it is a good idea to precompute all the query vectors beforehand.

In [10]:
import tqdm

# Three parallel lists, indexed by the position of the query in `queries`.
dense_vectors, sparse_vectors, late_vectors = [], [], []
for query in tqdm.tqdm(queries):
    text = query["text"]
    # Embed with all three models first; next() takes the single result for this one text.
    embeddings = (
        next(dense_embedding_model.query_embed(text)),
        next(bm25_embedding_model.query_embed(text)),
        next(late_interaction_embedding_model.query_embed(text)),
    )
    # Only store the vectors once every embedding has been computed.
    for bucket, vector in zip((dense_vectors, sparse_vectors, late_vectors), embeddings):
        bucket.append(vector)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1109/1109 [01:22<00:00, 13.38it/s]


## Testing various search pipelines

The ground truth dataset is ready, so we can start by measuring the effectiveness of each of our search methods separately.

### Dense embeddings

In [11]:
from ranx import Run

# Run every benchmark query against the dense vectors only and keep the top 10 hits.
run_dict = {}
for query_idx, query in enumerate(queries):
    response = client.query_points(
        collection_name="scifact",
        query=dense_vectors[query_idx],
        using="all-MiniLM-L6-v2",
        limit=10,
        with_payload=False,
    )
    # Ids are stringified, the same way the keys of qrels_dict were built.
    run_dict[str(query["_id"])] = {str(hit.id): hit.score for hit in response.points}

dense_run = Run(run_dict, name="all-MiniLM-L6-v2")

In [12]:
from ranx import evaluate

# Precision and MRR, both at a cutoff of 10, for the dense-only run.
evaluate(
    qrels=qrels,
    run=dense_run,
    metrics=["precision@10", "mrr@10"],
    make_comparable=True,
)

{'precision@10': 0.08899876390605686, 'mrr@10': 0.6196303490493849}

### Sparse embeddings

In [13]:
# Run every benchmark query against the sparse BM25 vectors only.
run_dict = {}
for query_idx, query in enumerate(queries):
    # Convert the precomputed sparse embedding into Qdrant's SparseVector model.
    sparse_query = models.SparseVector(**sparse_vectors[query_idx].as_object())

    hits = client.query_points(
        collection_name="scifact",
        query=sparse_query,
        using="bm25",
        limit=10,
        with_payload=False,
    ).points

    run_dict[str(query["_id"])] = {str(hit.id): hit.score for hit in hits}

bm25_run = Run(run_dict, name="bm25")
evaluate(qrels, bm25_run, metrics=["precision@10", "mrr@10"], make_comparable=True)

{'precision@10': 0.07935723114956736, 'mrr@10': 0.5752084682245374}

### Late interaction model

In [14]:
# Query id -> {document id -> score}, using only the late interaction vectors.
run_dict = {
    str(query["_id"]): {
        str(point.id): point.score
        for point in client.query_points(
            collection_name="scifact",
            query=late_vectors[query_idx],
            using="colbertv2.0",
            with_payload=False,
            limit=10,
        ).points
    }
    for query_idx, query in enumerate(queries)
}

colbert_run = Run(run_dict, name="colbert")
evaluate(qrels, colbert_run, metrics=["precision@10", "mrr@10"], make_comparable=True)

{'precision@10': 0.09023485784919653, 'mrr@10': 0.6650636686483411}

The late interaction model is the slowest option out there. That's why it's usually used as a reranking component, not as a standalone retrieval mechanism.

### Reciprocal Rank Fusion

### Dense & sparse only

In [15]:
# Fuse the dense and sparse candidate lists with Reciprocal Rank Fusion.
rrf_query = models.FusionQuery(fusion=models.Fusion.RRF)

run_dict = {}
for query_idx, query in enumerate(queries):
    # Each branch retrieves its own top 20 candidates before fusion.
    dense_branch = models.Prefetch(
        query=dense_vectors[query_idx],
        using="all-MiniLM-L6-v2",
        limit=20,
    )
    sparse_branch = models.Prefetch(
        query=models.SparseVector(**sparse_vectors[query_idx].as_object()),
        using="bm25",
        limit=20,
    )

    response = client.query_points(
        collection_name="scifact",
        prefetch=[dense_branch, sparse_branch],
        query=rrf_query,
        limit=10,
        with_payload=False,
    )

    run_dict[str(query["_id"])] = {str(hit.id): hit.score for hit in response.points}

rrf_run = Run(run_dict, name="rrf")
evaluate(qrels, rrf_run, metrics=["precision@10", "mrr@10"], make_comparable=True)

{'precision@10': 0.0903584672435105, 'mrr@10': 0.6509211842957208}

### All the methods in parallel

In [16]:
# Fuse the candidates of all three methods (dense, sparse, late interaction) with RRF.
run_dict = {}
for query_idx, query in enumerate(queries):
    # (named vector, query vector) for every branch; each one prefetches its top 20.
    branches = [
        ("all-MiniLM-L6-v2", dense_vectors[query_idx]),
        ("bm25", models.SparseVector(**sparse_vectors[query_idx].as_object())),
        ("colbertv2.0", late_vectors[query_idx]),
    ]
    prefetch = [
        models.Prefetch(query=branch_vector, using=vector_name, limit=20)
        for vector_name, branch_vector in branches
    ]

    response = client.query_points(
        collection_name="scifact",
        prefetch=prefetch,
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=10,
        with_payload=False,
    )

    run_dict[str(query["_id"])] = {str(hit.id): hit.score for hit in response.points}

full_rrf_run = Run(run_dict, name="full_rrf")
evaluate(qrels, full_rrf_run, metrics=["precision@10", "mrr@10"], make_comparable=True)

{'precision@10': 0.09406674907292954, 'mrr@10': 0.692256754370475}

## Reranking with late interaction model

In [17]:
# Prefetch candidates with the dense and sparse vectors, then query with the late interaction vector.
run_dict = {}
for query_idx, query in enumerate(queries):
    candidates = [
        models.Prefetch(
            query=dense_vectors[query_idx],
            using="all-MiniLM-L6-v2",
            limit=20,
        ),
        models.Prefetch(
            query=models.SparseVector(**sparse_vectors[query_idx].as_object()),
            using="bm25",
            limit=20,
        ),
    ]

    # The top-level query uses the late interaction vector on top of the prefetched candidates.
    response = client.query_points(
        collection_name="scifact",
        prefetch=candidates,
        query=late_vectors[query_idx],
        using="colbertv2.0",
        limit=10,
        with_payload=False,
    )

    run_dict[str(query["_id"])] = {str(hit.id): hit.score for hit in response.points}

reranking_run = Run(run_dict, name="reranking")
evaluate(qrels, reranking_run, metrics=["precision@10", "mrr@10"], make_comparable=True)

{'precision@10': 0.09245982694684794, 'mrr@10': 0.6767373987089687}

## Multistep retrieval process

In [18]:
# Nested prefetches: dense (limit 100) inside sparse (limit 50), then a late interaction query (limit 10).
run_dict = {}
for query_idx, query in enumerate(queries):
    # Innermost stage: dense retrieval.
    dense_stage = models.Prefetch(
        query=dense_vectors[query_idx],
        using="all-MiniLM-L6-v2",
        limit=100,
    )
    # Middle stage: BM25 query that takes the dense stage as its prefetch.
    sparse_stage = models.Prefetch(
        prefetch=[dense_stage],
        query=models.SparseVector(**sparse_vectors[query_idx].as_object()),
        using="bm25",
        limit=50,
    )

    # Final stage: late interaction query that takes the sparse stage as its prefetch.
    response = client.query_points(
        collection_name="scifact",
        prefetch=[sparse_stage],
        query=late_vectors[query_idx],
        using="colbertv2.0",
        limit=10,
        with_payload=False,
    )

    run_dict[str(query["_id"])] = {str(hit.id): hit.score for hit in response.points}

multistep_run = Run(run_dict, name="multistep")
evaluate(qrels, multistep_run, metrics=["precision@10", "mrr@10"], make_comparable=True)

{'precision@10': 0.08974042027194067, 'mrr@10': 0.6655247513096709}

In [19]:
from ranx import compare

# Side-by-side report of every pipeline built above, on the same ground truth.
# make_comparable=True mirrors the evaluate(...) calls in the earlier cells, so
# this cell does not rely on those calls having already aligned each run's
# query ids with the qrels.
compare(
    qrels=qrels,
    runs=[
        dense_run,
        bm25_run,
        colbert_run,
        rrf_run,
        full_rrf_run,
        reranking_run,
        multistep_run,
    ],
    metrics=["precision@10", "recall@10", "mrr@10", "dcg@10", "ndcg@10"],
    make_comparable=True,
)

#    Model             P@10        Recall@10    MRR@10      DCG@10      NDCG@10
---  ----------------  ----------  -----------  ----------  ----------  -----------
a    all-MiniLM-L6-v2  0.089ᵇ      0.790ᵇ       0.620ᵇ      0.701ᵇ      0.655ᵇ
b    bm25              0.079       0.710        0.575       0.640       0.605
c    colbert           0.090ᵇ      0.802ᵇ       0.665ᵃᵇ     0.741ᵃᵇ     0.694ᵃᵇ
d    rrf               0.090ᵇ      0.804ᵇ       0.651ᵃᵇ     0.728ᵃᵇ     0.684ᵃᵇ
e    full_rrf          0.094ᵃᵇᶜᵈᵍ  0.834ᵃᵇᶜᵈᵍ   0.692ᵃᵇᶜᵈᵍ  0.771ᵃᵇᶜᵈᵍ  0.722ᵃᵇᶜᵈᶠᵍ
f    reranking         0.092ᵃᵇᵍ    0.820ᵃᵇᵍ     0.677ᵃᵇᶜ    0.756ᵃᵇᶜᵈᵍ  0.707ᵃᵇᶜᵈᵍ
g    multistep         0.090ᵇ      0.796ᵇ       0.666ᵃᵇ     0.741ᵃᵇ     0.693ᵃᵇ