# Qdrant indexing

In [1]:
from datasets import load_dataset

# Load the "corpus" split of the "corpus" configuration of BeIR/scifact.
dataset = load_dataset(path="BeIR/scifact", name="corpus", split="corpus")
# Show the first record to see which fields each document carries.
dataset[0]

{'_id': '4983',
 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.',
 'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, t

In [2]:
# Total number of records in the loaded corpus split.
len(dataset)

5183

## Dense embeddings

We're not going to choose the fanciest embedding model out there, but stick to something simple and efficient. FastEmbed comes with some pretrained models that we can use out of the box. Because they run on ONNX, these models can be executed efficiently even on a CPU. `all-MiniLM-L6-v2` is a lightweight model that makes a good starting point.

In [3]:
from fastembed.embedding import TextEmbedding

# Load the lightweight pretrained dense embedding model through FastEmbed.
dense_embedding_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
# Slice the rows first and only then pick the column: `dataset["text"][0:1]`
# would read the entire text column just to keep its first element.
dense_embeddings = list(dense_embedding_model.passage_embed(dataset[0:1]["text"]))
# Only one document was embedded, so a single vector is expected here.
len(dense_embeddings)



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

1

In [4]:
# Length (dimensionality) of the dense vector produced for the first document.
len(dense_embeddings[0])

384

## Sparse embeddings

Similarly, we can use a BM25 model to generate sparse embeddings, which will hopefully handle the cases in which the dense embeddings fail.

In [5]:
from fastembed.sparse.bm25 import Bm25

# Load the BM25 sparse embedding model through FastEmbed.
bm25_embedding_model = Bm25("Qdrant/bm25")
# Slice the rows first and only then pick the column: `dataset["text"][0:1]`
# would read the entire text column just to keep its first element.
bm25_embeddings = list(bm25_embedding_model.passage_embed(dataset[0:1]["text"]))
# Only one document was embedded, so a single sparse embedding is expected here.
len(bm25_embeddings)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

1

## Late interaction embeddings

Both the dense and the sparse models we have used so far produce a single embedding per document. However, there is also a different approach, in which we generate one embedding per token. At query time, for each query token we take its maximum similarity against all the token embeddings of the document, and then sum these maxima across the query tokens. Models of this kind are called late interaction models, and ColBERT is one of them.

In [6]:
from fastembed.late_interaction import LateInteractionTextEmbedding

# Load the ColBERT v2 late interaction model through FastEmbed.
late_interaction_embedding_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")
# Slice the rows first and only then pick the column: `dataset["text"][0:1]`
# would read the entire text column just to keep its first element.
late_interaction_embeddings = list(
    late_interaction_embedding_model.passage_embed(dataset[0:1]["text"])
)
# Only one document was embedded, so a single multi-vector embedding is expected here.
len(late_interaction_embeddings)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

1

In [8]:
# Number of vectors in the first document's late interaction embedding
# (presumably one per token of that document -- verify against the model's tokenizer).
len(late_interaction_embeddings[0])

431

## Putting data in a Qdrant collection

All the vectors can now be upserted into a Qdrant collection. Keeping them all in a single collection makes it possible to combine different embeddings and even to build a complex pipeline with several steps. Depending on the specifics of your data, you may prefer a different approach.

In [9]:
# Start a Qdrant v1.10.0 container in the background (-d), publishing
# container ports 6333 and 6334 on the same host ports.
!docker run -d -p 6333:6333 -p 6334:6334 qdrant/qdrant:v1.10.0

4cea045985b0ea2b1f1785a9f3156f49d404f3d387e4919205ad59c0a05854cc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
from qdrant_client import QdrantClient, models

# Connect to the local Qdrant instance with a generous timeout
# (600, presumably seconds -- confirm against the client documentation).
client = QdrantClient("http://localhost:6333", timeout=600)

# Dense vectors: one vector per document, sized after the sample embedding.
dense_vector_params = models.VectorParams(
    size=len(dense_embeddings[0]),
    distance=models.Distance.COSINE,
)

# Late interaction vectors: a multivector per document, sized after a single
# vector of the sample embedding and compared with MAX_SIM.
late_interaction_vector_params = models.VectorParams(
    size=len(late_interaction_embeddings[0][0]),
    distance=models.Distance.COSINE,
    multivector_config=models.MultiVectorConfig(
        comparator=models.MultiVectorComparator.MAX_SIM,
    ),
)

# Sparse BM25 vectors, configured with the IDF modifier.
bm25_vector_params = models.SparseVectorParams(
    modifier=models.Modifier.IDF,
)

# Keep the call as the last expression so that its result is displayed by the cell.
client.create_collection(
    "scifact",
    vectors_config={
        "all-MiniLM-L6-v2": dense_vector_params,
        "colbertv2.0": late_interaction_vector_params,
    },
    sparse_vectors_config={
        "bm25": bm25_vector_params,
    },
)

True

In [11]:
import math

import tqdm

batch_size = 4
# Round up so that the final, partially filled batch is counted as well; floor
# division under-reports the total whenever the dataset size is not a multiple
# of the batch size.
total_batches = math.ceil(len(dataset) / batch_size)

for batch in tqdm.tqdm(dataset.iter(batch_size=batch_size), total=total_batches):
    texts = batch["text"]
    # Batch-local names avoid overwriting the sample embeddings computed in the
    # earlier cells, which are also used to size the collection's vectors.
    batch_dense = list(dense_embedding_model.passage_embed(texts))
    batch_bm25 = list(bm25_embedding_model.passage_embed(texts))
    batch_late_interaction = list(late_interaction_embedding_model.passage_embed(texts))

    client.upload_points(
        "scifact",
        points=[
            models.PointStruct(
                # The dataset stores ids as strings; they are converted to integers
                # for the point id and kept verbatim in the payload.
                id=int(doc_id),
                vector={
                    "all-MiniLM-L6-v2": dense.tolist(),
                    "bm25": sparse.as_object(),
                    "colbertv2.0": late_interaction.tolist(),
                },
                payload={
                    "_id": doc_id,
                    "title": title,
                    "text": text,
                },
            )
            for doc_id, title, text, dense, sparse, late_interaction in zip(
                batch["_id"],
                batch["title"],
                texts,
                batch_dense,
                batch_bm25,
                batch_late_interaction,
            )
        ],
        # We send a lot of embeddings at once, so it's best to reduce the batch size.
        # Otherwise, we would have gigantic requests sent for each batch and we can
        # easily reach the maximum size of a single request.
        batch_size=batch_size,
    )

  1%|█                                                                                                                                                                                             | 7/1295 [00:17<53:55,  2.51s/it]


KeyboardInterrupt: 

In [12]:
# Restore the "scifact" collection from a snapshot hosted at the URL below.
snapshot_url = "https://storage.googleapis.com/common-datasets-snapshots/scifact-multiple-representations.snapshot"
client.recover_snapshot("scifact", location=snapshot_url)

True

In [13]:
# Fetch the collection's info (status, point counts and configuration) for inspection.
client.get_collection("scifact")

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=15549, points_count=5183, segments_count=7, config=CollectionConfig(params=CollectionParams(vectors={'all-MiniLM-L6-v2': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), 'colbertv2.0': VectorParams(size=128, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=MultiVectorConfig(comparator=<MultiVectorComparator.MAX_SIM: 'max_sim'>))}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'bm25': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, o