In [1]:
from flatbuffers.packer import int32
from pymilvus import model
from pymilvus import MilvusClient, DataType

### Create Milvus connection to a file

In [2]:
client = MilvusClient("/tmp/milvus_demo.db")

client.drop_collection(collection_name="my_sparse_collection")

schema = client.create_schema(
    auto_id=True,
    enable_dynamic_fields=True,
)

schema.add_field(field_name="pk", datatype=DataType.VARCHAR, is_primary=True, max_length=100)
schema.add_field(field_name="id", datatype=DataType.VARCHAR, is_primary=False, max_length=100)
schema.add_field(field_name="embeddings", datatype=DataType.SPARSE_FLOAT_VECTOR)

index_params = client.prepare_index_params()

index_params.add_index(field_name="embeddings",
                               index_name="sparse_inverted_index",
                               index_type="SPARSE_INVERTED_INDEX",
                               metric_type="IP",
                               params={"drop_ratio_build": 0.2})
client.create_collection(
    collection_name="my_sparse_collection",
    schema=schema,
    index_params=index_params
)


In [3]:
embeddings_model = model.sparse.SpladeEmbeddingFunction(
    model_name="ibm-granite/granite-embedding-30m-sparse",
    device="cpu",
    batch_size=2,
    k_tokens_query=50,
    k_tokens_document=192
)

In [5]:
import os
os.getenv('CUDA_VISIBLE_DEVICES')

### Prepare documents to be ingested

In [6]:
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]
vecs = embeddings_model.encode_documents(docs)

In [7]:
vecs.shape[1]

50265

In [8]:
dim = vecs.shape[1]
doc_vector = [{"embeddings": doc_emb.reshape(1,dim), "id": f"item_{i}"} for i, doc_emb in enumerate(vecs)]

client.insert(
    collection_name="my_sparse_collection",
    data=doc_vector
)

{'insert_count': 3, 'ids': ['456491493136269312', '456491493136269313', '456491493136269314'], 'cost': 0}

In [9]:
import torch
tt=vecs[0].to_dense()
print(tt)

AttributeError: 'coo_array' object has no attribute 'to_dense'

### Prepare search parameters

In [26]:
search_params = {
    "params": {"drop_ratio_search": 0},  # Additional optional search parameters
}


### Prepare the query vector

In [None]:

queries = [
      "When was artificial intelligence founded",
      "Where was Turing born?"
]
query_vector = embeddings_model.encode_queries(queries)

res = client.search(
    collection_name="my_sparse_collection",
    data=query_vector,
    limit=1, #top k documents to return
    output_fields=["id"],
    search_params=search_params,
)

for r in res:
    print(r)

In [64]:
query_vector.shape

(2, 50265)

Non-transformer search

In [66]:
import numpy as np
from scipy.sparse import csr_array
tok_queries = embeddings_model.model.tokenizer(queries, return_tensors="pt")

In [81]:
max_size = embeddings_model.model.tokenizer.vocab_size
keys = [torch.unique(tok_queries['input_ids'][i], sorted=True).tolist() for i in range(0,len(queries))]

embeddings = [csr_array((k, (np.zeros(len(k), dtype=np.int32), np.ones(len(k), dtype=np.float32))), shape=(1, max_size))
                              for k in keys]

In [None]:
aa=(embeddings[0].toarray())

In [112]:
k0=keys[0]
kl=len(k0)
aa=csr_array((np.ones(kl), (np.zeros(kl), k0)), shape=(1, max_size))

In [115]:
aa.indices

array([   0,    2,   21, 1779, 2316, 4790, 7350])

In [121]:
res = client.search(
    collection_name="my_sparse_collection",
    data=[aa],
    limit=4, #top k documents to return
    output_fields=["id"],
    search_params=search_params,
)

In [122]:
print(res)

data: ["[{'id': '456491493136269312', 'distance': 3.8010401725769043, 'entity': {'id': 'item_0'}}, {'id': '456491493136269313', 'distance': 0.9097486734390259, 'entity': {'id': 'item_1'}}]"]
