In [1]:
from pymilvus import model
from pymilvus import MilvusClient, DataType
import json
import math

In [2]:
client = MilvusClient("./milvus_demo.db")

client.drop_collection(collection_name="my_sparse_collection")

schema = client.create_schema(
    auto_id=True,
    enable_dynamic_fields=True,
)

schema.add_field(field_name="pk", datatype=DataType.VARCHAR, is_primary=True, max_length=100)
schema.add_field(field_name="id", datatype=DataType.VARCHAR, is_primary=False, max_length=100)
schema.add_field(field_name="text", datatype=DataType.VARCHAR, is_primary=False, max_length=10000)
schema.add_field(field_name="embeddings", datatype=DataType.SPARSE_FLOAT_VECTOR)

  from pkg_resources import DistributionNotFound, get_distribution


{'auto_id': True, 'description': '', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}, 'is_primary': True, 'auto_id': False}, {'name': 'id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 10000}}, {'name': 'embeddings', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}], 'enable_dynamic_field': False}

In [3]:
index_params = client.prepare_index_params()

index_params.add_index(field_name="embeddings",
                       index_name="sparse_inverted_index",
                       index_type="SPARSE_INVERTED_INDEX",
                       metric_type="IP",
                       params={"drop_ratio_build": 0.2}
                       )

In [4]:
client.create_collection(
    collection_name="my_sparse_collection",
    schema=schema,
    index_params=index_params
)

In [None]:
embeddings_model = model.sparse.SpladeEmbeddingFunction(
    model_name="ibm-granite/granite-embedding-30m-sparse",
    device="cuda",
    batch_size=2,
    k_tokens_query=50,
    k_tokens_document=192
)

In [6]:
from docuverse.utils.embeddings.sparse_embedding_function import SparseEmbeddingFunction
embeddings_model1 = SparseEmbeddingFunction("ibm-granite/granite-embedding-30m-sparse",
                                            batch_size=1,
                                            doc_max_tokens=192,
                                            query_max_tokens=50,
                                            process_name="ingestion")
embeddings_model = embeddings_model1

We're on a Mac !!
You are using cpu. This is much slower than using a CUDA-enabled GPU. If on Colab you can change this by clicking Runtime > Change runtime type > GPU.
=== done initializing model


In [7]:
# Prepare documents to be ingested
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

# SpladeEmbeddingFunction.encode_documents returns sparse matrix or sparse array depending
# on the milvus-model version. reshape(1,-1) ensures the format is correct for ingestion.
doc_vector = [{"embeddings": doc_emb.reshape(1,-1),
               "text": doc_text,
               "id": f"item_{i}"}
              for i, (doc_emb, doc_text) in enumerate(zip(embeddings_model.encode_documents(docs), docs))]


client.insert(
    collection_name="my_sparse_collection",
    data=doc_vector
)

Processed candidates: 100%|██████████| 3/3 [00:00<00:00, 44.76it/s]


{'insert_count': 3, 'ids': ['460587611038416896', '460587611038416897', '460587611038416898'], 'cost': 0}

In [8]:
# Prepare search parameters
search_params = {
    "params": {"drop_ratio_search": 0.2},  # Additional optional search parameters
}

# Prepare the query vector

queries = [
    "When was artificial intelligence founded",
    "Where was Turing born?",
    "Who was the first person to work in AI?"
]
answers = [
    'item0',
    'item2',
    'item1'
]
query_vector = embeddings_model.encode_documents(queries)

Processed candidates: 100%|██████████| 3/3 [00:00<00:00, 66.14it/s]


In [9]:
res = client.search(
    collection_name="my_sparse_collection",
    data=query_vector,
    limit=2, #top k documents to return
    output_fields=["id", "text", "embeddings"],
    search_params=search_params,
)

for r in res:
    print(r)

[{'id': '460587611038416896', 'distance': 12.364130973815918, 'entity': {'id': 'item_0', 'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'embeddings': {2: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 36: 0.0, 37: 0.0, 38: 0.0, 39: 0.0, 40: 0.0, 41: 0.0, 42: 0.0, 43: 0.0, 44: 0.0, 45: 0.0, 46: 0.0, 47: 0.0, 48: 0.0, 49: 0.0, 50: 0.0, 51: 0.0, 52: 0.0, 53: 0.0, 54: 0.0, 55: 0.0, 56: 0.0, 57: 0.0, 58: 0.0, 59: 0.0, 60: 0.0, 61: 0.0, 62: 0.0, 73: 0.0, 74: 0.0, 75: 0.0, 76: 0.0, 77: 0.0, 78: 0.0, 79: 0.0, 80: 0.0, 81: 0.0, 82: 0.0, 83: 0.0, 84: 0.0, 85: 0.0, 86: 0.0, 87: 0.0, 88: 0.0, 89: 0.0, 90: 0.0, 91: 0.0, 92: 0.0, 93: 0.0, 94: 0.0, 95: 0.0, 96: 0.0, 97: 0.0, 98: 0.0, 99: 0.0, 100: 0.0, 101: 0.0, 102: 0.0, 103: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0, 111: 0.0, 112: 0

In [43]:
ee=res[0][0].data['entity']['embeddings']
def get_vector(e):
    if hasattr(e, 'data'):
        e = e.data['entity']['embeddings']
    aa = sorted([(k, v) for k, v in ee.items() if math.fabs(v)>0.001], key=lambda x: x[1], reverse=True)
    return embeddings_model.model.convert_token_ids_to_tokens([aa])

def get_query_vector(query):
    # return {f"{i}": float(query.data[j]) for j, i in enumerate(query.indices) if query_vector[0].data[j]>0}
    aa = []
    for i, j in zip(query.indices, query.data):
        if j > 0:
            aa.append([int(i), float(j)])

    return embeddings_model.model.convert_token_ids_to_tokens([sorted(aa, key=lambda x: x[1], reverse=True)])


In [14]:
example = res[0][0]
print(f"Text: {example.text}")
get_vector(example.data['entity']['embeddings'])

Text: Artificial intelligence was founded as an academic discipline in 1956.


[[('ĠAI', 1.6671509742736816),
  ('Ġintelligence', 1.4905364513397217),
  ('Ġartificial', 1.2501306533813477),
  ('Ġdiscipline', 1.2192906141281128),
  ('Ġfounded', 1.0603737831115723),
  ('Ġ1956', 1.0351004600524902),
  ('Ġinvention', 0.9785782694816589),
  ('56', 0.7224237322807312),
  ('Ġlearning', 0.6999133229255676),
  ('Ġscientific', 0.6892702579498291),
  ('Ġcomputer', 0.6566588878631592),
  ('Ġacademic', 0.621738851070404),
  ('Ġuniversity', 0.5886272192001343),
  ('Ġrobot', 0.5613628029823303),
  ('Ġestablishment', 0.5508416295051575),
  ('Ġphilosophy', 0.5431844592094421),
  ('A', 0.502594530582428),
  ('Ġbrain', 0.47637805342674255),
  ('Ġmachine', 0.44881123304367065),
  ('1960', 0.4464936852455139),
  ('1950', 0.4327624440193176),
  ('Ġalgorithm', 0.4083285331726074),
  ('Ġscience', 0.37494128942489624),
  ('Ġregression', 0.3722943663597107),
  ('Ġcomput', 0.330303817987442),
  ('ĠDiscipline', 0.32480648159980774),
  ('Ġinstitute', 0.31568580865859985),
  ('Ġautomatic', 0.

In [44]:
qid = 0
example = query_vector[qid]
print(f"Query text: {queries[qid]}")
get_query_vector(example)
# print(f"Text: {example.text}")
# get_vector(example.data['entity']['embeddings'])

Query text: When was artificial intelligence founded


[[('ĠAI', 1.6549731492996216),
  ('Ġintelligence', 1.5411134958267212),
  ('Ġartificial', 1.4121360778808594),
  ('Ġfounded', 1.2499065399169922),
  ('Ġinvention', 1.0022627115249634),
  ('Ġcomputer', 0.7292882204055786),
  ('Ġlearning', 0.6924787759780884),
  ('Ġrobot', 0.617355227470398),
  ('Ġestablishment', 0.6119297742843628),
  ('Ġtechnology', 0.47791969776153564),
  ('ĠRobot', 0.406949907541275),
  ('Ġscientific', 0.40680572390556335),
  ('Ġmachine', 0.39789462089538574),
  ('A', 0.3954956829547882),
  ('ĠInternet', 0.3934810757637024),
  ('Ġbrain', 0.3785928785800934),
  ('Ġalgorithm', 0.35148316621780396),
  ('history', 0.3513850271701813),
  ('Ġphilosophy', 0.3472434878349304),
  ('ĠEvolution', 0.30035871267318726),
  ('ĠData', 0.22485089302062988),
  ('Ġautomatic', 0.21477273106575012),
  ('Ġdeveloped', 0.16173367202281952),
  ('Ġscience', 0.16117222607135773),
  ('Ġregression', 0.14970384538173676),
  ('Ġlogic', 0.14649289846420288),
  ('Ġsynthetic', 0.12041711062192917),
 

In [16]:
query_vector[0]
# get_vector(doc_vector[0]['embeddings'])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 192 stored elements and shape (1, 50265)>

In [113]:
{f"{i}": float(query_vector[0].data[j]) for j, i in enumerate(query_vector[0].indices) if query_vector[0].data[j]>0}

{'250': 0.39453125,
 '806': 0.4765625,
 '2226': 0.1787109375,
 '2239': 0.69140625,
 '2316': 1.546875,
 '2866': 0.171875,
 '2900': 0.369140625,
 '3034': 0.73046875,
 '3563': 0.40625,
 '3742': 0.39453125,
 '4687': 1.65625,
 '4790': 1.2578125,
 '5423': 0.2353515625,
 '6441': 0.41015625,
 '7147': 0.61328125,
 '7350': 1.4140625,
 '8408': 0.2041015625,
 '9916': 0.61328125,
 '10561': 0.345703125,
 '11767': 0.03076171875,
 '14578': 0.1318359375,
 '16807': 0.138671875,
 '17194': 0.3359375,
 '20257': 0.0966796875,
 '26101': 1.0,
 '28034': 0.30078125,
 '29991': 0.08984375,
 '31024': 0.40625,
 '37283': 0.357421875,
 '39974': 0.1455078125}