In [None]:
# Uncomment to install the dependency into the running kernel's environment.
# %pip install -q fastembed

In [1]:
from fastembed import SparseTextEmbedding, SparseEmbedding
from typing import List

In [2]:
# Show the sparse embedding models fastembed knows about, together with their metadata.
SparseTextEmbedding.list_supported_models()

[{'model': 'prithivida/Splade_PP_en_v1',
  'vocab_size': 30522,
  'description': 'Independent Implementation of SPLADE++ Model for English.',
  'license': 'apache-2.0',
  'size_in_GB': 0.532,
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'},
  'model_file': 'model.onnx'},
 {'model': 'prithvida/Splade_PP_en_v1',
  'vocab_size': 30522,
  'description': 'Independent Implementation of SPLADE++ Model for English.',
  'license': 'apache-2.0',
  'size_in_GB': 0.532,
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'},
  'model_file': 'model.onnx'},
 {'model': 'Qdrant/bm42-all-minilm-l6-v2-attentions',
  'vocab_size': 30522,
  'description': 'Light sparse embedding model, which assigns an importance score to each token in the text',
  'license': 'apache-2.0',
  'size_in_GB': 0.09,
  'sources': {'hf': 'Qdrant/all_miniLM_L6_v2_with_attentions'},
  'model_file': 'model.onnx',
  'additional_files': ['stopwords.txt'],
  'requires_idf': True},
 {'model': 'Qdrant/bm25',
  'description': 'BM25 as sparse embedd

In [4]:
# Name of the sparse model to load; it matches a "model" entry from list_supported_models().
model_name = "prithivida/Splade_PP_en_v1"
# Constructing the embedder triggers the download of the model files.
model = SparseTextEmbedding(model_name=model_name)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# Small corpus of short statements about the Chandrayaan-3 mission; these are the
# texts that get embedded in the following cell.
documents: List[str] = [
    "Chandrayaan-3 is India's third lunar mission",
    "It aimed to land a rover on the Moon's surface - joining the US, China and Russia",
    "The mission is a follow-up to Chandrayaan-2, which had partial success",
    "Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)",
    "The estimated cost of the mission is around $35 million",
    "It will carry instruments to study the lunar surface and atmosphere",
    "Chandrayaan-3 landed on the Moon's surface on 23rd August 2023",
    "It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. Its propulsion module would act like an orbiter.",
    "The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit",
    "The mission used GSLV Mk III rocket for its launch",
    "Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota",
    "Chandrayaan-3 was launched earlier in the year 2023",
]

In [6]:
# Embed every document (processed in batches of 6) and collect the results into a
# list so individual embeddings can be looked up by position afterwards.
sparse_embeddings_list: List[SparseEmbedding] = list(
    model.embed(documents, batch_size=6)
)

In [7]:
# Position (within `documents`) of the embedding to inspect.
index = 0
# Display that embedding: `values` holds the weights, `indices` the matching token ids.
sparse_embeddings_list[index]

SparseEmbedding(values=array([0.05297344, 0.01963442, 0.36459157, 1.38508499, 0.71776628,
       0.12668033, 0.46230859, 0.44676831, 0.26897514, 1.01519847,
       1.5655334 , 0.29411697, 1.53102303, 0.59785676, 1.10018146,
       0.02079611, 0.09955791, 0.44249129, 0.09747908, 1.53519869,
       1.36765587, 0.15740731, 0.49882406, 0.38628644, 0.76612771,
       1.2580514 , 0.3905834 , 0.27236396, 0.45152026, 0.48261768,
       0.26084885, 1.35912943, 0.70710433, 1.71639705]), indices=array([ 1010,  1011,  1016,  1017,  2001,  2018,  2034,  2093,  2117,
        2319,  2353,  2509,  2634,  2686,  2796,  2817,  2922,  2959,
        3003,  3148,  3260,  3390,  3462,  3523,  3822,  4231,  4316,
        4774,  5590,  5871,  6416, 11926, 12076, 16469]))

In [8]:
# Preview the first few (token id, weight) pairs of the embedding selected by `index`.
# Slicing keeps this safe when an embedding has fewer than five entries.
selected_embedding = sparse_embeddings_list[index]
for token_id, weight in zip(selected_embedding.indices[:5], selected_embedding.values[:5]):
    print(f"Token at index {token_id} has weight {weight}")

Token at index 1010 has weight 0.052973438054323196
Token at index 1011 has weight 0.019634416326880455
Token at index 1016 has weight 0.3645915687084198
Token at index 1017 has weight 1.3850849866867065
Token at index 2001 has weight 0.717766284942627


In [9]:
import json
from tokenizers import Tokenizer

# Load the tokenizer that belongs to the embedding model selected via `model_name`.
# The entry is looked up by name rather than by its position in the supported-models
# list, so the tokenizer cannot silently drift away from the loaded model.
matching_sources = [
    description["sources"]["hf"]
    for description in SparseTextEmbedding.list_supported_models()
    if description["model"] == model_name
]
if not matching_sources:
    raise ValueError(f"No supported sparse model named {model_name!r}")
tokenizer = Tokenizer.from_pretrained(matching_sources[0])

tokenizer.json: 0.00B [00:00, ?B/s]

In [10]:
def get_tokens_and_weights(sparse_embedding, tokenizer) -> dict:
    """Map each token of a sparse embedding to its weight, heaviest first.

    Args:
        sparse_embedding: Object exposing parallel ``indices`` (token ids) and
            ``values`` (weights) sequences.
        tokenizer: Object whose ``decode(list_of_ids)`` returns the token text.

    Returns:
        A dict of ``token text -> weight`` (plain Python floats), ordered by
        descending weight. If several ids decode to the same text, the weight
        of the last one is kept.
    """
    token_weight_dict = {}
    for token_id, weight in zip(sparse_embedding.indices, sparse_embedding.values):
        token = tokenizer.decode([int(token_id)])
        # Cast to a built-in float so the result stays JSON-serialisable even
        # when the weights are NumPy scalars such as float32.
        token_weight_dict[token] = float(weight)

    # Sort the dictionary by weights
    return dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True))

# Try the function on the embedding selected by `index` and pretty-print the
# decoded tokens with their weights as JSON.
print(json.dumps(get_tokens_and_weights(sparse_embeddings_list[index], tokenizer), indent=4))

{
    "chandra": 1.7163970470428467,
    "third": 1.5655333995819092,
    "##ya": 1.53519868850708,
    "india": 1.5310230255126953,
    "3": 1.3850849866867065,
    "mission": 1.367655873298645,
    "lunar": 1.3591294288635254,
    "moon": 1.2580513954162598,
    "indian": 1.1001814603805542,
    "##an": 1.0151984691619873,
    "3rd": 0.7661277055740356,
    "was": 0.717766284942627,
    "spacecraft": 0.7071043252944946,
    "space": 0.5978567600250244,
    "flight": 0.4988240599632263,
    "satellite": 0.48261767625808716,
    "first": 0.4623085856437683,
    "expedition": 0.45152026414871216,
    "three": 0.44676831364631653,
    "fourth": 0.4424912929534912,
    "vehicle": 0.3905833959579468,
    "iii": 0.3862864375114441,
    "2": 0.3645915687084198,
    "##3": 0.2941169738769531,
    "planet": 0.27236396074295044,
    "second": 0.2689751386642456,
    "missions": 0.26084885001182556,
    "launched": 0.15740731358528137,
    "had": 0.12668032944202423,
    "largest": 0.09955791383