### In this notebook we take a look at different embedding types:

- Dense 
- Sparse
  - Splade
  - BM25

We will explore how the magic of semantic understanding works, in contrast to lexical (token-based) embeddings.

In [4]:
import pandas as pd

In [3]:
## Listing the dense embedding models supported by FastEmbed
# Import from the package root (as the sparse cell does for SparseTextEmbedding)
# rather than from the legacy `fastembed.embedding` module path.
from fastembed import TextEmbedding

# Last expression of the cell: rendered as a table of the supported models
pd.DataFrame(TextEmbedding.list_supported_models())

Unnamed: 0,model,dim,description,size_in_GB,sources,model_file,additional_files
0,BAAI/bge-base-en,768,"Text embeddings, Unimodal (text), English, 512...",0.42,{'url': 'https://storage.googleapis.com/qdrant...,model_optimized.onnx,
1,BAAI/bge-base-en-v1.5,768,"Text embeddings, Unimodal (text), English, 512...",0.21,{'url': 'https://storage.googleapis.com/qdrant...,model_optimized.onnx,
2,BAAI/bge-large-en-v1.5,1024,"Text embeddings, Unimodal (text), English, 512...",1.2,{'hf': 'qdrant/bge-large-en-v1.5-onnx'},model.onnx,
3,BAAI/bge-small-en,384,"Text embeddings, Unimodal (text), English, 512...",0.13,{'url': 'https://storage.googleapis.com/qdrant...,model_optimized.onnx,
4,BAAI/bge-small-en-v1.5,384,"Text embeddings, Unimodal (text), English, 512...",0.067,{'hf': 'qdrant/bge-small-en-v1.5-onnx-q'},model_optimized.onnx,
5,BAAI/bge-small-zh-v1.5,512,"Text embeddings, Unimodal (text), Chinese, 512...",0.09,{'url': 'https://storage.googleapis.com/qdrant...,model_optimized.onnx,
6,sentence-transformers/paraphrase-multilingual-...,384,"Text embeddings, Unimodal (text), Multilingual...",0.22,{'hf': 'qdrant/paraphrase-multilingual-MiniLM-...,model_optimized.onnx,
7,thenlper/gte-large,1024,"Text embeddings, Unimodal (text), English, 512...",1.2,{'hf': 'qdrant/gte-large-onnx'},model.onnx,
8,mixedbread-ai/mxbai-embed-large-v1,1024,"Text embeddings, Unimodal (text), English, 512...",0.64,{'hf': 'mixedbread-ai/mxbai-embed-large-v1'},onnx/model.onnx,
9,snowflake/snowflake-arctic-embed-xs,384,"Text embeddings, Unimodal (text), English, 512...",0.09,{'hf': 'snowflake/snowflake-arctic-embed-xs'},onnx/model.onnx,


In [6]:
## Initialising the dense embedding model
## The model is named explicitly (BAAI/bge-small-en-v1.5, which is also the default)
## so the notebook keeps using the same model even if the library default changes.
dense_embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

In [7]:
## Cosine similarity between the dense embeddings of two sentences
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Materialise the embed() results, then lay each one out as a single-row
# matrix, which is the 2-D shape handed to cosine_similarity below.
films_vectors = list(dense_embedding_model.embed("I like films"))
movies_vectors = list(dense_embedding_model.embed("I like movies"))
vector_a = np.array(films_vectors).reshape(1, -1)
vector_b = np.array(movies_vectors).reshape(1, -1)

# similarity_score is a 1x1 matrix; its single entry is the score of the pair
similarity_score = cosine_similarity(vector_a, vector_b)
pair_score = similarity_score[0][0]

print(f"Cosine Similarity: {pair_score}")


Cosine Similarity: 0.939148485660553


In [8]:
## Listing the sparse embedding models supported by FastEmbed
from fastembed import SparseTextEmbedding

# Last expression of the cell: rendered as a table of the supported sparse models
pd.DataFrame(SparseTextEmbedding.list_supported_models())

Unnamed: 0,model,vocab_size,description,size_in_GB,sources,model_file,additional_files,requires_idf
0,prithivida/Splade_PP_en_v1,30522.0,Independent Implementation of SPLADE++ Model f...,0.532,{'hf': 'Qdrant/SPLADE_PP_en_v1'},model.onnx,,
1,prithvida/Splade_PP_en_v1,30522.0,Independent Implementation of SPLADE++ Model f...,0.532,{'hf': 'Qdrant/SPLADE_PP_en_v1'},model.onnx,,
2,Qdrant/bm42-all-minilm-l6-v2-attentions,30522.0,"Light sparse embedding model, which assigns an...",0.09,{'hf': 'Qdrant/all_miniLM_L6_v2_with_attentions'},model.onnx,[stopwords.txt],True
3,Qdrant/bm25,,BM25 as sparse embeddings meant to be used wit...,0.01,{'hf': 'Qdrant/bm25'},mock.file,"[arabic.txt, azerbaijani.txt, basque.txt, beng...",True


In [9]:
# Load the SPLADE++ sparse embedding model (first entry of the supported-models table).
# NOTE(review): batch_size is passed to the constructor here; confirm the
# constructor honours it (it may belong on embed() instead).
splade_embedding_model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1", batch_size=32)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/532M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

In [11]:
# Embed each sentence with SPLADE; list() materialises the result, which holds
# one SparseEmbedding per sentence (see the cell outputs below).
embedding_a = list(splade_embedding_model.embed("I like films"))
embedding_b = list(splade_embedding_model.embed("I like movies"))

In [16]:
# Inspect the raw SPLADE vector for "I like films": weights (values) and token ids (indices)
embedding_a

[SparseEmbedding(values=array([0.94623828, 0.39230481, 0.34555048, 0.98629087, 2.32841992,
        2.66329169, 0.38755792, 1.01045001, 0.8589344 , 1.55728137,
        0.05035508, 0.01486452, 0.07088873, 2.22820854, 2.26132703,
        0.03131655, 0.54804105, 0.12744598, 1.21080792, 0.22443028,
        0.54885274, 1.65060639, 0.46731812, 0.25375697]), indices=array([1045, 2017, 2026, 2055, 2066, 2143, 2189, 2204, 2215, 2293, 2338,
        2396, 2806, 3152, 3185, 3226, 3835, 4368, 4669, 4676, 5223, 5691,
        5988, 6907]))]

In [17]:
# Inspect the raw SPLADE vector for "I like movies": weights (values) and token ids (indices)
embedding_b

[SparseEmbedding(values=array([0.82452905, 0.2727384 , 0.3407366 , 0.90728402, 2.20379853,
        2.21208191, 0.0680519 , 1.08687437, 0.23361236, 0.69847447,
        1.52581418, 0.1723327 , 0.19406748, 0.24051629, 0.98372197,
        2.58097696, 0.42432949, 0.10421138, 0.97070301, 0.09078193,
        0.12890291, 2.3312254 , 0.42535451, 0.2519685 ]), indices=array([1045, 2017, 2026, 2055, 2066, 2143, 2189, 2204, 2208, 2215, 2293,
        2338, 2399, 2759, 3152, 3185, 3835, 4368, 4669, 4676, 5223, 5691,
        5988, 6907]))]

In [12]:
import json
from tokenizers import Tokenizer

# Locate the SPLADE entry by model name instead of assuming it sits at a fixed
# position in list_supported_models(); the ordering of that list may change.
splade_model_name = "prithivida/Splade_PP_en_v1"
supported_sparse_models = SparseTextEmbedding.list_supported_models()
index = next(
    (
        position
        for position, model_info in enumerate(supported_sparse_models)
        if model_info["model"] == splade_model_name
    ),
    None,
)
if index is None:
    raise ValueError(f"{splade_model_name} is not listed by SparseTextEmbedding.list_supported_models()")

# The tokenizer is fetched from the Hugging Face repo recorded under sources["hf"];
# it is used to turn the sparse indices back into readable tokens.
tokenizer = Tokenizer.from_pretrained(supported_sparse_models[index]["sources"]["hf"])

In [13]:
def get_tokens_and_weights(sparse_embedding, tokenizer):
    """Map every token of a sparse embedding to its weight, heaviest first.

    Args:
        sparse_embedding: Object exposing parallel ``indices`` (token ids) and
            ``values`` (weights) sequences, e.g. a FastEmbed ``SparseEmbedding``.
        tokenizer: Object whose ``decode(list_of_ids)`` returns the token text.

    Returns:
        dict: ``{token_text: weight}`` ordered by descending weight. Weights
        are plain Python floats, so the result can be passed to ``json.dumps``
        whatever the NumPy dtype of ``values`` is. If two ids decode to the
        same text, the later one wins.
    """
    # int()/float() strip the NumPy scalar types: ids become plain ints for the
    # tokenizer and weights become JSON-serialisable floats.
    token_weight_dict = {
        tokenizer.decode([int(token_id)]): float(weight)
        for token_id, weight in zip(sparse_embedding.indices, sparse_embedding.values)
    }

    # Sort the dictionary by weights (largest first)
    return dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True))

In [14]:
# Show the decoded tokens and weights of the SPLADE embedding for "I like films"
print(json.dumps(get_tokens_and_weights(embedding_a[0], tokenizer), indent=4))

{
    "film": 2.6632916927337646,
    "like": 2.3284199237823486,
    "movie": 2.261327028274536,
    "films": 2.228208541870117,
    "movies": 1.650606393814087,
    "love": 1.5572813749313354,
    "liked": 1.2108079195022583,
    "good": 1.010450005531311,
    "about": 0.9862908720970154,
    "i": 0.9462382793426514,
    "want": 0.8589344024658203,
    "hate": 0.5488527417182922,
    "nice": 0.548041045665741,
    "cinema": 0.4673181176185608,
    "you": 0.39230480790138245,
    "music": 0.3875579237937927,
    "my": 0.3455504775047302,
    "genre": 0.25375697016716003,
    "religion": 0.22443027794361115,
    "sport": 0.1274459809064865,
    "style": 0.07088872790336609,
    "book": 0.05035507678985596,
    "culture": 0.031316548585891724,
    "art": 0.014864521101117134
}


In [15]:
# Show the decoded tokens and weights of the SPLADE embedding for "I like movies"
print(json.dumps(get_tokens_and_weights(embedding_b[0], tokenizer), indent=4))

{
    "movie": 2.580976963043213,
    "movies": 2.3312253952026367,
    "film": 2.2120819091796875,
    "like": 2.203798532485962,
    "love": 1.525814175605774,
    "good": 1.0868743658065796,
    "films": 0.9837219715118408,
    "liked": 0.9707030057907104,
    "about": 0.9072840213775635,
    "i": 0.8245290517807007,
    "want": 0.6984744668006897,
    "cinema": 0.4253545105457306,
    "nice": 0.4243294894695282,
    "my": 0.34073659777641296,
    "you": 0.27273839712142944,
    "genre": 0.25196850299835205,
    "popular": 0.2405162900686264,
    "game": 0.23361235857009888,
    "games": 0.19406747817993164,
    "book": 0.17233270406723022,
    "hate": 0.12890291213989258,
    "sport": 0.10421138256788254,
    "religion": 0.09078193455934525,
    "music": 0.06805189698934555
}


In [19]:
import numpy as np

def sparse_vector_similarity(emb1, emb2):
    """Dot-product similarity between two sparse embeddings.

    The score is calculated by multiplying corresponding elements of the two
    vectors and summing these products. This is particularly effective with
    sparse vectors, where many elements are zero: only indices present in both
    vectors contribute, which keeps the computation cheap. The higher the
    score, the greater the similarity between the two documents.

    Args:
        emb1: Object exposing parallel ``indices`` and ``values`` sequences.
        emb2: Object exposing parallel ``indices`` and ``values`` sequences.

    Returns:
        float: Sum of the products of the weights at the shared indices;
        ``0.0`` when the two embeddings share no index.
    """
    # Convert sparse embeddings to index -> weight dictionaries for fast lookup
    weights_1 = dict(zip(emb1.indices, emb1.values))
    weights_2 = dict(zip(emb2.indices, emb2.values))

    # Indices missing from either vector multiply against an implicit zero,
    # so only the common ones need to be visited.
    common_indices = set(weights_1.keys()).intersection(weights_2.keys())

    # float() gives one consistent return type: a bare sum() yields the int 0
    # when nothing is shared and a NumPy scalar otherwise.
    return float(sum(weights_1[idx] * weights_2[idx] for idx in common_indices))



# Dot-product similarity between the SPLADE embeddings of the two sentences
similarity = sparse_vector_similarity(embedding_a[0], embedding_b[0])
print(f"Similarity score: {similarity}")


Similarity score: 30.68323275421868


In [21]:
# Load FastEmbed's BM25 sparse model directly from its implementation module.
# NOTE(review): "Qdrant/bm25" is also listed by SparseTextEmbedding.list_supported_models(),
# so SparseTextEmbedding(model_name="Qdrant/bm25") may be an equivalent way to load it — confirm.
from fastembed.sparse.bm25 import Bm25
bm25_embedding_model = Bm25("Qdrant/bm25")

Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

chinese.txt:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

danish.txt:   0%|          | 0.00/424 [00:00<?, ?B/s]

azerbaijani.txt:   0%|          | 0.00/967 [00:00<?, ?B/s]

arabic.txt:   0%|          | 0.00/6.35k [00:00<?, ?B/s]

basque.txt:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

catalan.txt:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

bengali.txt:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

english.txt:   0%|          | 0.00/936 [00:00<?, ?B/s]

finnish.txt:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

french.txt:   0%|          | 0.00/813 [00:00<?, ?B/s]

dutch.txt:   0%|          | 0.00/453 [00:00<?, ?B/s]

hinglish.txt:   0%|          | 0.00/5.96k [00:00<?, ?B/s]

greek.txt:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

hungarian.txt:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

indonesian.txt:   0%|          | 0.00/6.45k [00:00<?, ?B/s]

italian.txt:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

german.txt:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

hebrew.txt:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

kazakh.txt:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

norwegian.txt:   0%|          | 0.00/851 [00:00<?, ?B/s]

nepali.txt:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

portuguese.txt:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

romanian.txt:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

slovene.txt:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

russian.txt:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

spanish.txt:   0%|          | 0.00/2.18k [00:00<?, ?B/s]

swedish.txt:   0%|          | 0.00/559 [00:00<?, ?B/s]

tajik.txt:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

turkish.txt:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [22]:
# Convert vectors to a list of sparse vectors
bm25_embedding_a = list(bm25_embedding_model.embed("I like films"))
bm25_embedding_b = list(bm25_embedding_model.embed("I like movies"))

In [23]:
# Inspect the raw BM25 vector for "I like films"
bm25_embedding_a

[SparseEmbedding(values=array([1.67868852, 1.67868852, 1.67868852]), indices=array([1418846154,  366928855,  103666345]))]

In [24]:
# Inspect the raw BM25 vector for "I like movies" (compare its indices with bm25_embedding_a)
bm25_embedding_b

[SparseEmbedding(values=array([1.67868852, 1.67868852, 1.67868852]), indices=array([1418846154,  366928855,  179795244]))]

In [25]:
# NOTE(review): entry [2] of list_supported_models() is shown in the sparse-models table
# as "Qdrant/bm42-all-minilm-l6-v2-attentions", not "Qdrant/bm25" (entry [3]), so this
# loads the BM42 tokenizer despite the variable name — confirm the intent.
# NOTE(review): `index` and `tokenizer_bm25` are not used by any later cell visible here.
import json
from tokenizers import Tokenizer
index = 0
tokenizer_bm25 = Tokenizer.from_pretrained(SparseTextEmbedding.list_supported_models()[2]["sources"]["hf"])

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

In [34]:
# Dot-product similarity between the two BM25 vectors. Per the outputs above they
# share two of their three indices, so only those two products contribute:
# the differing third index ("films" vs "movies") adds nothing to the score.
similarity_bm25 = sparse_vector_similarity(bm25_embedding_a[0], bm25_embedding_b[0])
print(f"BM25 Similarity score: {similarity_bm25}")


BM25 Similarity score: 5.635990325181404
