# Module 1: MaxSim Distance Metric

In [1]:
query = "apple computer"
document = "Apple makes the MacBook laptop"

In [2]:
from fastembed import LateInteractionTextEmbedding

# Load the colbert-ir/colbertv2.0 model
colbert_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

# Create multi-vector representations of the query and document
query_vector = next(colbert_model.query_embed(query))
document_vector = next(colbert_model.passage_embed(document))

In [3]:
query_tokenization = colbert_model.model.tokenize([query])[0]
query_tokenization.tokens

['[CLS]', 'apple', 'computer', '[SEP]']

In [4]:
document_tokenization = colbert_model.model.tokenize([document])[0]
document_tokenization.tokens

['[CLS]', 'apple', 'makes', 'the', 'mac', '##book', 'laptop', '[SEP]']

In [15]:
import numpy as np

similarity = 0.0
for qt, qt_vector in zip(query_tokenization.tokens, 
                         query_vector):
    max_idx, max_sim = 0, np.dot(qt_vector, document_vector[0])
    for i, dt_vector in enumerate(document_vector[1:], start=1):
        distance = np.dot(qt_vector, dt_vector)
        if distance > max_sim:
            max_idx, max_sim = i, distance

    print(qt, max_idx, max_sim)
    similarity += max_sim

[CLS] 8 0.85443306
apple 2 0.8940004
computer 2 0.9118513
[SEP] 7 0.6615789


In [16]:
print("MaxSim(Q, D) =", similarity)

MaxSim(Q, D) = 3.3218636512756348


In [19]:
similarity = 0.0
for dt, dt_vector in zip(document_tokenization.tokens, 
                         document_vector):
    max_idx, max_sim = 0, np.dot(dt_vector, query_vector[0])
    for i, qt_vector in enumerate(query_vector[1:], start=1):
        distance = np.dot(dt_vector, qt_vector)
        if distance > max_sim:
            max_idx, max_sim = i, distance

    print(dt, max_idx, max_sim)
    similarity += max_sim

[CLS] 0 0.8310971
apple 22 0.64755857
makes 2 0.9118513
the 0 0.38841254
mac 12 0.67822593
##book 21 0.46869862
laptop 20 0.37343562
[SEP] 5 0.680282


In [20]:
print("MaxSim(D, Q) =", similarity)

MaxSim(D, Q) = 4.979561686515808
