In [1]:
%pip install sentence-transformers==4.1.0 | tail -n 1

Note: you may need to restart the kernel to use updated packages.


In [4]:
import math
import tqdm as notebook_tqdm

import numpy as np
import scipy
import torch
from sentence_transformers import SentenceTransformer

In [5]:
# Four short sentences that all open with "Bugs": the first two are about
# software defects, the last two about insects and spiders.
documents = [
    "Bugs introduced by the intern had to be squashed by the lead developer.",
    "Bugs found by the quality assurance engineer were difficult to debug.",
    "Bugs are common throughout the warm summer months, according to the entomologist.",
    "Bugs, in particular spiders, are extensively studied by arachnologists.",
]

In [6]:
# Load a pre-trained sentence-embedding model. The (4, 384) shape reported
# for the encoded documents further down shows that it produces
# 384-dimensional vectors.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [7]:
embeddings = model.encode(documents)

In [8]:
embeddings.shape

(4, 384)

# Manual implementation of L2 (Euclidean) distance calculation


In [9]:
def euclidean_distance_fn(vector1, vector2):
    """Return the L2 (Euclidean) distance between two vectors.

    The vectors are paired component-by-component with ``zip``, so when their
    lengths differ the surplus components of the longer one are ignored.
    """
    squared_diffs = ((a - b) ** 2 for a, b in zip(vector1, vector2))
    return math.sqrt(sum(squared_diffs))

In [10]:
euclidean_distance_fn(embeddings[0], embeddings[1])

5.96179017134276

In [None]:
euclidean_distance_fn(embeddings[1], embeddings[0])
# Swapping the two arguments gives the same result: L2 distance is symmetric.

5.96179017134276

In [12]:
print(embeddings[0])

[-0.22804333 -0.246477   -0.00319237 -0.4011369   0.58040327 -0.07540709
 -0.1585729   0.24768439 -0.15160525  0.01536422  0.00947509  0.4250167
  0.18076804 -0.04996739 -0.17622906  0.2744977  -0.632379   -0.08400841
 -0.01563674  0.3895156  -0.4088379  -0.07366164  0.21534097 -0.3809455
 -0.03634035  0.38411656  0.16714825 -0.5633113  -0.03045717 -0.15912974
  0.28667477  0.27080914  0.49389985 -0.06697895  0.15861714 -0.0722226
 -0.27289623 -0.07229342  0.15310524 -0.103544   -0.20727664  0.4120065
  0.04627789 -0.24674499 -0.15725222 -0.52524614 -0.40528253 -0.09747362
 -0.06160371 -0.32968023  0.31149215 -0.03460407 -0.22514832  0.04832362
 -0.04793397 -0.05118244  0.30874708  0.36401907 -0.14748739  0.64699197
  0.12886815 -0.05396593 -0.0100518   0.14999984  0.15292728 -0.27913466
 -0.07968055 -0.4148341   0.11531564 -0.20606102 -0.06508962 -0.40328267
 -0.5096472   0.065418    0.48177364 -0.06020098  0.07172662  0.3426946
  0.4951734  -0.07550937 -0.1714772  -0.17710747  0.0199

In [20]:
# Pairwise L2 distances between all embeddings. Each call passes the vectors
# in (j, i) order; because L2 distance is symmetric, entry [i, j] equals the
# value obtained with the arguments in (i, j) order.
n_vectors = embeddings.shape[0]  # size the matrix from the data, not a hard-coded 4
l2_dist_manual = np.zeros([n_vectors, n_vectors])
for i in range(n_vectors):
    for j in range(n_vectors):
        l2_dist_manual[i,j] = euclidean_distance_fn(embeddings[j], embeddings[i])

l2_dist_manual

array([[0.        , 5.96179017, 7.33939885, 7.15578169],
       [5.96179017, 0.        , 7.76861748, 7.393591  ],
       [7.33939885, 7.76861748, 0.        , 5.919928  ],
       [7.15578169, 7.393591  , 5.919928  , 0.        ]])

In [16]:
# Pairwise L2 distances: entry [i, j] is the distance between embedding i
# and embedding j.
n_vectors = embeddings.shape[0]  # size the matrix from the data, not a hard-coded 4
l2_dist_manual = np.zeros([n_vectors, n_vectors])
for i in range(n_vectors):
    for j in range(n_vectors):
        l2_dist_manual[i,j] = euclidean_distance_fn(embeddings[i], embeddings[j])

l2_dist_manual

array([[0.        , 5.96179017, 7.33939885, 7.15578169],
       [5.96179017, 0.        , 7.76861748, 7.393591  ],
       [7.33939885, 7.76861748, 0.        , 5.919928  ],
       [7.15578169, 7.393591  , 5.919928  , 0.        ]])

In [17]:
# l2_dist_manual is a 4×4 array where each element represents the L2 distance between two vectors: the vector at the row index and the vector at the column index. For example, the distance between the first vector (index 0) and the second vector (index 1) is located at position [0, 1] in the array:

In [18]:
l2_dist_manual[0,1]

np.float64(5.96179017134276)

In [19]:
l2_dist_manual[1,0]

np.float64(5.96179017134276)

In [21]:
# Exercise 1 - Make the manual calculation more efficient 

In [22]:
# The code used to populate the `l2_dist_manual` array is not very efficient. First, it redundantly calculates the distance between a vector and itself, even though the L2 distance in such cases is always zero. Second, the array is symmetric—meaning the distance between vectors at indices $i$ and $j$ is the same as between $j$ and $i$. Therefore, each distance only needs to be computed once.

#In the cell below, write an improved version of the code that avoids these inefficiencies.

In [23]:
# Fill the symmetric distance matrix while calling the distance function only
# once per unordered pair of vectors. The diagonal keeps the 0 written by
# np.zeros, since the L2 distance between a vector and itself is zero.
n_vectors = embeddings.shape[0]
l2_dist_manual_improved = np.zeros([n_vectors, n_vectors])
for i in range(n_vectors):
    for j in range(i + 1, n_vectors):  # strict upper triangle only
        pair_distance = euclidean_distance_fn(embeddings[i], embeddings[j])
        l2_dist_manual_improved[i,j] = pair_distance
        # Mirror into the lower triangle of this same array, so the result
        # never has to be read back from the slower `l2_dist_manual`.
        l2_dist_manual_improved[j,i] = pair_distance

l2_dist_manual_improved

array([[0.        , 5.96179017, 7.33939885, 7.15578169],
       [5.96179017, 0.        , 7.76861748, 7.393591  ],
       [7.33939885, 7.76861748, 0.        , 5.919928  ],
       [7.15578169, 7.393591  , 5.919928  , 0.        ]])

In [24]:
# Calculate L2 distance using SciPy's `cdist`, which computes the distance
# between every pair of rows taken from its two input arrays.
l2_dist_scipy = scipy.spatial.distance.cdist(embeddings, embeddings, 'euclidean')
l2_dist_scipy

array([[0.        , 5.96179018, 7.33940023, 7.15578141],
       [5.96179018, 0.        , 7.76861766, 7.39359074],
       [7.33940023, 7.76861766, 0.        , 5.91992798],
       [7.15578141, 7.39359074, 5.91992798, 0.        ]])

In [25]:
np.allclose(l2_dist_manual, l2_dist_scipy)

True

In [26]:
#Interpret the L2 Distance Results
#An analysis of l2_dist_scipy shows that, in this case, the L2 distance metric performed well for similarity search. For example, the first vector—corresponding to the sentence "Bugs introduced by the intern had to be squashed by the lead developer."—had the smallest distance to the second vector, which represents the sentence "Bugs found by the quality assurance engineer were difficult to debug." This result aligns with expectations, as both sentences refer to programming bugs.

#Similarly, the third and fourth sentences—both related to physical bugs rather than programming—were closest to each other in terms of distance, which again matches our intuition.

In [28]:
#Dot Product Similarity and Distance
# Manual implementation of dot product calculation
def dot_product_fn(vector1, vector2):
    """Return the dot product of two vectors.

    Components are paired with ``zip``, so surplus components of the longer
    vector are ignored when the lengths differ.
    """
    pairwise_products = (a * b for a, b in zip(vector1, vector2))
    return sum(pairwise_products)

In [29]:
dot_product_fn(embeddings[0], embeddings[1])

np.float32(18.535393)

In [30]:
# Pairwise dot products: entry [i, j] is the dot product of embedding i and
# embedding j. np.empty is sufficient because every entry is assigned below.
n_vectors = embeddings.shape[0]  # size the matrix from the data, not a hard-coded 4
dot_product_manual = np.empty([n_vectors, n_vectors])
for i in range(n_vectors):
    for j in range(n_vectors):
        dot_product_manual[i,j] = dot_product_fn(embeddings[i], embeddings[j])

dot_product_manual

array([[33.74440002, 18.53539276,  8.56981659,  7.83093262],
       [18.53539276, 38.86931992,  7.88995934,  8.66340637],
       [ 8.56981659,  7.88995934, 37.2620163 , 17.66956329],
       [ 7.83093262,  8.66340637, 17.66956329, 33.12266541]])

In [31]:
#Calculate the dot product using matrix multiplication
#We can compute the dot product efficiently using the matrix multiplication operator @. To do this, we multiply the embeddings matrix by its transpose. Since embeddings has a shape of 4×384, its transpose will be 384×4. Multiplying these gives us a 4×4 matrix, which is the desired result:

In [32]:
# Matrix multiplication operator
dot_product_operator = embeddings @ embeddings.T
dot_product_operator

array([[33.744415 , 18.5354   ,  8.569814 ,  7.8309345],
       [18.5354   , 38.86933  ,  7.889962 ,  8.663405 ],
       [ 8.569814 ,  7.889962 , 37.262016 , 17.669561 ],
       [ 7.8309345,  8.663405 , 17.669561 , 33.122658 ]], dtype=float32)

In [33]:
#We can verify that the matrix multiplication operator returns the same result as our custom function after accounting for rounding:

In [34]:
np.allclose(dot_product_manual, dot_product_operator, atol=1e-05)

True

In [None]:
#Equivalently, if both of the matrices we want to multiply are two-dimensional, we can use the matmul() function from numpy instead:

In [36]:
# The `@` operator is equivalent to `np.matmul()` when both arrays are 2-D:
np.matmul(embeddings,embeddings.T)

array([[33.744415 , 18.5354   ,  8.569814 ,  7.8309345],
       [18.5354   , 38.86933  ,  7.889962 ,  8.663405 ],
       [ 8.569814 ,  7.889962 , 37.262016 , 17.669561 ],
       [ 7.8309345,  8.663405 , 17.669561 , 33.122658 ]], dtype=float32)

In [37]:
# `np.dot` returns an identical result, but `np.matmul` is recommended if both arrays are 2-D:
np.dot(embeddings,embeddings.T)

array([[33.744415 , 18.5354   ,  8.569814 ,  7.8309345],
       [18.5354   , 38.86933  ,  7.889962 ,  8.663405 ],
       [ 8.569814 ,  7.889962 , 37.262016 , 17.669561 ],
       [ 7.8309345,  8.663405 , 17.669561 , 33.122658 ]], dtype=float32)

In [39]:
#Calculate dot product distance
# The dot product between two vectors provides a similarity score. If, on the other hand, we would like a distance, we can simply take the negative of the dot product:

In [40]:
dot_product_distance = -dot_product_manual
dot_product_distance

array([[-33.74440002, -18.53539276,  -8.56981659,  -7.83093262],
       [-18.53539276, -38.86931992,  -7.88995934,  -8.66340637],
       [ -8.56981659,  -7.88995934, -37.2620163 , -17.66956329],
       [ -7.83093262,  -8.66340637, -17.66956329, -33.12266541]])

In [41]:
# Although it might seem unusual for all the distances to be negative, the property that matters for similarity search is still preserved: smaller values indicate lower distance and thus greater similarity. (Strictly speaking, the negative dot product is not a true distance metric, since it can be negative and a vector's distance to itself is not zero.) So even with negative values, the relative comparisons remain valid—lower values still correspond to shorter distances.

In [42]:
#Cosine Similarity and Distance

In [43]:
#Manual implementation of cosine similarity calculation
#Since we’ve already covered how to compute the dot product, our strategy for manually calculating cosine similarity will focus on normalizing the vectors. This is because cosine similarity is simply the dot product of two normalized vectors, as was shown after the last equals sign in the cosine similarity calculation formula.

#However, in order to normalize vectors, we must first compute their L2 norms.

#Calculate the L2 norm
#The following calculates the L2 norms for all the vectors in the embeddings array. The calculation simply squares each vector component, sums across columns (note the axis=1 parameter in the sum), and takes a square root:

In [44]:
# Per-row L2 norm: square every component, add the squares up along each row
# (axis=1), then take the square root.
l2_norms = np.sqrt((embeddings ** 2).sum(axis=1))
l2_norms

array([5.8089943, 6.2345276, 6.1042614, 5.7552285], dtype=float32)

In [45]:
# Turn the 1-D vector of norms into a single-column 2-D array so that it
# broadcasts row-wise when the embeddings are divided by it.
l2_norms_reshaped = l2_norms[:, np.newaxis]
l2_norms_reshaped

array([[5.8089943],
       [6.2345276],
       [6.1042614],
       [5.7552285]], dtype=float32)

In [46]:
# Normalize embedding vectors

#The following code calculates normalized embedding vectors by dividing every component in the vector by the vector's L2 norm:

In [47]:
normalized_embeddings_manual = embeddings/l2_norms_reshaped
normalized_embeddings_manual

array([[-0.03925694, -0.04243023, -0.00054956, ...,  0.07837522,
         0.10917508,  0.09252975],
       [-0.05740864, -0.05146182,  0.02560457, ..., -0.01130911,
         0.14876871,  0.05514007],
       [ 0.03326033, -0.0440653 ,  0.02667837, ..., -0.03219225,
        -0.00553686,  0.09757371],
       [-0.00740946, -0.07944359, -0.01655278, ..., -0.10083131,
         0.02996998,  0.0158601 ]], shape=(4, 384), dtype=float32)

In [48]:
#Exercise 2 - Verify that vectors are normalized

#Verify that normalized_embeddings_manual are normalized vectors by making sure that the length of each vector is equal to 1.

In [49]:

# Note that the length of a vector can be found by taking its L2 norm. So, to solve this exercise, you just have to calculate the L2 norm of each row of `normalized_embeddings_manual`, and verify that the norms are equal to or very close to 1:

np.sqrt(np.sum(normalized_embeddings_manual**2, axis=1))

array([1.        , 0.99999994, 1.        , 1.        ], dtype=float32)

In [50]:
#Normalize embeddings using PyTorch
#You can normalize embeddings in PyTorch using torch.nn.functional.normalize(). If your data is in a NumPy array, first convert it to a PyTorch tensor using torch.from_numpy(). After normalization, convert the tensor back to a NumPy array using the numpy() method:

In [51]:
# L2-normalize each row with PyTorch: NumPy array -> tensor -> normalize ->
# NumPy array. p=2.0 and dim=1 are the defaults of `normalize`, spelled out
# here so the norm and the axis are explicit.
normalized_embeddings_torch = torch.nn.functional.normalize(
    torch.from_numpy(embeddings), p=2.0, dim=1
).numpy()
normalized_embeddings_torch

array([[-0.03925694, -0.04243023, -0.00054956, ...,  0.07837522,
         0.10917508,  0.09252975],
       [-0.05740863, -0.05146182,  0.02560457, ..., -0.01130911,
         0.1487687 ,  0.05514007],
       [ 0.03326033, -0.0440653 ,  0.02667837, ..., -0.03219225,
        -0.00553686,  0.09757371],
       [-0.00740946, -0.07944359, -0.01655278, ..., -0.10083131,
         0.02996998,  0.0158601 ]], shape=(4, 384), dtype=float32)

In [52]:
#We can verify that the normalized embeddings we calculated manually and the normalized embeddings calculated using torch are indeed the same (up to floating-point tolerance) using numpy's allclose() function:

In [53]:
np.allclose(normalized_embeddings_manual, normalized_embeddings_torch)

True

In [54]:
#Calculate cosine similarity manually
#To calculate cosine similarity between two normalized embedding vectors, we simply take their dot product. To do this, we can leverage the dot product function we defined before. For instance, the following calculates the cosine similarity between the vector embeddings of the first and second sentence:

In [55]:
dot_product_fn(normalized_embeddings_manual[0], normalized_embeddings_manual[1])

np.float32(0.5117967)

In [56]:
#Likewise, to calculate the cosine similarities between all normalized vectors, we can use a nested loop:

In [57]:
# Pairwise cosine similarities: the rows of `normalized_embeddings_manual`
# have unit length, so the dot product of two rows is their cosine similarity.
# np.empty is sufficient because every entry is assigned below.
n_vectors = normalized_embeddings_manual.shape[0]  # size from the data, not a hard-coded 4
cosine_similarity_manual = np.empty([n_vectors, n_vectors])
for i in range(n_vectors):
    for j in range(n_vectors):
        cosine_similarity_manual[i,j] = dot_product_fn(
            normalized_embeddings_manual[i], 
            normalized_embeddings_manual[j]
        )

cosine_similarity_manual

array([[1.00000012, 0.51179671, 0.24167804, 0.23423393],
       [0.51179671, 1.00000024, 0.20731851, 0.24144739],
       [0.24167804, 0.20731851, 1.00000083, 0.50295591],
       [0.23423393, 0.24144739, 0.50295591, 1.00000024]])

In [58]:
#Cosine similarity ranges from -1 to 1. The cosine similarity matrix is symmetric, just like the matrices for L2 distance and the dot product. In this example, cosine similarity performed well: the first two sentences were most similar to each other, as were the last two—matching our expectations.

In [59]:
#Calculate cosine similarity using matrix multiplication
#Just like with the dot product, we can compute cosine similarity using matrix algebra. By multiplying the matrix of normalized embeddings with its transpose using the matrix multiplication operator, we obtain the cosine similarity matrix. This works because, once vectors are normalized, cosine similarity can be calculated by simply taking the dot product:

In [60]:
cosine_similarity_operator = normalized_embeddings_manual @ normalized_embeddings_manual.T
cosine_similarity_operator

array([[0.99999994, 0.5117967 , 0.24167815, 0.23423405],
       [0.5117967 , 1.        , 0.20731854, 0.24144736],
       [0.24167815, 0.20731854, 1.0000002 , 0.5029561 ],
       [0.23423405, 0.24144736, 0.5029561 , 1.0000001 ]], dtype=float32)

In [61]:
#We can verify that the matrix algebra solution is the same as the one found using the nested loop:

In [62]:
np.allclose(cosine_similarity_manual, cosine_similarity_operator)

True

In [64]:
#Calculate cosine distance

#Using numpy, this can be calculated as follows:
1 - cosine_similarity_manual

array([[-1.19209290e-07,  4.88203287e-01,  7.58321956e-01,
         7.65766069e-01],
       [ 4.88203287e-01, -2.38418579e-07,  7.92681485e-01,
         7.58552611e-01],
       [ 7.58321956e-01,  7.92681485e-01, -8.34465027e-07,
         4.97044086e-01],
       [ 7.65766069e-01,  7.58552611e-01,  4.97044086e-01,
        -2.38418579e-07]])

In [65]:
# Exercise 3 - Similarity Search Using a Query

# In the above examples, we calculated similarity between 4 documents:

documents = [
    'Bugs introduced by the intern had to be squashed by the lead developer.',
    'Bugs found by the quality assurance engineer were difficult to debug.',
    'Bugs are common throughout the warm summer months, according to the entomologist.',
    'Bugs, in particular spiders, are extensively studied by arachnologists.'
]

#Now, your task is to find which of these 4 documents is most similar to the query "Who is responsible for a coding project and fixing others' mistakes?" using cosine similarity. You can reuse the `documents` list and the `normalized_embeddings_manual` array in your answer:

In [66]:
# First, embed the query (passed to the model as a one-element list):
query_embedding = model.encode(
    ["Who is responsible for a coding project and fixing others' mistakes?"]
)

# Second, normalize the query embedding to unit length:
normalized_query_embedding = torch.nn.functional.normalize(
    torch.from_numpy(query_embedding)
).numpy()

# Third, calculate the cosine similarity between the documents and the query.
# Both sides are unit-normalized, so the matrix product of the document
# embeddings with the transposed query embedding yields cosine similarities:
cosine_similarity_q3 = normalized_embeddings_manual @ normalized_query_embedding.T

# Fourth, find the position of the vector with the highest cosine similarity.
# `argmax()` without an axis indexes the flattened array; this matches the
# document index as long as the product has a single column (one query) —
# NOTE(review): confirm if more than one query is ever passed.
highest_cossim_position = cosine_similarity_q3.argmax()

# Fifth, look up the document at that position in the `documents` list:
documents[highest_cossim_position]

# As you can see, the query retrieved the document `Bugs introduced by the intern had to be squashed by the lead developer.` which is what we would expect.

'Bugs introduced by the intern had to be squashed by the lead developer.'