Cell 1: Import necessary libraries and set up logging


In [4]:
!pip install sentence-transformers


Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/58/4b/922436953394e1bfda05e4bf1fe0e80f609770f256c59a9df7a9254f3e0d/sentence_transformers-3.0.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1
[0m

In [6]:
# Import logging utilities to control the verbosity of the output from transformers
from transformers.utils import logging

# Set logging level to only display errors to keep the output clean
logging.set_verbosity_error()

# Import the SentenceTransformer class to load the pre-trained model,
# and the util helpers (e.g. cos_sim) used later for similarity scoring
from sentence_transformers import SentenceTransformer, util


Cell 2: Loading the model and encoding the first set of sentences


In [9]:
# Instantiate the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model.
# On first use the model files are downloaded (see the progress bars in the cell output).
model = SentenceTransformer("all-MiniLM-L6-v2")

# First batch of sentences to embed
sentences1 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The movies are awesome",
]

# Turn each sentence into an embedding vector.
# convert_to_tensor=True makes encode() hand back a tensor, ready for PyTorch operations.
embeddings1 = model.encode(
    sentences1,
    convert_to_tensor=True,
)

# Show the embeddings as a quick sanity check that encoding succeeded
print(embeddings1)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tensor([[ 0.1392,  0.0030,  0.0470,  ...,  0.0641, -0.0163,  0.0636],
        [ 0.0227, -0.0014, -0.0056,  ..., -0.0225,  0.0846, -0.0283],
        [-0.1043, -0.0628,  0.0093,  ...,  0.0020,  0.0653, -0.0150]],
       device='mps:0')


Cell 3: Encoding the second set of sentences

In [12]:
# Second batch of sentences; each one is later compared with the sentence
# at the same position in the first batch
sentences2 = [
    "The dog plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
]

# Embed the second batch with the same model, again returning a tensor
embeddings2 = model.encode(
    sentences2,
    convert_to_tensor=True,
)

# Show the embeddings as a quick sanity check that encoding succeeded
print(embeddings2)


tensor([[ 0.0163, -0.0700,  0.0384,  ...,  0.0447,  0.0254, -0.0023],
        [ 0.0054, -0.0920,  0.0140,  ...,  0.0167, -0.0086, -0.0424],
        [-0.0842, -0.0592, -0.0010,  ..., -0.0157,  0.0764,  0.0389]],
       device='mps:0')


Cell 4: Computing cosine similarity

In [17]:
# Score every sentence of the first set against every sentence of the second set.
# Entry [i][j] of the result is the cosine similarity between sentences1[i] and sentences2[j].
# (`util` is imported together with the other libraries in the import cell at the top,
# so all of the notebook's dependencies are declared in one place.)
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Print the cosine similarity matrix
print(cosine_scores)


tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0327, -0.0136],
        [-0.0124, -0.0465,  0.6571]], device='mps:0')


Cell 5: Displaying individual cosine similarity scores

In [19]:
# Print the cosine similarity for each aligned pair (sentences1[i], sentences2[i]).
# Pairing by position only makes sense when both lists have the same length, so fail
# early with a clear message instead of an IndexError halfway through the loop.
assert len(sentences1) == len(sentences2), (
    "sentences1 and sentences2 must have the same length to be compared pairwise"
)

# The aligned-pair scores are the diagonal of the similarity matrix (cosine_scores[i][i]).
# .tolist() converts them to plain Python floats in a single step instead of
# formatting a 0-dim device tensor on every iteration.
pair_scores = cosine_scores.diagonal().tolist()

# {:.4f} formats the score to 4 decimal places for readability
for first, second, score in zip(sentences1, sentences2, pair_scores):
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, score))


The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The movies are awesome 		 The new movie is so great 		 Score: 0.6571
