In [3]:
# Optional one-time setup: uncomment the next line to install/upgrade the library.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install -U sentence-transformers

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [8]:
import os

# Machine-specific folder that sentence-transformers is pointed at for its
# model cache (adjust this path for your own setup).
MODEL_CACHE_DIR = r'D:\AI-DATASETS\07-Hugging-Face-Data\sentence-transformers'
os.environ["SENTENCE_TRANSFORMERS_HOME"] = MODEL_CACHE_DIR

##### all-MiniLM-L6-v2

- It maps sentences and paragraphs to a `384-dimensional` dense vector space.
- The resulting embeddings can be used for tasks like `clustering` or `semantic search`.

In [9]:
# Load the pretrained sentence-embedding model by its model id
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

#### Create embeddings

In [10]:
# Encode a small batch of example sentences in a single call
sentences = [
    'This is an example.',
    'Sentence embeddings are useful!',
    'Doddaballapur Road',
]

embeddings = model.encode(sentences)

# Display the array shape: one row per sentence, one column per embedding dimension
embeddings.shape

(3, 384)

#### Comparing Sentence Similarities

In [11]:
# Embed the two sentences we want to compare
emb1 = model.encode("This is a red cat with a hat.")
emb2 = model.encode("Have you seen my red cat?")

# Rescale both vectors to unit length (divide by their L2 norm)
emb1_norm = emb1 / np.linalg.norm(emb1)
emb2_norm = emb2 / np.linalg.norm(emb2)

# For unit-length vectors the dot product is the cosine similarity
cos_sim = emb1_norm.dot(emb2_norm)

print("Cosine-Similarity:", cos_sim)

Cosine-Similarity: 0.6153064


#### Comparing similarities across more sentences

In [12]:
from sentence_transformers import SentenceTransformer, util

In [13]:
# Small corpus used to compare every sentence against every other one
sentences = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
]

In [14]:
# Encode all sentences in one batch (overwrites the earlier `embeddings`)
embeddings = model.encode(sentences)

# Compute cosine similarity between all pairs: the same embeddings are passed
# for both arguments, so entry [i][j] compares sentence i with sentence j.
# NOTE: this rebinds `cos_sim`, which previously held a single scalar score.
cos_sim = util.cos_sim(embeddings, embeddings)

In [15]:
# Collect every unordered sentence pair (i < j) together with its similarity
# score. Only the upper triangle of the matrix is visited, so each pair appears
# once and a sentence is never compared with itself.
num_sentences = len(cos_sim)
all_sentence_combinations = [
    [cos_sim[i][j], i, j]
    for i in range(num_sentences - 1)
    for j in range(i + 1, num_sentences)
]

# Order the pairs from most to least similar
all_sentence_combinations.sort(key=lambda entry: entry[0], reverse=True)

# Number of pairs to report; used for both the header and the slice so the
# two cannot drift apart.
top_k = 5

print("Top-{} most similar pairs:".format(top_k))
for score, i, j in all_sentence_combinations[:top_k]:
    # Use the score stored with the pair instead of indexing the matrix again
    print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], score))

Top-5 most similar pairs:
A man is eating food. 	 A man is eating a piece of bread. 	 0.7553
A man is riding a horse. 	 A man is riding a white horse on an enclosed ground. 	 0.7369
A monkey is playing drums. 	 Someone in a gorilla costume is playing a set of drums. 	 0.6433
A woman is playing violin. 	 Someone in a gorilla costume is playing a set of drums. 	 0.2564
A man is eating food. 	 A man is riding a horse. 	 0.2474


#### Semantic Textual Similarity

In [16]:
# Sentence pairs to score against each other: (entry for sentences1, entry for sentences2)
sentence_pairs = [
    ('The cat sits outside', 'The dog plays in the garden'),
    ('A man is playing guitar', 'A woman watches TV'),
    ('The new movie is awesome', 'The new movie is so great'),
]

# Split the pairs into the two parallel lists
sentences1 = [pair[0] for pair in sentence_pairs]
sentences2 = [pair[1] for pair in sentence_pairs]

In [17]:
#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

In [18]:
# Compute cosine similarities between every sentence in the first list and
# every sentence in the second: cosine_scores[i][j] compares sentences1[i]
# with sentences2[j].
cosine_scores = util.cos_sim(embeddings1, embeddings2)

In [19]:
# Report each aligned pair (same position in both lists) with its score,
# i.e. the diagonal of the similarity matrix
pair_count = len(sentences1)
for k in range(pair_count):
    first, second = sentences1[k], sentences2[k]
    similarity = cosine_scores[k][k]
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, similarity))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
