Sentence Embeddings

In [86]:
# !pip install sentence-transformers

In [87]:
from transformers.utils import logging
logging.set_verbosity_error()
import torch
from transformers import pipeline
from sentence_transformers import util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer

In [88]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model once; the cells below
# reuse this `model` object for both encoding and similarity computation.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [89]:
# First batch of example sentences: three close variants of "That is a happy ..."
# and one unrelated sentence about the weather.
sentences1 = ["That is a happy person",
              "That is a happy dog",
              "That is a very happy person",
              "Today is a sunny day"]
# Encode the whole batch in one call. convert_to_tensor=True asks for a torch
# tensor; the printed result further down has one row per input sentence.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)

In [90]:
# Echo the first batch so the sentences can be read next to their embeddings.
sentences1

['That is a happy person',
 'That is a happy dog',
 'That is a very happy person',
 'Today is a sunny day']

In [91]:
# Show the embedding tensor for the first batch (torch abbreviates wide rows with "...").
print(embeddings1)

tensor([[-0.0339,  0.0919,  0.0487,  ..., -0.0144, -0.0275,  0.0448],
        [ 0.0050,  0.0632,  0.0142,  ...,  0.0404,  0.0758,  0.0909],
        [-0.0025,  0.0915,  0.0484,  ..., -0.0264, -0.0753,  0.0280],
        [-0.0163,  0.1041,  0.0974,  ...,  0.0068, -0.0879,  0.0340]],
       device='cuda:0')


In [92]:
# Second batch of example sentences. The last entry is identical to the last
# entry of sentences1, so that pair gives a reference similarity of 1.0.
sentences2 = [
    "The dog plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
    "Today is a sunny day",
]

In [93]:
# Encode the second batch the same way as the first, so the two embedding
# tensors can be compared directly.
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

In [94]:
# Echo the second batch for reference.
sentences2

['The dog plays in the garden',
 'A woman watches TV',
 'The new movie is so great',
 'Today is a sunny day']

In [95]:
# Show the embedding tensor for the second batch; its last row matches the last
# row of embeddings1 because the underlying sentence is the same.
print(embeddings2)

tensor([[ 0.0163, -0.0700,  0.0384,  ...,  0.0447,  0.0254, -0.0023],
        [ 0.0054, -0.0920,  0.0140,  ...,  0.0167, -0.0086, -0.0424],
        [-0.0842, -0.0592, -0.0010,  ..., -0.0157,  0.0764,  0.0389],
        [-0.0163,  0.1041,  0.0974,  ...,  0.0068, -0.0879,  0.0340]],
       device='cuda:0')


In [96]:
# Third batch: each sentence is a loose paraphrase or topical neighbour of the
# sentence at the same position in sentences2.
# NOTE(review): "wendy" looks like a typo for "windy" - confirm. It is left
# unchanged here because fixing it alters the embedding and every recorded
# score that involves this sentence.
sentences3 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The movies are awesome",
    "Today is a very wendy day",
]

In [97]:
# Encode the third batch as a torch tensor, matching embeddings1 and embeddings2.
embeddings3 = model.encode(sentences3, convert_to_tensor=True)

In [98]:
# Echo the third batch for reference.
sentences3

['The cat sits outside',
 'A man is playing guitar',
 'The movies are awesome',
 'Today is a very wendy day']

In [99]:
# Display the embedding tensor for the third batch (bare expression, so it is
# rendered as the cell's output rather than printed).
embeddings3

tensor([[ 0.1392,  0.0030,  0.0470,  ...,  0.0641, -0.0163,  0.0636],
        [ 0.0227, -0.0014, -0.0056,  ..., -0.0225,  0.0846, -0.0283],
        [-0.1043, -0.0628,  0.0093,  ...,  0.0020,  0.0653, -0.0150],
        [-0.0471,  0.0638,  0.0858,  ..., -0.0245, -0.0085,  0.0293]],
       device='cuda:0')

In [100]:
# All-pairs similarity between batch 1 and batch 2 using the model's own
# similarity function (presumably cosine for this model - TODO confirm).
# Only the shape is inspected: one row per sentence in batch 1, one column per
# sentence in batch 2. `similarities` is reassigned by the following cells.
similarities = model.similarity(embeddings1, embeddings2)
print(similarities.size())


torch.Size([4, 4])


In [101]:
# All-pairs similarity between batch 2 and batch 3; this overwrites the value
# `similarities` held before. Only the resulting shape is checked.
similarities = model.similarity(embeddings2, embeddings3)
print(similarities.size())


torch.Size([4, 4])


In [102]:
# All-pairs similarity between batch 1 and batch 3; after this cell
# `similarities` refers to this matrix. Only the resulting shape is checked.
similarities = model.similarity(embeddings1, embeddings3)
print(similarities.size())

torch.Size([4, 4])


We use the cosine similarity between two sentence embeddings as a measure of how similar the sentences are to each other.

In [103]:
# Cosine-similarity matrices for the three batch pairings, computed in the
# order (1 vs 2), (2 vs 3), (1 vs 3). Entry [i][j] of each matrix compares
# sentence i of the first batch with sentence j of the second.
cosine_scores1, cosine_scores2, cosine_scores3 = (
    util.cos_sim(first, second)
    for first, second in (
        (embeddings1, embeddings2),
        (embeddings2, embeddings3),
        (embeddings1, embeddings3),
    )
)

In [104]:
# Dump the three score matrices one after another (1 vs 2, 2 vs 3, 1 vs 3).
print(cosine_scores1, cosine_scores2, cosine_scores3, sep="\n")

tensor([[-0.0404,  0.0218,  0.1238,  0.2569],
        [ 0.2806, -0.0502,  0.2030,  0.2491],
        [-0.0431,  0.0375,  0.1456,  0.2106],
        [ 0.0981, -0.0091,  0.0929,  1.0000]], device='cuda:0')
tensor([[ 0.2838,  0.2277, -0.0124,  0.0294],
        [ 0.1310, -0.0327, -0.0465,  0.0065],
        [-0.0029, -0.0136,  0.6571,  0.1349],
        [ 0.1398, -0.0223,  0.0150,  0.4387]], device='cuda:0')
tensor([[ 0.0663,  0.0535,  0.1545,  0.2494],
        [ 0.1611,  0.0066,  0.1410,  0.2352],
        [ 0.0401,  0.0483,  0.1621,  0.2389],
        [ 0.1398, -0.0223,  0.0150,  0.4387]], device='cuda:0')


In [105]:
# Print each sentence of batch 1 beside the sentence at the same position in
# batch 2, with their cosine similarity (the diagonal of cosine_scores1).
# zip() walks the two lists being compared, so the loop stops at the shorter
# one instead of raising IndexError if their lengths ever differ.
for i, (left, right) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(left,
                                                 right,
                                                 cosine_scores1[i][i]))

That is a happy person 		 The dog plays in the garden 		 Score: -0.0404
That is a happy dog 		 A woman watches TV 		 Score: -0.0502
That is a very happy person 		 The new movie is so great 		 Score: 0.1456
Today is a sunny day 		 Today is a sunny day 		 Score: 1.0000


In [106]:
# Print each sentence of batch 2 beside the sentence at the same position in
# batch 3, with their cosine similarity (the diagonal of cosine_scores2).
# The loop is driven by the two lists actually being printed rather than by
# len(sentences1), which is unrelated to this comparison; zip() also stops at
# the shorter list instead of raising IndexError.
for i, (left, right) in enumerate(zip(sentences2, sentences3)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(left,
                                                 right,
                                                 cosine_scores2[i][i]))

The dog plays in the garden 		 The cat sits outside 		 Score: 0.2838
A woman watches TV 		 A man is playing guitar 		 Score: -0.0327
The new movie is so great 		 The movies are awesome 		 Score: 0.6571
Today is a sunny day 		 Today is a very wendy day 		 Score: 0.4387


In [107]:
# Print each sentence of batch 1 beside the sentence at the same position in
# batch 3, with their cosine similarity (the diagonal of cosine_scores3).
# zip() walks the two lists being compared, so the loop stops at the shorter
# one instead of raising IndexError if their lengths ever differ.
for i, (left, right) in enumerate(zip(sentences1, sentences3)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(left,
                                                 right,
                                                 cosine_scores3[i][i]))

That is a happy person 		 The cat sits outside 		 Score: 0.0663
That is a happy dog 		 A man is playing guitar 		 Score: 0.0066
That is a very happy person 		 The movies are awesome 		 Score: 0.1621
Today is a sunny day 		 Today is a very wendy day 		 Score: 0.4387


Note that this notebook covers only the basics!