In [27]:
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

For each query, this script prints the top 5 most similar sentences in the corpus.
"""
from sentence_transformers import SentenceTransformer, util
import torch
import torch.nn as nn
import numpy as np

In [2]:
# Name of the sentence-embedding model to load; change it here to try another model
MODEL_NAME = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(MODEL_NAME)

In [3]:
# Example sentences that make up the searchable corpus
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]

In [4]:
# Exploratory: dump the model object's instance attributes (the output is very long)
vars(embedder)

{'prompts': {},
 'default_prompt_name': None,
 '_similarity_fn_name': None,
 'trust_remote_code': False,
 'truncate_dim': None,
 'model_card_data': SentenceTransformerModelCardData(language=[], license=None, model_name=None, model_id=None, train_datasets=[], eval_datasets=[], task_name='semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more', tags=['sentence-transformers', 'sentence-similarity', 'feature-extraction'], generate_widget_examples='deprecated', base_model='sentence-transformers/all-MiniLM-L6-v2', base_model_revision='c9745ed1d9f207416be6d2e6f8de32d1f16199bf', non_default_hyperparameters={}, all_hyperparameters={}, eval_results_dict={}, training_logs=[], widget=[], predict_example=None, label_example_list=[], code_carbon_callback=None, citations={}, best_model_step=None, first_save=True, widget_step=-1, pipeline_tag='sentence-similarity', library_name='sentence-transformers', version={'python': '3.12.9', 'sentence_transform

In [5]:
# Encode every corpus sentence in one call; convert_to_tensor=True makes encode
# return a single torch tensor with one row per sentence.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

In [6]:
# Sanity check: rows = number of corpus sentences, columns = embedding size.
corpus_embeddings.shape

torch.Size([9, 384])

In [7]:
# Query sentences to look up in the corpus
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.',
]

In [8]:
# Embed only the first query for now (the loop further down handles every query)
query_embedding = embedder.encode(queries[0], convert_to_tensor=True)

In [9]:
# Cosine similarity of the query embedding against every corpus embedding;
# the result has one row per query and one column per corpus sentence.
cos_sim = util.cos_sim(query_embedding, corpus_embeddings)

In [10]:
# Record the NumPy version of the running kernel.
np.__version__

'1.26.4'

In [70]:
# NOTE(review): duplicate of the NumPy version check above. The two saved outputs
# disagree ('1.26.4' vs '2.2.4') and this cell's execution count (70) is out of
# sequence, so the cells were evidently run in different environments/sessions.
# Restart the kernel, run all cells, and keep a single version check.
np.__version__

'2.2.4'

In [11]:
# Bring the similarity scores to the CPU and view them as a NumPy array
cos_sim.cpu().numpy()

array([[ 0.70354855,  0.5271986 , -0.08980075,  0.1888955 ,  0.03359391,
        -0.05943479,  0.10469925,  0.08189061,  0.09803035]],
      dtype=float32)

In [23]:
# Show the corpus; a sentence's position in this list is its column index in cos_sim.
corpus

['A man is eating food.',
 'A man is eating a piece of bread.',
 'The girl is carrying a baby.',
 'A man is riding a horse.',
 'A woman is playing violin.',
 'Two men pushed carts through the woods.',
 'A man is riding a white horse on an enclosed ground.',
 'A monkey is playing drums.',
 'A cheetah is running behind its prey.']

In [24]:
# Corpus indices ordered from least to most similar to the query (best match last).
# torch.argsort sorts ascending along the last dimension by default, and keeps the
# computation in PyTorch instead of handing a torch tensor to a NumPy function.
torch.argsort(cos_sim.cpu())

tensor([[2, 5, 4, 7, 8, 6, 3, 1, 0]])

In [None]:
# For each query, print the top_k corpus sentences ranked by cosine similarity.
# top_k is capped at the corpus size so torch.topk is never asked for more
# entries than there are scores.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # util.cos_sim returns one row of scores per query; with a single query,
    # [0] selects that row (one score per corpus sentence).
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the actual top_k rather than a hard-coded 5, so the header stays
    # correct when the corpus has fewer than 5 sentences.
    print("\nTop {} most similar sentences in corpus:".format(top_k))

    # .tolist() yields plain Python floats/ints for formatting and list indexing.
    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        print(corpus[idx], "(Score: {:.4f})".format(score))

    # Alternatively, util.semantic_search performs cosine similarity + top-k in one call:
    # hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    # hits = hits[0]      # Get the hits for the first query
    # for hit in hits:
    #     print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))

In [25]:
# Embed a handful of candidate terms and, separately, a single reference term
candidate_terms = ['blood', 'lukewarm', 'academia', 'blood cancer', 'arrhythmia']
encodings1 = embedder.encode(candidate_terms)
encodings2 = embedder.encode(['leukemia'])

In [26]:
# Similarity of each term in encodings1 (rows) to the single 'leukemia' embedding (column).
util.cos_sim(encodings1, encodings2)

tensor([[0.3584],
        [0.0726],
        [0.0387],
        [0.6106],
        [0.1260]])