<a href="https://colab.research.google.com/github/sergiomar73/nlp-google-colab/blob/main/Embedding_Models_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Semantic search example (source: https://www.sbert.net/examples/applications/semantic-search/README.html#python)

In [None]:
!pip install sentence_transformers

In [2]:
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Model that turns sentences into embeddings; shared by the corpus and the queries below.
embedder = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [8]:
# Example sentences that the queries are matched against.
corpus = [
    # Replies that mention only a time of day.
    "10:30 works for me.",
    "10 AM works for me.",
    "10 o'clock AM sounds good.",
    "11:30 works for me.",
    "11 AM works for me.",
    "11 o'clock AM sounds good.",
    "12:30 works for me.",
    "1:30 works for me.",
    "1 o'clock PM sounds good.",
    "1 PM works for me.",
    "2:30 works for me.",
    "2 o'clock PM sounds good.",
    "2 PM works for me.",
    "3:30 works for me.",
    "3 o'clock PM sounds good.",
    "3 PM works for me.",
    "4:30 works for me.",
    "4 o'clock PM sounds good.",
    "4 PM works for me.",
    "5 o'clock PM sounds good.",
    "5 PM works for me.",
    "9:30 works for me.",
    "9 AM works for me.",
    "9 o'clock AM sounds good.",
    # Replies that also name the day (Monday).
    "Monday at 10:00 a.m.",
    "Monday at 10:00 works for me.",
    "Monday at 1:00 p.m.",
    "Monday at 10:30 works for me.",
    "Monday at 11:00 a.m.",
    "Monday at 11:30 works for me.",
    "Monday at 11 works for me.",
    "Monday at 12:30 works for me.",
    "Monday at 1:30 works for me.",
    "Monday at 1 works for me.",
    "Monday at 2:00 p.m.",
    "Monday at 2:30 works for me.",
    "Monday at 2 works for me.",
    "Monday at 3:00 p.m.",
    "Monday at 3:30 works for me.",
    "Monday at 3 works for me.",
    "Monday at 4:00 p.m.",
    "Monday at 4:30 works for me.",
    "Monday at 4 works for me.",
    "Monday at 5:00 works for me.",
    "Monday at 9:00 a.m.",
    "Monday at 9:30 works for me.",
    "Monday at 9 works for me.",
    "Monday at noon.",
    "Monday at noon works for me.",
    "Monday works for me.",
]

# Embed the whole corpus once, as a tensor, so every query can be scored against it.
corpus_embeddings = embedder.encode(
    sentences=corpus,
    convert_to_tensor=True,
)

In [9]:
# Sentences to look up in the corpus; the second one is unrelated to scheduling.
queries = [
    "02:00 on Monday works for me.",
    "A cheetah chases prey on across a field.",
]

In [10]:
# For each query, rank the corpus by cosine similarity and print the best matches.
# top_k is capped at the corpus size so torch.topk is never asked for more entries than exist.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # cos_sim returns one row per query; row 0 is this query scored against every corpus sentence.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the number of results actually requested instead of a hard-coded "5".
    print("\nTop {} most similar sentences in corpus:".format(top_k))
    # Convert to plain Python numbers so list indexing and formatting do not rely on 0-dim tensors.
    best_scores = top_results.values.tolist()
    best_indices = top_results.indices.tolist()
    for score, idx in zip(best_scores, best_indices):
        print(corpus[idx], "(Score: {:.4f})".format(score))
# Alternative: util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k) performs the
# cosine similarity + top-k step in one call; its first element is a list of hits, each exposing
# hit['corpus_id'] and hit['score'].





Query: 02:00 on Monday works for me.

Top 5 most similar sentences in corpus:
Monday at 10:00 works for me. (Score: 0.9085)
Monday at 5:00 works for me. (Score: 0.9076)
Monday at 2:30 works for me. (Score: 0.8985)
Monday at 12:30 works for me. (Score: 0.8958)
Monday at 10:30 works for me. (Score: 0.8919)




Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
Monday at 1 works for me. (Score: 0.0813)
Monday at 1:00 p.m. (Score: 0.0732)
Monday at 3 works for me. (Score: 0.0708)
Monday at noon. (Score: 0.0691)
Monday at 3:00 p.m. (Score: 0.0661)
