In [1]:
import numpy as np
import scipy
from scipy.spatial.distance import cdist
from sentence_transformers import models, SentenceTransformer

# Load the pretrained sentence-embedding model by name; this one instance encodes both
# the corpus and the queries in the cells below.
model = SentenceTransformer('distiluse-base-multilingual-cased')

In [2]:
# Corpus of example sentences to search over. Some entries were swapped for a French (FR),
# Italian (IT) or German (DE) translation; the English sentence each one replaced is kept
# in the trailing comment.
# NOTE(review): 'base guitar' below is presumably a typo for 'bass guitar'; left unchanged
# because editing the string would change its embedding and the scores recorded below.
corpus = ['Un homme mange de la nourriture.', # FR: 'A man is eating food.'
          'A man is eating a piece of bread.',
          'Das Mädchen trägt ein Baby.', # DE: 'The girl is carrying a baby.'
          'A man is riding a horse.',
          'An elderly man is enjoying dinner.',
          'Amis partageant du vin dans un restaurant.', # FR: 'Friends sharing wine at a restaurant.'
          'A woman is playing violin.',
          'A child is learning to play a base guitar.',
          'Due uomini hanno spinto i carrelli attraverso i boschi.', # IT: 'Two men pushed carts through the woods.'
          'A man is riding a white horse on an enclosed ground.',
          'Una scimmia suona la batteria.', # IT: 'A monkey is playing drums.'
          'A cheetah is running behind its prey.']

# Encode every corpus sentence once; the search cell compares these against each query embedding.
corpus_embeddings = model.encode(corpus)

In [3]:
# Sample queries to look up in the corpus. Some were translated to Russian (RU) or
# German (DE); the English sentence each one replaced is kept in the trailing comment.
queries = ['A man is eating pasta.', 
           'Кто-то в костюме гориллы играет на барабане', # RU: 'Someone in a gorilla costume is playing a set of drums.'
           'Ein Gepard jagt Beute über ein Feld.'] # DE: 'A cheetah chases prey across a field.'
# Encode the queries with the same model that encoded the corpus.
query_embeddings = model.encode(queries)

In [4]:
# Rank the corpus sentences by cosine similarity to each query and print the best matches.
closest_n = 3  # number of matches to print per query

# Compute every query-vs-corpus cosine distance in a single call instead of one call per
# query; row i holds the distances from queries[i] to each corpus sentence. The guard keeps
# an empty query list a no-op rather than handing cdist an input that is not 2-D.
distance_matrix = cdist(query_embeddings, corpus_embeddings, "cosine") if len(queries) else []

# Never announce more results than the corpus can supply.
top_k = min(closest_n, len(corpus))

for query, distances in zip(queries, distance_matrix):
    # A stable argsort keeps corpus order among equal distances, which matches what a
    # stable sort of (index, distance) pairs by distance produces.
    nearest = np.argsort(distances, kind="stable")[:top_k]

    print("\n======================\n")
    print("Query:", query)
    # The count in the header follows top_k so it cannot drift from what is printed.
    print("\nTop %d most similar sentences in corpus:" % top_k)

    for idx in nearest:
        # Cosine similarity is 1 minus the cosine distance returned by cdist.
        print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distances[idx]))



Query: A man is eating pasta.

Top 3 most similar sentences in corpus:
Un homme mange de la nourriture. (Score: 0.7106)
A man is eating a piece of bread. (Score: 0.6306)
An elderly man is enjoying dinner. (Score: 0.4808)


Query: Кто-то в костюме гориллы играет на барабане

Top 3 most similar sentences in corpus:
Una scimmia suona la batteria. (Score: 0.5813)
A woman is playing violin. (Score: 0.3823)
A child is learning to play a base guitar. (Score: 0.3341)


Query: Ein Gepard jagt Beute über ein Feld.

Top 3 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.4491)
Una scimmia suona la batteria. (Score: 0.3389)
Due uomini hanno spinto i carrelli attraverso i boschi. (Score: 0.3238)
