In [19]:
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""
from sentence_transformers import SentenceTransformer, util
import torch
import pickle
import pandas as pd

In [20]:
with open('../test_data/wiki_hard.pkl', 'rb') as f:
    data = pickle.load(f)

df = pd.DataFrame(data)

corpus = [
    doc
    for doc in df['text']
]
corpus

 'John Quincy Adams (; July 11, 1767 – February 23, 1848) was an American statesman, diplomat, lawyer, and diarist, who served as the 6th president of the United States from 1825 to 1829. He previously served as the 8th United States Secretary of State from 1817 to 1825. During his long diplomatic and political career, Adams also served as an ambassador, and as a member of the United States Senate and House of Representatives representing Massachusetts. He was the eldest son of John Adams, who served as the second U.S. president from 1797 to 1801, and First Lady Abigail Adams. Initially a Federalist like his father, he won election to the presidency as a member of the Democratic-Republican Party, and in the mid-1830s became affiliated with the Whig Party.\n\nBorn in Braintree, Massachusetts (now part of the town of Quincy), Adams spent much of his youth in Europe, where his father served as a diplomat. After returning to the United States, Adams established a successful legal practice 

In [24]:
embedder = SentenceTransformer('all-mpnet-base-v2')
# corpus
with open('../test_data/wiki_hard.pkl', 'rb') as f:
    data = pickle.load(f)

df = pd.DataFrame(data)

corpus = [
    doc
    for doc in df['text']
]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
# queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']





Query: Types of Chinese Noodles

Top 5 most similar sentences in corpus:
L i a n g p i   ( ) (Score: 0.5098)
P a n c i t   (   ) (Score: 0.4891)
i s   a   J a p a n (Score: 0.4877)
Y a k i s o b a   ( (Score: 0.4646)
U d o n   (   o r   (Score: 0.4356)


In [27]:
# define aux search function that takes in a query and returns the top 5 results
def search(query):
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    top_k = min(5, len(corpus))
    for query in [query]:
        query_embedding = embedder.encode(query, convert_to_tensor=True)

        # We use cosine-similarity and torch.topk to find the highest 5 scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)

        print("\n\n======================\n\n")
        print("Query:", query)
        print("\nTop 5 most similar sentences in corpus:")

        for score, idx in zip(top_results[0], top_results[1]):
            print(' '.join(corpus[idx][:100]), "(Score: {:.4f})".format(score))

In [28]:
search('Types of Chinese noodles')





Query: Types of Chinese noodles

Top 5 most similar sentences in corpus:
L i a n g p i   ( )   i s   a   n o o d l e - l i k e   C h i n e s e   d i s h   m a d e   f r o m   w h e a t   o r   r i c e   f l o u r .   I t   i s   a   s p e c i a l t y   d i s h   o r i g i (Score: 0.5098)
P a n c i t   (   ) ,   a l s o   s p e l l e d   p a n s í t ,   i s   a   g e n e r a l   t e r m   r e f e r r i n g   t o   v a r i o u s   t r a d i t i o n a l   n o o d l e   d i s h e s   i n (Score: 0.4891)
i s   a   J a p a n e s e   n o o d l e   s o u p .   I t   c o n s i s t s   o f   C h i n e s e - s t y l e   w h e a t   n o o d l e s   s e r v e d   i n   a   m e a t   o r   ( o c c a s i o n a (Score: 0.4877)
Y a k i s o b a   (   ) ,   " f r i e d   n o o d l e " ,   i s   a   J a p a n e s e   n o o d l e   s t i r - f r y   d i s h .   U s u a l l y   s o b a   m e a n s   b u c k w h e a t ,   b u t   (Score: 0.4646)
U d o n   (   o r   )   i s   a   t h i c k   n o o d l e  

In [30]:
search('Information on bull dog')





Query: Information on bull dog

Top 5 most similar sentences in corpus:
T h e   G o l d e n   R e t r i e v e r   i s   a   B r i t i s h   b r e e d   o f   r e t r i e v e r   d o g   o f   m e d i u m   s i z e .   I t   i s   c h a r a c t e r i s e d   b y   a   g e (Score: 0.4835)
T h e     B u l l d o g ,   a l s o   k n o w n   a s   t h e   E n g l i s h   B u l l d o g   o r   B r i t i s h   B u l l d o g ,   i s   a   m e d i u m - s i z e d   d o g   b r e e d .   I t   (Score: 0.4805)
T h e   b e a g l e   i s   a   b r e e d   o f   s m a l l   s c e n t   h o u n d ,   s i m i l a r   i n   a p p e a r a n c e   t o   t h e   m u c h   l a r g e r   f o x h o u n d .   T h e   b (Score: 0.4499)
T h e   G e r m a n   S h e p h e r d   o r   G e r m a n   S h e p h e r d   D o g ,   a l s o   k n o w n   a s   t h e   A l s a t i a n ,   i s   a   G e r m a n   b r e e d   o f   w o r k i n g (Score: 0.4477)
T h e   R o t t w e i l e r   ( ,     )   i s   a   b r e e 