<a href="https://colab.research.google.com/github/simodepth/Similarity-Tool/blob/main/Snippet_Similarity_with_Top_Queries_with_Sentence_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> Strike the Snippet Similarity based on Search Queries </h1>

> <h2> What this framework does</h2>
<p>
We have a corpus of sentences (here, the snippets of Google search results). For a given query sentence,
we want to find the most similar sentences in this corpus.
For each query, this notebook outputs the top 5 most similar sentences in the corpus.
</p>

> <h2> Reference </h2> 
https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/semantic-search/semantic_search.py

In [1]:
!pip install sentence_transformers
!pip install ecommercetools
!pip install google-search-results

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.5 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 61.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 57.9 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 55.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 45.2 MB/s 
Building wheels for collected 

In [2]:
from sentence_transformers import SentenceTransformer, util
import torch

#scraping
import requests
from serpapi import GoogleSearch
import urllib
import urllib.parse
import json
from urllib.parse import (parse_qsl, urlsplit)
from requests_html import HTML
from requests_html import HTMLSession


#data manipulation
import pandas as pd
import numpy as np


#libraries for preprocessing tasks
from gensim.parsing.preprocessing import remove_stopwords
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize


# One-time download of NLTK's 'punkt' tokenizer data; only needed if the NLTK
# preprocessing helpers imported above (e.g. word_tokenize) are actually used.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
def get_source(url, timeout=30):
    """Fetch ``url`` with a fresh ``HTMLSession`` and return the response.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The response object returned by ``HTMLSession.get``, or ``None`` when
        the request raises ``requests.exceptions.RequestException`` (the
        exception is printed rather than re-raised).
    """
    try:
        response = HTMLSession().get(url, timeout=timeout)
    except requests.exceptions.RequestException as error:
        # Best-effort fetch: report the problem and signal failure with None.
        print(error)
        return None
    return response

def get_results(query):
    """Fetch Google autocomplete suggestions for ``query``.

    The query is URL-encoded with ``urllib.parse.quote_plus`` and appended to
    the ``suggestqueries.google.com`` endpoint (``output=chrome``, ``hl=en``);
    the response body is then decoded as JSON.

    Args:
        query: Search phrase to request suggestions for.

    Returns:
        The decoded JSON payload, or ``None`` when ``get_source`` could not
        fetch the page (it prints the request error and returns ``None``).
    """
    encoded_query = urllib.parse.quote_plus(query)
    response = get_source("https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q=" + encoded_query)
    if response is None:
        # get_source already reported the failure; do not crash on `None.text`.
        return None
    return json.loads(response.text)



In [4]:
from ecommercetools import seo
import pandas as pd

# Ask ecommercetools for Google autocomplete suggestions for the seed term 'coffee'
# (include_expanded=True is passed straight through to seo.google_autocomplete).
suggestions = seo.google_autocomplete('coffee', include_expanded=True)
# Wrap the suggestions in a DataFrame; the displayed output shows 'term' and 'relevance' columns.
queries = pd.DataFrame(suggestions)
# Persist the raw suggestions (the DataFrame index is written too, since index=False is not passed).
queries.to_csv('queries.csv')
# Preview the first ten suggestions.
queries.head(10)

Unnamed: 0,term,relevance
0,coffee near me,1250
1,cheap coffee tables,1250
2,when coffee meets bagel,1250
3,why coffee makes you poop,1250
4,how coffee is made,1250
5,coffee bean,1250
6,coffee jelly,1250
7,coffee meets bagel,1250
8,coffee near me,1250
9,coffee quotes,1250


In [5]:
# Drop rows whose 'term' repeats an earlier row (e.g. the second 'coffee near me').
results = queries.drop_duplicates('term')
# Save the de-duplicated suggestions.
# NOTE(review): the similarity cell further down also writes to 'results.csv',
# so this file is overwritten later in a full run.
results.to_csv('results.csv')
results.head(10)

Unnamed: 0,term,relevance
0,coffee near me,1250
1,cheap coffee tables,1250
2,when coffee meets bagel,1250
3,why coffee makes you poop,1250
4,how coffee is made,1250
5,coffee bean,1250
6,coffee jelly,1250
7,coffee meets bagel,1250
9,coffee quotes,1250
10,coffee request crossword,1250


In [6]:
import os
from getpass import getpass

from serpapi import GoogleSearch

# Keep the SerpApi key out of the notebook: read it from the SERPAPI_API_KEY
# environment variable, falling back to a hidden interactive prompt when the
# variable is unset or empty.
serp_apikey = os.environ.get("SERPAPI_API_KEY") or getpass("SerpApi API key: ")

# Search request: Google results for "coffee", UK location/country, English, 10 results.
params = {
    "engine": "google",
    "q": "coffee",
    "location": "United Kingdom",
    "google_domain": "google.com",
    "gl": "uk",
    "hl": "en",
    "num": 10,
    "api_key": serp_apikey
}

client = GoogleSearch(params)
data = client.get_dict()

# access "organic results"
if 'organic_results' not in data:
    # Fail with a readable message (the API's own 'error' text when present,
    # otherwise the whole payload) instead of a bare KeyError.
    raise RuntimeError(
        "SerpApi response has no 'organic_results': {}".format(data.get('error', data))
    )
df = pd.DataFrame(data['organic_results'])
# Saved without the index so the next cell can reload the same columns.
df.to_csv('results_1.csv', index=False)
df

https://serpapi.com/search


Unnamed: 0,position,title,link,displayed_link,thumbnail,snippet,snippet_highlighted_words,sitelinks,rich_snippet,about_this_result,about_page_link,about_page_serpapi_link,cached_page_link,related_pages_link,date
0,1,Coffee - Wikipedia,https://en.wikipedia.org/wiki/Coffee,https://en.wikipedia.org › wiki › Coffee,https://serpapi.com/searches/6357e004216a9d3cd...,Coffee is a drink prepared from roasted coffee...,"[Coffee, coffee, coffee]","{'inline': [{'title': 'Coffee bean', 'link': '...",{'bottom': {'extensions': ['Region of origin: ...,{'source': {'description': 'Wikipedia is a mul...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,https://www.google.com/search?gl=uk&hl=en&q=re...,
1,2,Starbucks: Homepage,https://www.starbucks.co.uk/,https://www.starbucks.co.uk,,About us. About Us · Our Coffees · Starbucks S...,"[Coffees, Coffee]",,,{'source': {'description': 'Starbucks Corporat...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,https://www.google.com/search?gl=uk&hl=en&q=re...,
2,3,Coffee Beans | Blended Coffee | Whittard of Ch...,https://www.whittard.co.uk/coffee,https://www.whittard.co.uk › coffee,,Whether you prefer whole coffee beans or fresh...,"[coffee, coffee, coffees]","{'inline': [{'title': 'All Coffee', 'link': 'h...",,{'source': {'description': 'Whittard of Chelse...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,,
3,4,Costa Coffee: The Nation's Favourite Coffee Shop,https://www.costa.co.uk/,https://www.costa.co.uk,,Costa is the Nation's Favourite coffee shop an...,"[coffee, coffee]",,,{'source': {'description': 'Costa Coffee is a ...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,,
4,5,Origin Coffee Roasters: Speciality Coffee Onli...,https://www.origincoffee.co.uk/,https://www.origincoffee.co.uk,https://encrypted-tbn0.gstatic.com/images?q=tb...,Leading UK speciality coffee roaster with a fo...,"[coffee, coffee]","{'inline': [{'title': 'Speciality Coffee', 'li...",,{'source': {'description': 'origincoffee.co.uk...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,https://www.google.com/search?gl=uk&hl=en&q=re...,
5,6,Pact Coffee: The UK's Best Coffee Delivery & S...,https://www.pactcoffee.com/,https://www.pactcoffee.com,https://encrypted-tbn0.gstatic.com/images?q=tb...,Kick-start your morning with our amazing range...,"[coffee, coffee]",,,{'source': {'description': 'pactcoffee.com was...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,https://www.google.com/search?gl=uk&hl=en&q=re...,
6,7,Coffee | The Nutrition Source,https://www.hsph.harvard.edu/nutritionsource/f...,https://www.hsph.harvard.edu › ... › Food Feat...,https://encrypted-tbn0.gstatic.com/images?q=tb...,Coffee beans are the seeds of a fruit called a...,"[Coffee, coffee, Coffee, coffee]",,,{'source': {'description': 'The Harvard T.H. C...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,,
7,8,Coffee Beans: Where Do They Come From? | Nescafé,https://www.nescafe.com/gb/understanding-coffe...,https://www.nescafe.com › Home › Understanding...,https://encrypted-tbn0.gstatic.com/images?q=tb...,"On these coffee plants, bunches of cherries gr...","[coffee, coffee, coffee, coffee]",,,{'source': {'description': 'This result comes ...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,,
8,9,Coffee Beans – Best in the UK - Coffee-Direct,https://www.coffee-direct.co.uk/collections/al...,https://www.coffee-direct.co.uk › collections ...,https://encrypted-tbn0.gstatic.com/images?q=tb...,Coffee Beans. Browse through our entire range ...,"[Coffee, coffees]",,,{'source': {'description': 'coffee-direct.co.u...,https://www.google.com/search?q=About+https://...,https://serpapi.com/search.json?engine=google_...,https://webcache.googleusercontent.com/search?...,https://www.google.com/search?gl=uk&hl=en&q=re...,116 results


In [7]:
# Reload the SERP export from the working directory - the same relative path the
# SerpApi cell wrote to - instead of the Colab-only absolute path '/content/...'.
SERP_One = pd.read_csv('results_1.csv')
# Keep only the 'snippet' column and save it as corpus.csv.
Snippet = pd.DataFrame(SERP_One, columns=['snippet'])
Snippet.to_csv('corpus.csv', index=False)
Snippet

Unnamed: 0,snippet
0,Coffee is a drink prepared from roasted coffee...
1,About us. About Us · Our Coffees · Starbucks S...
2,Whether you prefer whole coffee beans or fresh...
3,Costa is the Nation's Favourite coffee shop an...
4,Leading UK speciality coffee roaster with a fo...
5,Kick-start your morning with our amazing range...
6,Coffee beans are the seeds of a fruit called a...
7,"On these coffee plants, bunches of cherries gr..."
8,Coffee Beans. Browse through our entire range ...


In [20]:
# Sentence-embedding model used for both the corpus snippets and the queries.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Corpus: search-result snippets, hard-coded here as full sentences.
corpus = [
    'About us. About Us · Our Coffees · Starbucks Stories & News · Starbucks® Ready to Drink · Foodservice Coffee · Customer Service · Tax Strategy 2022 · Careers.',
    'Costa is the Nation Favourite coffee shop and the largest and fastest growing coffee shop chain in the UK.',
    'Leading UK speciality coffee roaster with a focus on sustainability. B Corp certified. Become a wholesale partner or buy coffee beans online today.',
    'Kick-start your morning with our amazing range of speciality coffee and equipment. World-class coffee, direct from the farmer, delivered free every time.',
    'Coffee Direct - Freshly roasted coffee beans delivered to your door. Origin coffee, coffee blends and flavoured coffee for bean-to-cup',
    'Whether you prefer whole coffee beans or freshly ground coffee, Whittard of Chelsea selection of light, medium and dark roast luxury coffees has something',
    'Coffee beans are the seeds of a fruit called a coffee cherry. Coffee cherries grow on coffee trees from a genus of plants called Coffea.',
    'On these coffee plants, bunches of cherries grow and inside these you will find two coffee beans, Arabica and Robusta coffee.',
]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['coffee', 'coffee near me', 'coffee bean', 'coffee house', 'coffee jelly', 'coffee order nyt crossword clue',
           'coffee quotes', 'coffee shops near me']

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(corpus))
query_result = []
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Row 0 of the similarity matrix holds this single query's score against every corpus entry.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    for score, idx in zip(top_results.values, top_results.indices):
        # `score` is deliberately kept as a 0-dim tensor: the exported CSV then
        # holds "tensor(0.1234)" strings, which the later score-cleaning cell parses.
        query_result.append([query, corpus[idx], score])

# One row per (query, matched snippet) pair, best matches first within each query.
df = pd.DataFrame(query_result, columns=['Query', 'Snippet', 'Score'])
df.to_csv('results.csv', index=False)
df


Unnamed: 0,Query,Snippet,Score
0,coffee,Coffee Direct - Freshly roasted coffee beans d...,tensor(0.6477)
1,coffee,Whether you prefer whole coffee beans or fresh...,tensor(0.5873)
2,coffee,Kick-start your morning with our amazing range...,tensor(0.5739)
3,coffee,Coffee beans are the seeds of a fruit called a...,tensor(0.4985)
4,coffee,Costa is the Nation Favourite coffee shop and ...,tensor(0.4374)
5,coffee near me,Whether you prefer whole coffee beans or fresh...,tensor(0.5395)
6,coffee near me,Kick-start your morning with our amazing range...,tensor(0.5306)
7,coffee near me,Coffee Direct - Freshly roasted coffee beans d...,tensor(0.5256)
8,coffee near me,Costa is the Nation Favourite coffee shop and ...,tensor(0.4563)
9,coffee near me,Coffee beans are the seeds of a fruit called a...,tensor(0.4151)


In [49]:
#@title  Single out Queries by Rows
# One filtered frame per query, so a single query's top matches can be exported below.
# Built from one tuple of query texts instead of copy-pasted filters; query_8 covers
# 'coffee shops near me', which is in the query list but was never singled out.
(query_1, query_2, query_3, query_4,
 query_5, query_6, query_7, query_8) = (
    df.loc[df['Query'] == query_text]
    for query_text in (
        'coffee',
        'coffee near me',
        'coffee bean',
        'coffee house',
        'coffee jelly',
        'coffee order nyt crossword clue',
        'coffee quotes',
        'coffee shops near me',
    )
)

## Choose the query whose snippet similarity you want to visualize

In [46]:
# Export the rows for the 'coffee bean' query (query_3), without the index column.
query_3.to_csv('coffee bean.csv', index=False)

In [53]:
# Reload the exported rows for the chosen query.
Data = pd.read_csv('coffee bean.csv')
# Scores were saved as text such as "tensor(0.6698)". Strip the wrapper with an
# explicit regex: pandas >= 2.0 treats the pattern literally unless regex=True,
# which would leave the text untouched and make the float conversion below fail.
# The raw string also avoids the invalid "\(" escape sequence.
Data['Score'] = Data['Score'].str.replace(r"tensor\(|\)", "", regex=True)
Data['Score'] = Data['Score'].astype(float)
Data

  


Unnamed: 0,Query,Snippet,Score
0,coffee bean,Coffee Direct - Freshly roasted coffee beans d...,0.6698
1,coffee bean,Coffee beans are the seeds of a fruit called a...,0.6322
2,coffee bean,Whether you prefer whole coffee beans or fresh...,0.5798
3,coffee bean,Kick-start your morning with our amazing range...,0.535
4,coffee bean,"On these coffee plants, bunches of cherries gr...",0.5313
