In [None]:
pip install sentence_transformers

Collecting sentence_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f4/fd/0190080aa0af78d7cd5874e4e8e85f0bed9967dd387cf05d760832b95da9/sentence-transformers-0.3.8.tar.gz (66kB)
[K     |████████████████████████████████| 71kB 1.6MB/s 
[?25hCollecting transformers<3.4.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 6.1MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 15.4MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp

In [None]:
import os

import pandas as pd
import scipy
import scipy.spatial.distance
from sentence_transformers import SentenceTransformer

In [None]:
# Load the pretrained Sentence-BERT (SBERT) model by its published name.
# The recorded run downloaded roughly 405 MB of weights when this cell first executed.
model = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [00:18<00:00, 21.5MB/s]


In [None]:
def read_csv(filepath):
    """Read a CSV file, probing several separators and encodings.

    Every (separator, encoding) combination is attempted in order. The first
    parse that yields more than one column is returned. A parse with the wrong
    separator usually still "succeeds" but collapses each row into a single
    column, so a single-column result is only used as a fallback when no
    separator produces more columns.

    Args:
        filepath: Path to the file to read.

    Returns:
        A pandas DataFrame, or None when ``filepath`` does not end in '.csv'.

    Raises:
        ValueError: If no combination of separator and encoding can parse the
            file. The last underlying parser error is chained as the cause.
    """
    if os.path.splitext(filepath)[1] != '.csv':
        return None  # not a .csv path: nothing is read
    seps = [',', ';', '\t']                    # ',' is the pandas default
    encodings = [None, 'utf-8', 'ISO-8859-1', 'utf-16', 'ascii']  # None is the pandas default
    fallback = None    # first successful single-column parse, if any
    last_error = None  # most recent failure, kept for diagnostics
    for sep in seps:
        for encoding in encodings:
            try:
                frame = pd.read_csv(filepath, encoding=encoding, sep=sep)
            except Exception as exc:  # decoding and parsing errors vary by pandas version
                last_error = exc
                continue
            if frame.shape[1] > 1:
                return frame
            # One column only: possibly the wrong separator. Remember the
            # first such result and move on to the next separator.
            if fallback is None:
                fallback = frame
            break
    if fallback is not None:
        return fallback
    raise ValueError("{!r} has no encoding in {} or separator in {}"
                     .format(filepath, encodings, seps)) from last_error

In [None]:
# Build the corpus: a flat list of sentences, one per row of the input file.
BASE_DIR = '/content'
TEXT_DATA_DIR = os.path.join(BASE_DIR, '')
NEWS_FILE_NAME = "abcnews-date-text.csv"

# Load the file and keep only the first 20000 records; a more powerful
# machine can handle a larger slice.
input_df = (
    read_csv(os.path.join(TEXT_DATA_DIR, NEWS_FILE_NAME))
    .head(20000)
)
print(input_df.head(20))

# NOTE(review): the recorded outputs look like forum posts with 'Category' /
# 'Content' columns rather than news headlines - confirm that NEWS_FILE_NAME
# and the 'headline' column match the data actually used.
sentences = input_df['headline'].values.tolist()

# Small inline corpus for quick experiments without the CSV file:
#sentences = ['aba decides against community broadcasting licence', 
#             'act fire witnesses must be aware of defamation',
#             'a g calls for infrastructure protection summit',
#             'air nz staff in aust strike for pay rise',
#             'air nz strike to affect australian travellers',
#             'ambitious olsson wins triple jump',
#             'antic delighted with record breaking barca',
#             'aussie qualifier stosur wastes four memphis match',
#             'aust addresses un security council over iraq',
#             'australia is locked into war timetable opp',
#             'australia to contribute 10 million in aid to iraq']

# Each sentence is encoded as one fixed-length 1-D vector (768 values for
# BERT-base models; the actual length is printed below).
sentence_embeddings = model.encode(sentences)

# Inspect the first embedding only: its length and its raw values.
print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))
print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

          Category  ...                                            Content
0             hdgc  ...  another-battle-lost-4 Hi all! Just writing you...
1             hdgc  ...  tpn-and-chemo-1 Hi all, My 40 year old husband...
2             hdgc  ...  diffuse-gastric-cancer Is there anybody out th...
3             hdgc  ...  cancer-breakthrough-paves-way-to-better-gene-t...
4             hdgc  ...  linitis-plastica My husband Tony was diagnosed...
5             hdgc  ...  drugs-to-treat-hereditary-gastric-cancers-foun...
6             hdgc  ...  fluorescence-microscopy-new-2-d-images-can-det...
7             hdgc  ...  gastroenterologists-gastric-surgeons-at-johns-...
8             hdgc  ...  can-stomach-cancer-be-hereditary Recent resear...
9   newlyDiagnosed  ...  recent-diagnosis-signet-ring-cell-gastric-canc...
10  newlyDiagnosed  ...  has-anyones-loved-one-experienced-peritoneal-m...
11  newlyDiagnosed  ...  please-give-some-positive-stories-or-tips Has ...
12  newlyDiagnosed  ...  

In [None]:
# Semantic Search
#
# Encode the query with the same model as the corpus, then rank corpus
# sentences by cosine distance to the query embedding.

# code adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py

# Alternative queries (uncomment one to try it):
#query = 'bad weather' #@param {type: 'string'}
#query = 'employee stop working' #@param {type: 'string'}
#query = "moderate lift in economy"
#query = 'global warming impact'
#query = 'wildfires in australia'
query = 'That’s lovely, thank you. I need all the positive feedback I can get. I’m new to actually using this page although I have read lots of posts. I am now a member. Look forward to hearing from you at some point. Thanks, Maggie'

queries = [query]
query_embeddings = model.encode(queries)

# Number of closest corpus sentences to report for each query.
number_top_matches = 3 #@param {type: "number"}

print("Semantic Search Results")

for query, query_embedding in zip(queries, query_embeddings):
    # Cosine *distance* (1 - cosine similarity) from the query to every
    # corpus sentence; smaller means more similar.
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # (corpus index, distance) pairs, closest first.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the configured count instead of a hard-coded "3".
    print("\nTop %d most similar sentences in corpus:" % number_top_matches)

    for idx, distance in results[0:number_top_matches]:
        # Convert distance back to a similarity score for display.
        print(sentences[idx].strip(), "(Cosine Score: %.4f)" % (1-distance))

Semantic Search Results




Query: That’s lovely, thank you. I need all the positive feedback I can get. I’m new to actually using this page although I have read lots of posts. I am now a member. Look forward to hearing from you at some point. Thanks, Maggie

Top 3 most similar sentences in corpus:
hipec-procedure-4 Hello, I would be interested to hear from anyone who has had a gastrectomy and the HIPEC procedure - in particular how the recovery was from the HIPEC procedure. Thank you ... (Cosine Score: 0.6499)
linitis-plastica-treatment-infomation Hi, I am new to this site. My wife just diagnosed Linitis Plastica and I am looking for any helpful information. Thanks, Peter ... (Cosine Score: 0.6275)
stomach-cancer-experience-registry Dear DDF Community - If you are a stomach cancer patient, survivor, or caregiver you know how important it is to share your experience with others and get information and feedback. We are asking for your help to better understand your individual experience