<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/04_text_similarity_search_with_FAISS_SBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

In [None]:
!pip install faiss-cpu
!pip install -U sentence-transformers

In [1]:
import pandas as pd
import numpy as np
import re
import time
import os

import pdb

import torch
import faiss
from sentence_transformers import SentenceTransformer

In [None]:
!wget https://github.com/franciscadias/data/raw/master/abcnews-date-text.csv

##FAISS+SBERT

In [2]:
df=pd.read_csv("abcnews-date-text.csv")
data=df.headline_text.to_list()

In [3]:
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [4]:
len(df.headline_text)

1082168

In [5]:
data_df = df.headline_text[:50000]
len(data_df)

50000

In [6]:
data=data_df.to_list()

In [7]:
# Loading the pre-trained model 
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [8]:
encoded_data = model.encode(data)

In [9]:
# Let's define the index and add data to it
index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
index.add_with_ids(encoded_data, np.array(range(0, len(data))))

In [10]:
# Serializing the index
faiss.write_index(index, 'abc_news')

In [11]:
# Deserializing the index
index = faiss.read_index('abc_news')

In [12]:
def search(query):
  t=time.time()
  query_vector = model.encode([query])
  k = 5
  top_k = index.search(query_vector, k)
  print('totaltime: {}'.format(time.time()-t))
  return [data[_id] for _id in top_k[1].tolist()[0]]

In [13]:
# performing the search
query="terrorrist attacks american army"
results=search(query)
print('results :')
for result in results:
   print('\t',result)

totaltime: 0.06334853172302246
results :
	 attacks strike us troops iraq foreign ministry
	 bbc chief attacks us media war coverage
	 us troops attack iraqi resistance
	 iraq says hitting us forces with artillery
	 terrorist attacks a strategic failure us


In [14]:
query="Underwater forest discovered"
results=search(query)
print('results :')
for result in results:
   print('\t',result)

totaltime: 0.05944657325744629
results :
	 diver drowns off mornington peninsula
	 underwater flinders island tas electricity grid
	 eels edge lang park thriller
	 vanuatu tests waters with scuba post office
	 portland centre moves closer to underwater display


In [15]:
query="meteorite contains the oldest material on earth"
results=search(query)
print('results :')
for result in results:
   print('\t',result)

totaltime: 0.06379151344299316
results :
	 egypt finds oldest evidence of mummification
	 oldest fossil of modern man found
	 wa govt buys was oldest mine shaft
	 nasa finds oldest planet ever
	 nikolayeva becomes oldest world champion


##Loading data

In [17]:
keyword_df = pd.read_csv("keywords.csv")
keyword_df.head()

Unnamed: 0,CATEGORY0,CATEGORY1,CATEGORY2,SYNID,KEYWORDS
0,Keywords,Respiratory,,KW-rESPIRATORY01,Respiratory
1,Keywords,Alcohol,Alcohol,KW-ALCOHOL184,struggling with alcohol
2,Keywords,Alcohol,Alcohol,KW-ALCOHOL185,suspected alcohol abuse
3,Keywords,Alcohol,Alcohol,KW-ALCOHOL186,taking medication for alcohol
4,Keywords,Alcohol,Alcohol,KW-ALCOHOL187,tequilla


In [18]:
ocr_sample = pd.read_csv("ocr_page.csv", encoding="cp1252")
ocr_sample.head()

Unnamed: 0,PAGEID,OCR_CONTENT,LOAD_TIMESTAMP
0,104707,PACIFIC LIFE INSURANCE COMPANY\n\nLife Insuran...,26-JAN-22 07.04.57.000000000 PM
1,104708,\r\n\r\nTEMPORARY INSURANCE AGREEMENT (TIA)\r...,26-JAN-22 07.04.57.000000000 PM
2,104709,PACIFIC LIFE INSURANCE COMPANY\r\n\r\nLife Ins...,26-JAN-22 07.04.57.000000000 PM
3,104710,\n\nNON-FORFEITURE PREMIUM CESSATION OPTIONS ...,26-JAN-22 07.04.57.000000000 PM
4,104711,\n\nADDITIONAL POLICYOWNER (If more than one ...,26-JAN-22 07.04.57.000000000 PM


In [19]:
ocr_sample["OCR_CONTENT"][1]

' \r\n\r\nTEMPORARY INSURANCE AGREEMENT (TIA)\r\n\r\nQuali?cations\r\nTo qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r\nquestions on the Temporary Insurance Agreement.\r\nLimits\r\n- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r\n- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r\nInitial Premium\r\nThe minimum initial premium that can be submitted with the completed application and the TIA is either 1) 10% of the annual premium, or 2) greater of\r\none modal premium or three times the monthly initial minimum premium illustrated.\r\nSubmitting\r\n- It is preferred that the check for the TIA, application, and TIA form all have the same date.\r\n- If it is not possible to obtain a check at the time the application is completed, t

##Preparing query content

In [20]:
ocr_sample["OCR_CONTENT"][1].split("\n")[:10]

[' \r',
 '\r',
 'TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 '\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r']

In [21]:
# converting text to sentences
sentences_list = []
sentences = ocr_sample["OCR_CONTENT"][1].split("\n")
for sentence in sentences:
  if sentence != "":
    sentences_list.append(sentence)
sentences_list[:10]

[' \r',
 '\r',
 'TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 '\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r']

In [22]:
contents = ocr_sample["OCR_CONTENT"][1].replace("\r\n", "")
contents

' TEMPORARY INSURANCE AGREEMENT (TIA)Quali?cationsTo qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of thequestions on the Temporary Insurance Agreement.Limits- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001Initial PremiumThe minimum initial premium that can be submitted with the completed application and the TIA is either 1) 10% of the annual premium, or 2) greater ofone modal premium or three times the monthly initial minimum premium illustrated.Submitting- It is preferred that the check for the TIA, application, and TIA form all have the same date.- If it is not possible to obtain a check at the time the application is completed, the check may be dated up to 3 days later than the TIA and ap

##Keyword Search

https://towardsdatascience.com/billion-scale-semantic-similarity-search-with-faiss-sbert-c845614962e2

In [23]:
keyword_df["KEYWORDS"].head()

0                      Respiratory
1          struggling with alcohol
2          suspected alcohol abuse
3    taking medication for alcohol
4                         tequilla
Name: KEYWORDS, dtype: object

In [24]:
def make_keyword_index(data_df):
  data=data_df["KEYWORDS"].to_list()
  # Encoding keywords
  encoded_data = model.encode(data)
  # Let's define the index and add data to it
  index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
  index.add_with_ids(encoded_data, np.array(range(0, len(data))))
  # Serializing the index
  faiss.write_index(index, 'abc_news')
  # Deserializing the index
  index = faiss.read_index('abc_news')
  return index

keyword_index = make_keyword_index(keyword_df)

In [25]:
def search(query):
  t=time.time()
  query_vector = model.encode([query])
  k = 5
  top_k = keyword_index.search(query_vector, k)
  print('totaltime: {}'.format(time.time()-t))
  return [data[_id] for _id in top_k[1].tolist()[0]]

In [28]:
# performing the search
results=search(ocr_sample["OCR_CONTENT"][0])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.019058704376220703
results :
	 beckham leads as man u cut down depleted juve
	 flight attendant action wont affect bundaberg
	 french expedition site to be heritage protected
	 opposition urged to help protect recherche bay
	 canegrowers hope for late summer rain


In [29]:
results=search(ocr_sample["OCR_CONTENT"][1])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.017243385314941406
results :
	 opposition urged to help protect recherche bay
	 orientation begins for uni students
	 onesteel to invest 80m in whyalla steelworks
	 brigadier dismisses reports troops harassed in
	 flight attendant action wont affect bundaberg


In [30]:
results=search(ocr_sample["OCR_CONTENT"][2])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.017183780670166016
results :
	 opposition urged to help protect recherche bay
	 beckham leads as man u cut down depleted juve
	 orientation begins for uni students
	 mp rejects ambulance levy claims
	 last minute call hands alinghi big lead


In [31]:
results=search(ocr_sample["OCR_CONTENT"][3])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.021990537643432617
results :
	 flight attendant action wont affect bundaberg
	 last minute call hands alinghi big lead
	 french expedition site to be heritage protected
	 health minister backs organ and tissue storage
	 blizzard buries united states in bills


In [32]:
results=search(ocr_sample["OCR_CONTENT"][4])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.020436525344848633
results :
	 martin to lobby against losing nt seat in fed
	 flight attendant action wont affect bundaberg
	 blizzard buries united states in bills
	 french expedition site to be heritage protected
	 last minute call hands alinghi big lead


In [36]:
query = "American celebrity public speech"
results=search("Each sentence is converted")
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.013369321823120117
results :
	 reading go third in first division
	 aussie qualifier stosur wastes four memphis match
	 police defend aboriginal tent embassy raid
	 refshauge wins defamation court case
	 code of conduct toughens organ donation regulations
