<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/06_text_similarity_with_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

In [None]:
!pip -q install beautifulsoup4

In [3]:
import pandas as pd
import numpy as np
import re
import time
import os

import pdb

import spacy
from spacy.matcher import PhraseMatcher

import bs4 as bs  
import urllib.request 
import nltk

##Spacy phrase matcher

Reference:

https://stackabuse.com/python-for-nlp-vocabulary-and-phrase-matching-with-spacy/

###Scraping data

In [4]:
scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')  
article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:  
  article_text += p.text
    
processed_article = article_text.lower()  
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )  
processed_article = re.sub(r'\s+', ' ', processed_article)

In [5]:
processed_article

' artificial intelligence ai is intelligence demonstrated by machines as opposed to the natural intelligence displayed by animals and humans ai research has been defined as the field of study of intelligent agents which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals a the term artificial intelligence had previously been used to describe machines that mimic and display human cognitive skills that are associated with the human mind such as learning and problem solving this definition has since been rejected by major ai researchers who now describe ai in terms of rationality and acting rationally which does not limit how intelligence can be articulated b ai applications include advanced web search engines e g google recommendation systems used by youtube amazon and netflix understanding human speech such as siri and alexa self driving cars e g tesla automated decision making and competing at the highest level in strate

###Creating Phrase Matcher

In [6]:
nlp = spacy.load("en_core_web_sm")
phrase_matcher = PhraseMatcher(nlp.vocab)

###Creating Phrase List

In [7]:
phrases = [
  "machine learning", "robots", "intelligent agents"
]

patterns = [nlp(text) for text in phrases]

In [8]:
# let's add new phrase into list
phrase_matcher.add("AI", None, *patterns)

###Applying Matcher to Document

In [9]:
# convert article into spacy document format
sentence = nlp(processed_article)

matched_phrases = phrase_matcher(sentence)

In [10]:
# see the string value of the matched phrases
for match_id, start, end in matched_phrases:
  string_id = nlp.vocab.strings[match_id]
  span = sentence[start: end]
  print(match_id, string_id, start, end, span.text)

5530044837203964789 AI 31 33 intelligent agents
5530044837203964789 AI 292 294 machine learning
5530044837203964789 AI 1215 1216 robots
5530044837203964789 AI 1352 1354 machine learning
5530044837203964789 AI 2053 2055 machine learning
5530044837203964789 AI 2550 2551 robots
5530044837203964789 AI 3672 3674 machine learning
5530044837203964789 AI 3686 3688 machine learning
5530044837203964789 AI 4847 4849 machine learning
5530044837203964789 AI 4869 4871 machine learning
5530044837203964789 AI 5321 5323 machine learning
5530044837203964789 AI 5348 5350 machine learning
5530044837203964789 AI 6565 6566 robots
5530044837203964789 AI 6734 6736 machine learning
5530044837203964789 AI 6813 6814 robots
5530044837203964789 AI 6814 6816 machine learning
5530044837203964789 AI 7201 7202 robots
5530044837203964789 AI 7760 7761 robots
5530044837203964789 AI 8320 8321 robots
5530044837203964789 AI 9047 9049 machine learning
5530044837203964789 AI 9524 9526 machine learning
5530044837203964789 AI 9

##Loading data

In [None]:
keyword_df = pd.read_csv("keywords.csv")
keyword_df.head()

Unnamed: 0,CATEGORY0,CATEGORY1,CATEGORY2,SYNID,KEYWORDS
0,Keywords,Respiratory,,KW-rESPIRATORY01,Respiratory
1,Keywords,Alcohol,Alcohol,KW-ALCOHOL184,struggling with alcohol
2,Keywords,Alcohol,Alcohol,KW-ALCOHOL185,suspected alcohol abuse
3,Keywords,Alcohol,Alcohol,KW-ALCOHOL186,taking medication for alcohol
4,Keywords,Alcohol,Alcohol,KW-ALCOHOL187,tequilla


In [None]:
len(set(keyword_df["KEYWORDS"].to_list()))

284

In [None]:
# remove duplicates and NaN
keywords = [word for word in list(set(keyword_df["KEYWORDS"].to_list())) if type(word) is str]

In [None]:
ocr_sample = pd.read_csv("ocr_page.csv", encoding="cp1252")
ocr_sample.head()

Unnamed: 0,PAGEID,OCR_CONTENT,LOAD_TIMESTAMP
0,104707,PACIFIC LIFE INSURANCE COMPANY\n\nLife Insuran...,26-JAN-22 07.04.57.000000000 PM
1,104708,\r\n\r\nTEMPORARY INSURANCE AGREEMENT (TIA)\r...,26-JAN-22 07.04.57.000000000 PM
2,104709,PACIFIC LIFE INSURANCE COMPANY\r\n\r\nLife Ins...,26-JAN-22 07.04.57.000000000 PM
3,104710,\n\nNON-FORFEITURE PREMIUM CESSATION OPTIONS ...,26-JAN-22 07.04.57.000000000 PM
4,104711,\n\nADDITIONAL POLICYOWNER (If more than one ...,26-JAN-22 07.04.57.000000000 PM


##Preparing query content

In [None]:
ocr_sample["OCR_CONTENT"][1].split("\n")[:10]

[' \r',
 '\r',
 'TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 '\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r']

In [None]:
# converting text to sentences
sentences_list = []
sentences = ocr_sample["OCR_CONTENT"][1].split("\n")
for sentence in sentences:
  if sentence != "":
    sentences_list.append(sentence)
sentences_list[:10]

[' \r',
 '\r',
 'TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 '\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r']

In [None]:
contents = ocr_sample["OCR_CONTENT"][1].replace("\r\n", "").split(".")
contents

[' TEMPORARY INSURANCE AGREEMENT (TIA)Quali?cationsTo qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of thequestions on the Temporary Insurance Agreement',
 'Limits- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001Initial PremiumThe minimum initial premium that can be submitted with the completed application and the TIA is either 1) 10% of the annual premium, or 2) greater ofone modal premium or three times the monthly initial minimum premium illustrated',
 'Submitting- It is preferred that the check for the TIA, application, and TIA form all have the same date',
 '- If it is not possible to obtain a check at the time the application is completed, the check may be dated up to 3 days later th

##Encoding keywords

In [None]:
# Loading the pre-trained model 
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [None]:
# create keywords embeddings
keyword_embeddings = model.encode(keywords)

In [None]:
keyword_embeddings.shape

(284, 768)

In [None]:
len(keyword_embeddings[0])

768

##Create FAISS index

In [None]:
d = keyword_embeddings.shape[1]
d

768

In [None]:
# Let's define the index and add keywords to it
index = faiss.IndexFlatL2(d)
index.is_trained

True

In [None]:
index.add(keyword_embeddings)

In [None]:
index.ntotal

284

##Search keyword

Then search given a query and number of nearest neigbors to return k.

In [None]:
k = 10
xq = model.encode(["taking medication for alcohol"])

In [None]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[262 144 177  54  32 158 166 140 226 259]]
CPU times: user 823 µs, sys: 804 µs, total: 1.63 ms
Wall time: 1.11 ms


In [None]:
I.tolist()[0]

[262, 144, 177, 54, 32, 158, 166, 140, 226, 259]

In [None]:
keyword_df["KEYWORDS"].iloc[I.tolist()[0]]

262                                    no use of alcohol
144                                    excessive alcohol
177                                    hairline fracture
54     acting psychotic while under the influence of ...
32                                         4 binges/year
158                                          broken ribs
166                                  disclocated sholder
140                                                  DUI
226                                          pain relief
259                              no alcohol restrictions
Name: KEYWORDS, dtype: object

Now, if we’d rather extract the numerical vectors from Faiss, we can do that too.

In [None]:
# we have k vectors to return (k) - so we initialize a zero array to hold them
vectors = np.zeros((k, d))
for i, val in enumerate(I[0].tolist()):
  vectors[i, :] = index.reconstruct(val)

vectors.shape

(10, 768)

In [None]:
I.tolist()

[[262, 144, 177, 54, 32, 158, 166, 140, 226, 259]]

##Generic keyword search

https://www.pinecone.io/learn/faiss-tutorial/

In [None]:
def make_keyword_index(keyword_embeddings):
  d = keyword_embeddings.shape[1]

  # Let's define the index and add keywords to it
  index = faiss.IndexFlatL2(d)
  index.add(keyword_embeddings)

  return index

keyword_index = make_keyword_index(keyword_embeddings)

In [None]:
def search(query, keyword_index):
  t=time.time()
  query_vector = model.encode([query])
  k = 2
  D, I = keyword_index.search(query_vector, k)
  #print('totaltime: {}'.format(time.time()-t))
  return I.tolist()[0]

In [None]:
len(ocr_sample["OCR_CONTENT"][0])

3696

In [None]:
sentences_list[:10]

[' \r',
 '\r',
 'TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 '\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r']

In [None]:
# performing the search
result_list = []
content_list = []
for sentence in contents:
  if  sentence not in ["\r", " \r"]: 
    print(f"sentence: {sentence}")
    results=search(sentence, keyword_index)
    print(f'results :')
    for result in keyword_df["KEYWORDS"].iloc[results]:
      print('\t', result)
      result_list.append(result)
    content_list.append(sentence)

sentence:  TEMPORARY INSURANCE AGREEMENT (TIA)Quali?cationsTo qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of thequestions on the Temporary Insurance Agreement
results :
	 Blood
	 social use of alcohol
sentence: Limits- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products
results :
	 advised to limit alcohol consumption
	 under the influence
sentence: - For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001Initial PremiumThe minimum initial premium that can be submitted with the completed application and the TIA is either 1) 10% of the annual premium, or 2) greater ofone modal premium or three times the monthly initial minimum premium illustrated
results :
	 Blood
	 should not drink
sentence: Submitting- It is preferred that the check for the TIA, appli

In [None]:
content_list[:10]

['TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r',
 'Initial Premium\r',
 'The minimum initial premium that can be submitted with the completed application and the TIA is either 1) 10% of the annual premium, or 2) greater of\r',
 'one modal premium or three times the monthly initial minimum premium illustrated.\r']

In [None]:
# performing the search
results=search(ocr_sample["OCR_CONTENT"][0])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.019058704376220703
results :
	 beckham leads as man u cut down depleted juve
	 flight attendant action wont affect bundaberg
	 french expedition site to be heritage protected
	 opposition urged to help protect recherche bay
	 canegrowers hope for late summer rain


In [None]:
results=search(ocr_sample["OCR_CONTENT"][1])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.017243385314941406
results :
	 opposition urged to help protect recherche bay
	 orientation begins for uni students
	 onesteel to invest 80m in whyalla steelworks
	 brigadier dismisses reports troops harassed in
	 flight attendant action wont affect bundaberg


In [None]:
results=search(ocr_sample["OCR_CONTENT"][2])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.017183780670166016
results :
	 opposition urged to help protect recherche bay
	 beckham leads as man u cut down depleted juve
	 orientation begins for uni students
	 mp rejects ambulance levy claims
	 last minute call hands alinghi big lead


In [None]:
results=search(ocr_sample["OCR_CONTENT"][3])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.021990537643432617
results :
	 flight attendant action wont affect bundaberg
	 last minute call hands alinghi big lead
	 french expedition site to be heritage protected
	 health minister backs organ and tissue storage
	 blizzard buries united states in bills


In [None]:
results=search(ocr_sample["OCR_CONTENT"][4])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.020436525344848633
results :
	 martin to lobby against losing nt seat in fed
	 flight attendant action wont affect bundaberg
	 blizzard buries united states in bills
	 french expedition site to be heritage protected
	 last minute call hands alinghi big lead


In [None]:
query = "American celebrity public speech"
results=search("Each sentence is converted")
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.013369321823120117
results :
	 reading go third in first division
	 aussie qualifier stosur wastes four memphis match
	 police defend aboriginal tent embassy raid
	 refshauge wins defamation court case
	 code of conduct toughens organ donation regulations
