<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/07_icd_code_matching_using_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

[PDF-to-TEXT](https://pypi.org/project/pdftotext/)

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2

Just restart the colab environment.

In [1]:
import pandas as pd
import numpy as np
import re
import time
import os

import pdb

import cv2 
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk

##PDF text extraction

In [2]:
!mkdir pdf-files
!mkdir txt-files

In [3]:
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"

def split_pdf(pdf_path):
  pdf_in_file = open(pdf_path, "rb")
  pdf = PdfFileReader(pdf_in_file)
  pdf_list = []
  for page in range(pdf.numPages):
      inputpdf = PdfFileReader(pdf_in_file)
      output = PdfFileWriter()
      output.addPage(inputpdf.getPage(page))
      with open(f"{pdf_files_path}/pdf-page-{page}.pdf", "wb") as outputStream:
          output.write(outputStream)
          pdf_list.append(f"pdf-page-{page}.pdf")
  return pdf_list

In [4]:
def extract_text_from_pdf(pdf_list):
  txt_file_list = []
  i = 0
  for pdf_file in pdf_list:
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)
    
    # Read all the text into one string
    pdf_text = "\n\n".join(pdf)

    # write text into file
    with open(f"{txt_files_path}/pdf-page-{str(i)}.txt", "a") as f:
      f.write(pdf_text)
    txt_file_list.append(f"{txt_files_path}/pdf-page-{str(i)}.txt")
    i += 1
  return txt_file_list

In [7]:
pdf_list = split_pdf("Redacted_Sample_2.pdf")

In [8]:
txt_list = extract_text_from_pdf(pdf_list)

In [10]:
txt_list[:10]

['txt-files/pdf-page-0.txt',
 'txt-files/pdf-page-1.txt',
 'txt-files/pdf-page-2.txt',
 'txt-files/pdf-page-3.txt',
 'txt-files/pdf-page-4.txt',
 'txt-files/pdf-page-5.txt',
 'txt-files/pdf-page-6.txt',
 'txt-files/pdf-page-7.txt',
 'txt-files/pdf-page-8.txt',
 'txt-files/pdf-page-9.txt']

In [9]:
txt_list[:10]

['txt-files/pdf-page-0.txt',
 'txt-files/pdf-page-1.txt',
 'txt-files/pdf-page-2.txt',
 'txt-files/pdf-page-3.txt',
 'txt-files/pdf-page-4.txt',
 'txt-files/pdf-page-5.txt',
 'txt-files/pdf-page-6.txt',
 'txt-files/pdf-page-7.txt',
 'txt-files/pdf-page-8.txt',
 'txt-files/pdf-page-9.txt']

##Spacy entity rule-matcher

In [12]:
nlp = spacy.load("en_core_web_sm")
nlp = English()
ruler = nlp.add_pipe("entity_ruler")

In [11]:
icd_code_df = pd.read_csv("icd_10_codes.csv")
icd_code_df.head()

Unnamed: 0,ICD-10
0,A00.0
1,A00.1
2,A00.9
3,A01.00
4,A01.01


In [5]:
def make_icd_10_code_pattern(icd_10_code_df):
  patterns = []
  for _, row in icd_10_code_df.iterrows():
    patterns.append({"label": "ICD-10", "pattern": row["ICD-10"]})
  return patterns

In [15]:
patterns = make_icd_10_code_pattern(icd_code_df)
patterns[:10]

[{'label': 'ICD-10', 'pattern': 'A00.0'},
 {'label': 'ICD-10', 'pattern': 'A00.1'},
 {'label': 'ICD-10', 'pattern': 'A00.9'},
 {'label': 'ICD-10', 'pattern': 'A01.00'},
 {'label': 'ICD-10', 'pattern': 'A01.01'},
 {'label': 'ICD-10', 'pattern': 'A01.02'},
 {'label': 'ICD-10', 'pattern': 'A01.03'},
 {'label': 'ICD-10', 'pattern': 'A01.04'},
 {'label': 'ICD-10', 'pattern': 'A01.05'},
 {'label': 'ICD-10', 'pattern': 'A01.09'}]

In [16]:
ruler.add_patterns(patterns)

In [None]:
icd_code_df.loc[icd_code_df["ICD-10" == "Z00.0"]]

In [None]:
with open(f"{txt_files_path}/pdf-page-102.txt", "r") as f:
  one_txt = f.read()
  print(one_txt)
  doc = nlp(one_txt)
  print([(ent.text, ent.label_) for ent in doc.ents])

##Performance Testing

In [8]:
my_txt = "txt-files/pdf-page-0.txt"
my_txt = my_txt.split("/")[1]
my_txt

'pdf-page-0.txt'

In [6]:
nlp = spacy.load("en_core_web_sm")
nlp = English()
ruler = nlp.add_pipe("entity_ruler")

In [None]:
%%time

# Step-1: spliting pdf file
pdf_list = split_pdf("Redacted_Sample_2.pdf")

# Step-2: Extracting text from pdf
txt_list = extract_text_from_pdf(pdf_list)

# Step-3: Preparing Spacy's patterns
patterns = make_icd_10_code_pattern(pd.read_csv("icd_10_codes.csv"))

# Step-4: adding patterns to Spacy
ruler.add_patterns(patterns)

# Step-5: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())
    print(f"Page[{txt_file.split('/')[1]}]: ", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
found_page = {}
code_list = []

# Step-5: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())

    code_found = False
    for ent in doc.ents:
      if ent is not None:
        print(ent.text)
        code_found = True
        code_list.append(ent.text)
    if code_found:
      found_page[f"Page[{txt_file.split('/')[1]}]"] = code_list

In [47]:
True if re.search("L[0-9{2,}]", "L14") else False

True

In [38]:
True if re.search("B[IAU]G", "BUG") else False

True

In [32]:
# Step-5: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())
    code_list = [ent.text for ent in doc.ents]
    if len(code_list) != 0:
      print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
    #print([f"Page[{txt_file.split('/')[1]}]: {(ent.text, ent.label_)}" for ent in doc.ents if ent is not None])

Page[pdf-page-2.txt]: ['L14', 'L14']
Page[pdf-page-3.txt]: ['L14', 'L14', 'L22', 'L22', 'L14', 'L14', 'L22', 'L22']
Page[pdf-page-4.txt]: ['L26', 'L26', 'L26', 'L26', 'L26', 'L22', 'L14', 'L22', 'L14', 'L26', 'L26', 'L26', 'L26', 'L26', 'L22', 'L14', 'L22', 'L14']
Page[pdf-page-6.txt]: ['L14', 'L14', 'L14', 'L14']
Page[pdf-page-7.txt]: ['L14', 'L14', 'L14', 'L14', 'L14', 'L14', 'L14', 'L14']
Page[pdf-page-8.txt]: ['L26', 'L26', 'L26', 'L26', 'L26', 'L26']
Page[pdf-page-9.txt]: ['L26', 'L26', 'L22', 'L26', 'L22', 'L26', 'L22', 'L22', 'L22', 'L26', 'L26', 'L22', 'L26', 'L22', 'L26', 'L22', 'L22', 'L22']
Page[pdf-page-10.txt]: ['L22', 'L26', 'L14', 'L22', 'L26', 'L14']
Page[pdf-page-11.txt]: ['L22', 'L22', 'L22', 'L22', 'L22', 'L22']
Page[pdf-page-12.txt]: ['L14', 'L14']
Page[pdf-page-18.txt]: ['M54.40', 'M54.40']
Page[pdf-page-19.txt]: ['G43.109', 'G43.109']
Page[pdf-page-26.txt]: ['D17.1', 'F43.9', 'D17.1', 'F43.9']
Page[pdf-page-37.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30', 'E88.2

In [14]:
# Step-5: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())
    print(f"Page[{txt_file.split('/')[1]}]: ", [(ent.text, ent.label_) for ent in doc.ents if ent is not None])
    #print([f"Page[{txt_file.split('/')[1]}]: {(ent.text, ent.label_)}" for ent in doc.ents if ent is not None])
    for ent in doc.ents:
      if ent is not None:
        print(ent.text)

Page[pdf-page-0.txt]:  []
Page[pdf-page-1.txt]:  []
Page[pdf-page-2.txt]:  [('L14', 'ICD-10'), ('L14', 'ICD-10')]
Page[pdf-page-3.txt]:  [('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L22', 'ICD-10'), ('L22', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L22', 'ICD-10'), ('L22', 'ICD-10')]
Page[pdf-page-4.txt]:  [('L26', 'ICD-10'), ('L26', 'ICD-10'), ('L26', 'ICD-10'), ('L26', 'ICD-10'), ('L26', 'ICD-10'), ('L22', 'ICD-10'), ('L14', 'ICD-10'), ('L22', 'ICD-10'), ('L14', 'ICD-10'), ('L26', 'ICD-10'), ('L26', 'ICD-10'), ('L26', 'ICD-10'), ('L26', 'ICD-10'), ('L26', 'ICD-10'), ('L22', 'ICD-10'), ('L14', 'ICD-10'), ('L22', 'ICD-10'), ('L14', 'ICD-10')]
Page[pdf-page-5.txt]:  []
Page[pdf-page-6.txt]:  [('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10')]
Page[pdf-page-7.txt]:  [('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10'), ('L14', 'ICD-10')]
Page[pdf-page-8.txt]:  [('L26', 'ICD-10'),

##Loading data

In [None]:
keyword_df = pd.read_csv("keywords.csv")
keyword_df.head()

Unnamed: 0,CATEGORY0,CATEGORY1,CATEGORY2,SYNID,KEYWORDS
0,Keywords,Respiratory,,KW-rESPIRATORY01,Respiratory
1,Keywords,Alcohol,Alcohol,KW-ALCOHOL184,struggling with alcohol
2,Keywords,Alcohol,Alcohol,KW-ALCOHOL185,suspected alcohol abuse
3,Keywords,Alcohol,Alcohol,KW-ALCOHOL186,taking medication for alcohol
4,Keywords,Alcohol,Alcohol,KW-ALCOHOL187,tequilla


In [None]:
len(set(keyword_df["KEYWORDS"].to_list()))

284

In [None]:
# remove duplicates and NaN
keywords = [word for word in list(set(keyword_df["KEYWORDS"].to_list())) if type(word) is str]

In [None]:
ocr_sample = pd.read_csv("ocr_page.csv", encoding="cp1252")
ocr_sample.head()

Unnamed: 0,PAGEID,OCR_CONTENT,LOAD_TIMESTAMP
0,104707,PACIFIC LIFE INSURANCE COMPANY\n\nLife Insuran...,26-JAN-22 07.04.57.000000000 PM
1,104708,\r\n\r\nTEMPORARY INSURANCE AGREEMENT (TIA)\r...,26-JAN-22 07.04.57.000000000 PM
2,104709,PACIFIC LIFE INSURANCE COMPANY\r\n\r\nLife Ins...,26-JAN-22 07.04.57.000000000 PM
3,104710,\n\nNON-FORFEITURE PREMIUM CESSATION OPTIONS ...,26-JAN-22 07.04.57.000000000 PM
4,104711,\n\nADDITIONAL POLICYOWNER (If more than one ...,26-JAN-22 07.04.57.000000000 PM


##Preparing query content

In [None]:
ocr_sample["OCR_CONTENT"][1].split("\n")[:10]

[' \r',
 '\r',
 'TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 '\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r']

In [None]:
# converting text to sentences
sentences_list = []
sentences = ocr_sample["OCR_CONTENT"][1].split("\n")
for sentence in sentences:
  if sentence != "":
    sentences_list.append(sentence)
sentences_list[:10]

[' \r',
 '\r',
 'TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 '\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r']

In [None]:
contents = ocr_sample["OCR_CONTENT"][1].replace("\r\n", "").split(".")
contents

[' TEMPORARY INSURANCE AGREEMENT (TIA)Quali?cationsTo qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of thequestions on the Temporary Insurance Agreement',
 'Limits- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001Initial PremiumThe minimum initial premium that can be submitted with the completed application and the TIA is either 1) 10% of the annual premium, or 2) greater ofone modal premium or three times the monthly initial minimum premium illustrated',
 'Submitting- It is preferred that the check for the TIA, application, and TIA form all have the same date',
 '- If it is not possible to obtain a check at the time the application is completed, the check may be dated up to 3 days later th

##Encoding keywords

In [None]:
# Loading the pre-trained model 
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [None]:
# create keywords embeddings
keyword_embeddings = model.encode(keywords)

In [None]:
keyword_embeddings.shape

(284, 768)

In [None]:
len(keyword_embeddings[0])

768

##Create FAISS index

In [None]:
d = keyword_embeddings.shape[1]
d

768

In [None]:
# Let's define the index and add keywords to it
index = faiss.IndexFlatL2(d)
index.is_trained

True

In [None]:
index.add(keyword_embeddings)

In [None]:
index.ntotal

284

##Search keyword

Then search given a query and number of nearest neigbors to return k.

In [None]:
k = 10
xq = model.encode(["taking medication for alcohol"])

In [None]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[262 144 177  54  32 158 166 140 226 259]]
CPU times: user 823 µs, sys: 804 µs, total: 1.63 ms
Wall time: 1.11 ms


In [None]:
I.tolist()[0]

[262, 144, 177, 54, 32, 158, 166, 140, 226, 259]

In [None]:
keyword_df["KEYWORDS"].iloc[I.tolist()[0]]

262                                    no use of alcohol
144                                    excessive alcohol
177                                    hairline fracture
54     acting psychotic while under the influence of ...
32                                         4 binges/year
158                                          broken ribs
166                                  disclocated sholder
140                                                  DUI
226                                          pain relief
259                              no alcohol restrictions
Name: KEYWORDS, dtype: object

Now, if we’d rather extract the numerical vectors from Faiss, we can do that too.

In [None]:
# we have k vectors to return (k) - so we initialize a zero array to hold them
vectors = np.zeros((k, d))
for i, val in enumerate(I[0].tolist()):
  vectors[i, :] = index.reconstruct(val)

vectors.shape

(10, 768)

In [None]:
I.tolist()

[[262, 144, 177, 54, 32, 158, 166, 140, 226, 259]]

##Generic keyword search

https://www.pinecone.io/learn/faiss-tutorial/

In [None]:
def make_keyword_index(keyword_embeddings):
  d = keyword_embeddings.shape[1]

  # Let's define the index and add keywords to it
  index = faiss.IndexFlatL2(d)
  index.add(keyword_embeddings)

  return index

keyword_index = make_keyword_index(keyword_embeddings)

In [None]:
def search(query, keyword_index):
  t=time.time()
  query_vector = model.encode([query])
  k = 2
  D, I = keyword_index.search(query_vector, k)
  #print('totaltime: {}'.format(time.time()-t))
  return I.tolist()[0]

In [None]:
len(ocr_sample["OCR_CONTENT"][0])

3696

In [None]:
sentences_list[:10]

[' \r',
 '\r',
 'TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 '\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r']

In [None]:
# performing the search
result_list = []
content_list = []
for sentence in contents:
  if  sentence not in ["\r", " \r"]: 
    print(f"sentence: {sentence}")
    results=search(sentence, keyword_index)
    print(f'results :')
    for result in keyword_df["KEYWORDS"].iloc[results]:
      print('\t', result)
      result_list.append(result)
    content_list.append(sentence)

sentence:  TEMPORARY INSURANCE AGREEMENT (TIA)Quali?cationsTo qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of thequestions on the Temporary Insurance Agreement
results :
	 Blood
	 social use of alcohol
sentence: Limits- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products
results :
	 advised to limit alcohol consumption
	 under the influence
sentence: - For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001Initial PremiumThe minimum initial premium that can be submitted with the completed application and the TIA is either 1) 10% of the annual premium, or 2) greater ofone modal premium or three times the monthly initial minimum premium illustrated
results :
	 Blood
	 should not drink
sentence: Submitting- It is preferred that the check for the TIA, appli

In [None]:
content_list[:10]

['TEMPORARY INSURANCE AGREEMENT (TIA)\r',
 'Quali?cations\r',
 'To qualify for temporary alcoholic hepatitis, the Proposed Insured(s) must be over 15 days of age or under age 70, and able to answer “No" to all of the\r',
 'questions on the Temporary Insurance Agreement.\r',
 'Limits\r',
 '- The TIA is limited to $1,000,000 for individual products and $1,500,000 for Second-to-Die products.\r',
 '- For Monthly Bene?t Term Life Insurance only, the TIA is limited to the lesser ofthe Monthly Bene?t applied for or a Monthly Bene?t amount of $5,001\r',
 'Initial Premium\r',
 'The minimum initial premium that can be submitted with the completed application and the TIA is either 1) 10% of the annual premium, or 2) greater of\r',
 'one modal premium or three times the monthly initial minimum premium illustrated.\r']

In [None]:
# performing the search
results=search(ocr_sample["OCR_CONTENT"][0])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.019058704376220703
results :
	 beckham leads as man u cut down depleted juve
	 flight attendant action wont affect bundaberg
	 french expedition site to be heritage protected
	 opposition urged to help protect recherche bay
	 canegrowers hope for late summer rain


In [None]:
results=search(ocr_sample["OCR_CONTENT"][1])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.017243385314941406
results :
	 opposition urged to help protect recherche bay
	 orientation begins for uni students
	 onesteel to invest 80m in whyalla steelworks
	 brigadier dismisses reports troops harassed in
	 flight attendant action wont affect bundaberg


In [None]:
results=search(ocr_sample["OCR_CONTENT"][2])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.017183780670166016
results :
	 opposition urged to help protect recherche bay
	 beckham leads as man u cut down depleted juve
	 orientation begins for uni students
	 mp rejects ambulance levy claims
	 last minute call hands alinghi big lead


In [None]:
results=search(ocr_sample["OCR_CONTENT"][3])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.021990537643432617
results :
	 flight attendant action wont affect bundaberg
	 last minute call hands alinghi big lead
	 french expedition site to be heritage protected
	 health minister backs organ and tissue storage
	 blizzard buries united states in bills


In [None]:
results=search(ocr_sample["OCR_CONTENT"][4])
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.020436525344848633
results :
	 martin to lobby against losing nt seat in fed
	 flight attendant action wont affect bundaberg
	 blizzard buries united states in bills
	 french expedition site to be heritage protected
	 last minute call hands alinghi big lead


In [None]:
query = "American celebrity public speech"
results=search("Each sentence is converted")
print('results :')
for result in results:
   print('\t', result)

totaltime: 0.013369321823120117
results :
	 reading go third in first division
	 aussie qualifier stosur wastes four memphis match
	 police defend aboriginal tent embassy raid
	 refshauge wins defamation court case
	 code of conduct toughens organ donation regulations
