<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/13_icd_10_code_and_keyword_exact_match_highliting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

[PDF-to-TEXT](https://pypi.org/project/pdftotext/)

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

After the installation finishes, restart the Colab runtime so that the newly installed packages are picked up.

In [1]:
import pandas as pd
import numpy as np
import re
import time
import os

import pdb

import fitz
import cv2 
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk

In [2]:
!mkdir pdf-files
!mkdir txt-files

In [3]:
# Folder that receives the single-page PDFs written by split_pdf
pdf_files_path = "pdf-files"
# Folder that receives the per-page text files written by extract_text_from_pdf
txt_files_path = "txt-files"

## Define some functions

In [4]:
def split_pdf(pdf_path):
  """Split a PDF into one single-page PDF per page.

  Every page is written to ``{pdf_files_path}/page-{n}.pdf`` (``n`` is the
  0-based page index); an existing file with that name is overwritten.

  Args:
    pdf_path: path of the PDF file to split.

  Returns:
    List of the generated file names (``"page-0.pdf"``, ``"page-1.pdf"``, ...)
    in page order. The names are relative to ``pdf_files_path``.
  """
  pdf_list = []
  # Hold the source open only while pages are copied out, so the handle is
  # always released (it used to stay open for the life of the kernel).
  with open(pdf_path, "rb") as pdf_in_file:
    # Parse the document once and reuse the reader for every page.
    pdf = PdfFileReader(pdf_in_file)
    for page in range(pdf.numPages):
      output = PdfFileWriter()
      output.addPage(pdf.getPage(page))
      page_name = f"page-{page}.pdf"
      with open(f"{pdf_files_path}/{page_name}", "wb") as outputStream:
        output.write(outputStream)
      pdf_list.append(page_name)
  return pdf_list

In [5]:
def extract_text_from_pdf(pdf_list):
  """Extract the text of each single-page PDF into its own text file.

  The i-th entry of ``pdf_list`` (looked up under ``pdf_files_path``) is
  written to ``{txt_files_path}/page-{i}.txt``.

  Args:
    pdf_list: file names of the page PDFs, e.g. as returned by ``split_pdf``.

  Returns:
    List of the text file paths that were written, in the same order.
  """
  txt_file_list = []
  for i, pdf_file in enumerate(pdf_list):
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)

    # Read all the text into one string
    pdf_text = "\n\n".join(pdf)

    # Overwrite rather than append: the page-N.txt names are reused for every
    # document (and on every re-run), so appending would mix the text of
    # different documents into one file and yield matches on the wrong pages.
    txt_file = f"{txt_files_path}/page-{i}.txt"
    with open(txt_file, "w") as f:
      f.write(pdf_text)
    txt_file_list.append(txt_file)
  return txt_file_list

In [6]:
def get_opt_pattern(icd_10_code):
  """Build alternate spellings of an ICD-10 code with spaces around the dot.

  For ``"A00.0"`` the variants are ``"A00. 0"``, ``"A00 .0"`` and
  ``"A00 . 0"``. A code without a dot has no such variants and is returned
  unchanged as the only element.

  Args:
    icd_10_code: the code as a string, e.g. ``"G43.109"`` or ``"R42"``.

  Returns:
    A list of strings in every case. Returning a bare string for dot-less
    codes made callers that loop over the result iterate its characters.
  """
  code_arr = icd_10_code.split(".")
  if len(code_arr) > 1:
    prefix, suffix = code_arr[0], code_arr[1]
    return [f"{prefix}. {suffix}", f"{prefix} .{suffix}", f"{prefix} . {suffix}"]
  return [icd_10_code]

In [7]:
def highlight_icd10_code_and_keyword(pdf_code_dict, page_keyword_dict, pdf_file_name):
  """Highlight ICD-10 codes and keywords in a PDF and log their coordinates.

  For every page, each code listed for that page is searched in the page
  text; when the exact code is not found, the spaced variants from
  ``get_opt_pattern`` are tried instead. Each keyword listed for the page is
  searched as well. Every hit gets a highlight annotation (keywords use the
  stroke colour ``[1, 0.8, 0.8]``), and one line
  ``Page-{page}: {term} : {rectangles}`` is written per matched term.

  Args:
    pdf_code_dict: mapping of 0-based page number -> iterable of codes.
    page_keyword_dict: mapping of 0-based page number -> iterable of keywords.
    pdf_file_name: path of the PDF to annotate; the file itself is not modified.

  Returns:
    Tuple ``(pdf_output_file_name, txt_file_name)``: the annotated copy
    (``<base>_output.pdf``) and the coordinates file (``<base>_cords.txt``),
    where ``<base>`` is ``pdf_file_name`` without its final extension.
  """
  # splitext removes only the final extension, so "./doc.pdf" or "a.b.pdf"
  # keep the rest of their path/name (split('.')[0] truncated them).
  base_name = os.path.splitext(pdf_file_name)[0]
  txt_file_name = f"{base_name}_cords.txt"
  pdf_output_file_name = f"{base_name}_output.pdf"

  def add_highlights(page, rects, stroke=None):
    # One highlight annotation per hit rectangle.
    for rect in rects:
      annot = page.add_highlight_annot(rect)
      if stroke is not None:
        annot.set_colors(stroke=stroke)
      annot.update()

  pdf_file = fitz.open(pdf_file_name)
  try:
    # create file to write coordinates
    with open(txt_file_name, "w") as cords_file:
      for page_num, page in enumerate(pdf_file):

        # highlight code
        if page_num in pdf_code_dict:
          for code in pdf_code_dict[page_num]:
            hits = page.search_for(code)
            if hits:
              candidates = [(code, hits)]
            else:
              # Exact spelling not on the page: fall back to the spaced variants.
              alt_codes = get_opt_pattern(code)
              if isinstance(alt_codes, str):
                # A bare string would otherwise be iterated character by character.
                alt_codes = [alt_codes]
              candidates = [(alt_code, page.search_for(alt_code)) for alt_code in alt_codes]
            # Each matched spelling is annotated and logged exactly once.
            for term, rects in candidates:
              if rects:
                add_highlights(page, rects)
                cords_file.write("%s\n" % f"Page-{page_num}: {term} : {rects}")

        # highlight keyword
        if page_num in page_keyword_dict:
          for keyword in page_keyword_dict[page_num]:
            hits = page.search_for(keyword)
            if hits:
              add_highlights(page, hits, stroke=[1, 0.8, 0.8])
              cords_file.write("%s\n" % f"Page-{page_num}: {keyword} : {hits}")

    pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)
  finally:
    # Release the document even when searching, annotating or saving fails.
    pdf_file.close()
  return pdf_output_file_name, txt_file_name

In [8]:
def make_icd_10_code_pattern(icd_10_code_df):
  """Turn every row's "ICD-10" value into an entity pattern dictionary.

  Args:
    icd_10_code_df: DataFrame with an "ICD-10" column.

  Returns:
    List with one ``{"label": "ICD-10", "pattern": <value>}`` dict per row,
    in row order.
  """
  return [
    {"label": "ICD-10", "pattern": record["ICD-10"]}
    for _, record in icd_10_code_df.iterrows()
  ]

In [9]:
def make_icd_10_keyword_pattern(icd_10_keyword_df, nlp=None):
  """Build a case-insensitive PhraseMatcher from the "Keyword" column.

  Args:
    icd_10_keyword_df: DataFrame with a "Keyword" column.
    nlp: loaded spaCy pipeline; its vocab and tokenizer are used. Required
      despite the ``None`` default.

  Returns:
    A ``PhraseMatcher`` (matching on the ``LOWER`` attribute) holding all
    keywords under the match key ``'keywords'``.

  Raises:
    ValueError: if ``nlp`` is not supplied.
  """
  if nlp is None:
    raise ValueError("nlp must be a loaded spaCy pipeline, got None")

  # Missing keywords arrive as float NaN after a CSV round-trip and would
  # crash the tokenizer, so they are dropped before tokenizing.
  keywords = [str(keyword) for keyword in icd_10_keyword_df["Keyword"].dropna()]

  phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
  patterns = list(nlp.tokenizer.pipe(keywords))
  phrase_matcher.add('keywords', patterns)
  return phrase_matcher

In [10]:
def search_icd_10_code(txt_list, nlp=None):
  """Collect the entities found on each page text file.

  Pages whose text contains a reference of the form ``P <digits>, L<digits>``
  are skipped entirely. For every other page the text is run through ``nlp``
  and the text of each entity in ``doc.ents`` is collected.

  Args:
    txt_list: paths of per-page text files named ``page-<n>.txt``.
    nlp: callable returning a doc with an ``ents`` attribute (e.g. a spaCy
      pipeline with an entity ruler). Required despite the ``None`` default.

  Returns:
    Dict mapping page number ``<n>`` (int) to the list of entity texts found
    on that page; pages without entities are omitted.
  """
  # Raw string: "\s" in a plain string literal is an invalid escape sequence.
  line_ref_pattern = re.compile(r"(P[ ][0-9]+)(,\s)(L[0-9]+)")

  pdf_page_vocab = {}
  for txt_file in txt_list:
    with open(txt_file, "r") as f:
      page_txt = f.read()
    # filter the page that have line number instead of code
    if line_ref_pattern.search(page_txt):
      continue
    code_list = [ent.text for ent in nlp(page_txt).ents]
    if code_list:
      # Derive <n> from the file name only, so the result does not depend on
      # how many directories precede "page-<n>.txt" in the path.
      stem = os.path.splitext(os.path.basename(txt_file))[0]
      pdf_page_vocab[int(stem.rsplit("-", 1)[-1])] = code_list
  return pdf_page_vocab

In [11]:
def search_icd_10_keyword(txt_list, phrase_matcher, nlp=None):
  """Collect the distinct keyword matches found on each page text file.

  Args:
    txt_list: paths of per-page text files named ``page-<n>.txt``.
    phrase_matcher: callable that takes the doc and returns
      ``(match_id, start, end)`` triples, e.g. a spaCy ``PhraseMatcher``.
    nlp: callable that turns the page text into a doc supporting
      ``doc[start:end]``. Required despite the ``None`` default.

  Returns:
    Dict mapping page number ``<n>`` (int) to the set of matched span texts
    (as written on the page); pages without matches are omitted.
  """
  page_keyword_dict = {}
  for txt_file in txt_list:
    with open(txt_file, "r") as f:
      page_txt = f.read()

    # NOTE(review): a LOWER-attribute PhraseMatcher should only need tokens, so
    # nlp.make_doc(page_txt) would likely be much faster - confirm before switching.
    doc = nlp(page_txt)
    keywords = {f"{doc[start:end]}" for _, start, end in phrase_matcher(doc)}

    if keywords:
      # Derive <n> from the file name only, so the result does not depend on
      # how many directories precede "page-<n>.txt" in the path.
      stem = os.path.splitext(os.path.basename(txt_file))[0]
      page_keyword_dict[int(stem.rsplit("-", 1)[-1])] = keywords
  return page_keyword_dict

## Data preprocessing

In [None]:
# Load the keyword file: one keyword description per line, newline characters removed
with open("icd_10_keywords.txt", "r") as f:
  txt_lines = f.readlines()
columns = [raw_line.strip("\n") for raw_line in txt_lines]
print(columns[:10])

['Cholera due to Vibrio cholerae 01, biovar cholerae', 'Cholera due to Vibrio cholerae 01, biovar eltor', 'Cholera, unspecified', 'Typhoid fever, unspecified', 'Typhoid meningitis', 'Typhoid fever with heart involvement', 'Typhoid pneumonia', 'Typhoid arthritis', 'Typhoid osteomyelitis', 'Typhoid fever with other complications']


In [None]:
# Wrap the keyword list in a single-column DataFrame named "Keyword"
data_keyword_df = pd.DataFrame(columns, columns=["Keyword"])
data_keyword_df.head()

Unnamed: 0,Keyword
0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,"Cholera, unspecified"
3,"Typhoid fever, unspecified"
4,Typhoid meningitis


In [None]:
# Persist the keyword table as CSV without the row index
data_keyword_df.to_csv("icd_10_keywords.csv", index=False)

In [None]:
# Load the ICD-10 code list; its "ICD-10" column is combined with the keywords below
data_code_df = pd.read_csv("icd_10_codes.csv")
data_code_df.head()

Unnamed: 0,ICD-10
0,A00.0
1,A00.1
2,A00.9
3,A01.00
4,A01.01


In [None]:
# Preview the first few codes
data_code_df["ICD-10"].head()

0     A00.0
1     A00.1
2     A00.9
3    A01.00
4    A01.01
Name: ICD-10, dtype: object

In [None]:
# Preview the first few keyword descriptions
data_keyword_df["Keyword"].head()

0    Cholera due to Vibrio cholerae 01, biovar chol...
1      Cholera due to Vibrio cholerae 01, biovar eltor
2                                 Cholera, unspecified
3                           Typhoid fever, unspecified
4                                   Typhoid meningitis
Name: Keyword, dtype: object

In [None]:
# Put each code next to its keyword description in one table ("Code", "Keyword").
# NOTE(review): the two columns are paired by row index, which presumes the code
# file and the keyword file list their entries in the same order and have the
# same length - confirm, otherwise rows are mispaired or left empty.
icd_code_kerword_df = pd.DataFrame().assign(Code=data_code_df['ICD-10'], Keyword=data_keyword_df['Keyword'])
icd_code_kerword_df.head()

Unnamed: 0,Code,Keyword
0,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,A00.9,"Cholera, unspecified"
3,A01.00,"Typhoid fever, unspecified"
4,A01.01,Typhoid meningitis


In [None]:
# Persist the combined code/keyword table as CSV without the row index
icd_code_kerword_df.to_csv("icd_10_code_keywords.csv", index=False)

In [None]:
# Read the saved file back to check that it round-trips
icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")
icd_code_kerword_df.head()

Unnamed: 0,Code,Keyword
0,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,A00.9,"Cholera, unspecified"
3,A01.00,"Typhoid fever, unspecified"
4,A01.01,Typhoid meningitis


## All Steps Together

In [None]:
# Step-1: splitting the PDF into single-page PDF files
pdf_file_name = "Redacted_Sample.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: extracting the text of every page into its own text file
txt_list = extract_text_from_pdf(pdf_list)

In [None]:
# Start from spaCy's English language class (no trained model is loaded here)
nlp_code = English()
# Step-3: adding an entity ruler and loading the ICD-10 code patterns from disk
nlp_code.add_pipe("entity_ruler").from_disk("./icd10_code_patterns.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x7f2ec57080f0>

In [None]:
# Step-4: Searching ICD-10 code
# Result maps page number -> list of codes found on that page
page_code_dict = search_icd_10_code(txt_list, nlp_code)

Page[page-18.txt]: ['M54.40']
Page[page-19.txt]: ['G43.109']
Page[page-26.txt]: ['D17.1', 'F43.9']
Page[page-37.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30', 'E88.2']
Page[page-39.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30']
Page[page-42.txt]: ['G43.109', 'R00.2', 'G43.109', 'D72.819', 'R00.2']
Page[page-74.txt]: ['M54.2', 'R42']
Page[page-82.txt]: ['G43.109', 'R00.2', 'L25.9', 'D72.819']
Page[page-84.txt]: ['G43.109']
Page[page-85.txt]: ['L25.9', 'D72.819']
Page[page-102.txt]: ['G43.109', 'R00.2', 'L25.9', 'Z00.00']
Page[page-105.txt]: ['G43.109', 'R00.2', 'L25.9']


In [None]:
# Step-5: Creating ICD-10 keyword pattern
# Pipeline whose vocab and tokenizer back the keyword matcher
nlp_keyword = spacy.load('en_core_web_sm')
# Combined code/keyword table produced in the data preprocessing section
icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")
# Case-insensitive phrase matcher over the "Keyword" column
phrase_matcher = make_icd_10_keyword_pattern(icd_code_kerword_df, nlp_keyword)

In [None]:
# Step-6: Searching ICD-10 keyword
# Result maps page number -> set of keyword spellings found on that page
page_keyword_dict = search_icd_10_keyword(txt_list, phrase_matcher, nlp_keyword)

Page[page-0.txt]: {'Headache', 'Palpitations', 'Anxiety Disorder', 'Alcohol Use', 'Cerebrovascular Disease', 'alcohol use', 'Cerebral Infarction', 'Aphasia', 'Spleen', 'Migraine', 'Insomnia'}
Page[page-1.txt]: {'Migraine', 'Insomnia'}
Page[page-2.txt]: {'Lipomatosis', 'Unspecified', 'Spleen', 'chronic', 'Chronic', 'Moderate'}
Page[page-3.txt]: {'depression', 'Palpitations', 'palpitations', 'Radiculopathy', 'Sciatica', 'Low back pain', 'Multiple sites', 'Incomplete'}
Page[page-4.txt]: {'Headache', 'wrestling', 'Tobacco Use', 'Lower Abdominal Pain', 'left upper extremity', 'Alcohol Use', 'Wrestling', 'Unspecified', 'Overweight', 'Cervicogenic Headache', 'Aphasia', 'Alcohol use', 'Lifting', 'Insomnia', 'Migraine', 'Surgical Procedure', 'Constipation'}
Page[page-5.txt]: {'Minimal', 'right ear', 'Anxiety Disorder'}
Page[page-6.txt]: {'headache', 'Headache', 'wrestling', 'migraine', 'Multiple Sclerosis', 'not intractable', 'left upper extremity', 'Acute', 'Occipital Neuralgia', 'Cerebrovascu

In [None]:
# Step-7: Highlighting ICD-10 code and keyword into pdf
# The function returns two paths (annotated PDF, coordinates file); unpack both so
# the messages show file names instead of a tuple.
output_file_name, cords_file_name = highlight_icd10_code_and_keyword(page_code_dict, page_keyword_dict, pdf_file_name)
print(f"File[{output_file_name}] is saved after highlighting ICD-10 code and keyword")
print(f"Highlighted coordinates are saved into [{cords_file_name}] file.")

## Highlight Multiple Files

In [12]:
!mkdir synodex_ocr_pdf_files

In [13]:
# Step-0: Load prerequisite instance
nlp_code = English()
nlp_keyword = spacy.load('en_core_web_sm')

# Load icd_10_code_keywords.csv file
icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")

# loading and updating patterns to Spacy
nlp_code.add_pipe("entity_ruler").from_disk("./icd10_code_patterns.jsonl")

# Creating ICD-10 keyword pattern
phrase_matcher = make_icd_10_keyword_pattern(icd_code_kerword_df, nlp_keyword)

In [15]:
%%time

ocr_pdf_files_path = "synodex_ocr_pdf_files"
for pdf_file in os.listdir(ocr_pdf_files_path):
  pdf_file_name = f"{ocr_pdf_files_path}/{pdf_file}"
  # Step-1: splitting pdf file
  pdf_list = split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  page_code_dict = search_icd_10_code(txt_list, nlp_code)

  # Step-4: Searching ICD-10 keyword
  page_keyword_dict = search_icd_10_keyword(txt_list, phrase_matcher, nlp_keyword)

  # Step-7: Highlighting ICD-10 code and keyword into pdf
  pdf_output, txt_output = highlight_icd10_code_and_keyword(page_code_dict, page_keyword_dict, pdf_file_name)
  print(f"File[{pdf_output}] is saved after highlighting ICD-10 code and keyword")
  print(f"Highlighted coordinates are saved into [{txt_output}] file.")

File[synodex_ocr_pdf_files/Redacted_Sample_output.pdf] is saved after highlighting ICD-10 code and keyword
Highlighted coordinates are saved into [synodex_ocr_pdf_files/Redacted_Sample_cords.txt] file.
File[synodex_ocr_pdf_files/Report_equitable_SampleAPSforSummarization1_Redactedv5_output.pdf] is saved after highlighting ICD-10 code and keyword
Highlighted coordinates are saved into [synodex_ocr_pdf_files/Report_equitable_SampleAPSforSummarization1_Redactedv5_cords.txt] file.
CPU times: user 1min 40s, sys: 1.74 s, total: 1min 42s
Wall time: 1min 44s
