<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/13_code_keyword_correct_and_highlight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

In [1]:
import pandas as pd
import re
import os
import glob
import difflib
import pickle

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

from keyword_extractor import *
from concurrent import futures

In [4]:
!mkdir -p pdf-files
!mkdir -p txt-files
!mkdir -p ocr-pdf-files

In [2]:
# Upper bound on the pool size used by the concurrent keyword helpers
MAX_WORKERS = 20

# Working folders; they must already exist before the functions below are called
ocr_pdf_files_path = "ocr-pdf-files"  # PDFs to process
txt_files_path = "txt-files"          # one text file per page
pdf_files_path = "pdf-files"          # one single-page PDF per page

In [19]:
def split_pdf(pdf_path):
  """Split a PDF into single-page PDF files inside ``pdf_files_path``.

  Args:
    pdf_path: Path of the PDF to split.

  Returns:
    List of the generated file names (``page-<n>.pdf``, zero-based), in page
    order. The names are relative to ``pdf_files_path``.
  """
  pdf_list = []
  # Keep the source open while pages are copied out of it; the context manager
  # guarantees the handle is released afterwards (it used to be leaked).
  with open(pdf_path, "rb") as pdf_in_file:
    # One reader serves every page; re-parsing the file per page is unnecessary.
    pdf = PdfFileReader(pdf_in_file)
    for page in range(pdf.numPages):
      output = PdfFileWriter()
      output.addPage(pdf.getPage(page))
      page_file_name = f"page-{page}.pdf"
      with open(f"{pdf_files_path}/{page_file_name}", "wb") as outputStream:
        output.write(outputStream)
      pdf_list.append(page_file_name)
  return pdf_list


def extract_text_from_pdf(pdf_list):
    """Extract the text of each PDF in ``pdf_list`` into its own text file.

    Args:
        pdf_list: File names located in ``pdf_files_path``.

    Returns:
        List of written text file paths (``<txt_files_path>/page-<i>.txt``),
        where ``i`` is the position of the PDF in ``pdf_list``.
    """
    txt_file_list = []
    for i, pdf_file in enumerate(pdf_list):
        with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
            pdf = pdftotext.PDF(f)

        # Read all the text into one string
        pdf_text = "\n\n".join(pdf)

        txt_file_path = f"{txt_files_path}/page-{i}.txt"
        # Write mode (not append): a stale page-<i>.txt left behind by an
        # interrupted run must be replaced, not extended with duplicate text.
        with open(txt_file_path, "w") as f:
            f.write(pdf_text)
        txt_file_list.append(txt_file_path)
    return txt_file_list


def get_opt_pattern(icd_10_code):
  """Build alternate spellings of a code with spaces around its first dot.

  Args:
    icd_10_code: Code string, e.g. ``"E11.9"``.

  Returns:
    A list of patterns. For a dotted code these are the three variants with a
    space after, before, and on both sides of the first dot. For a code
    without a dot the list holds just the code itself, so callers can always
    iterate the result pattern by pattern (a bare string would be iterated
    character by character).
  """
  head, dot, tail = icd_10_code.partition(".")
  if not dot:
    return [icd_10_code]
  # Everything after the first dot is kept in the tail.
  return [
    f"{head}. {tail}",
    f"{head} .{tail}",
    f"{head} . {tail}",
  ]


def isExactMatch(page, term, clip, fullMatch=False, caseSensitive=False):
  """Check whether ``term`` really occurs in the text around a search hit.

  Args:
    page: Page object providing ``get_text("blocks", clip=..., flags=...)``.
    term: The literal text that was searched for.
    clip: One hit rectangle returned by the page search; it must expose
      ``height``/``width`` and support ``clip + (dx0, dy0, dx1, dy1)``.
    fullMatch: When True the term must not be glued to a word character on
      either side (whole-word match).
    caseSensitive: When False (default) the comparison ignores case.

  Returns:
    True if the term is found in the text blocks of the padded clip area,
    otherwise False (also for an empty term or when no text is returned).
  """
  if not term:
    # Nothing to look for; also avoids dividing by a zero-length term.
    return False

  # Pad the clip by twice the average per-character extent of the hit.
  termLen = len(term)
  termBboxLen = max(clip.height, clip.width)
  termfontSize = termBboxLen / termLen
  f = termfontSize * 2

  blocks = page.get_text("blocks", clip=clip + (-f, -f, f, f), flags=0)
  if not blocks:
    return False
  # Index 4 of a block tuple is its text; the padded area can intersect
  # several blocks, so all of them are considered.
  validate = "\n".join(block[4] for block in blocks)

  flag = 0 if caseSensitive else re.IGNORECASE

  # The term is literal text, not a regex: "." in a code must match only ".".
  pattern = re.escape(term)
  if fullMatch:
    # Same as \b for terms that start/end with a word character, but it also
    # works when the term starts or ends with punctuation.
    pattern = rf"(?<!\w){pattern}(?!\w)"
  return re.search(pattern, validate, flags=flag) is not None


def highlight_icd_code_and_keyword(icd10_code_dict, icd9_code_dict, page_keyword_dict=None, pdf_file_name=None, cords_file_name=None):
  """Highlight ICD codes and keywords in a PDF and log the highlighted areas.

  For every page, each code listed for that page is searched on the page and
  all hits are highlighted. If a code has no hit, its alternate spellings
  from ``get_opt_pattern`` are searched and highlighted instead. Keywords are
  searched and highlighted the same way (without alternates).

  Args:
    icd10_code_dict: Maps zero-based page index -> iterable of ICD-10 codes
      (highlighted with the default highlight colour).
    icd9_code_dict: Maps zero-based page index -> iterable of ICD-9 codes
      (highlighted with stroke colour ``[1, 0.5, 0.8]``).
    page_keyword_dict: Optional map of page index -> iterable of keywords
      (highlighted with stroke colour ``[1, 0.8, 0.8]``).
    pdf_file_name: Path of the PDF to annotate.
    cords_file_name: Path of the text log; lines are appended to it.

  Returns:
    Tuple ``(pdf_output_file_name, cords_file_name)`` where the output PDF is
    the input path with its extension replaced by ``_output.pdf``.

  Each log line has the form ``Page-<n>: <searched text> : <rectangles>``
  and lists every highlighted rectangle exactly once. A line with an empty
  list records a searched text that was not found on the page.
  """
  icd9_stroke = [1, 0.5, 0.8]     # light red (r, g, b)
  keyword_stroke = [1, 0.8, 0.8]

  def mark(page, page_num, label, rects, stroke, cords_file):
    """Highlight every rectangle and append one coordinate line to the log."""
    for rect in rects:
      annot = page.add_highlight_annot(rect)
      if stroke is not None:
        annot.set_colors(stroke=stroke)
      annot.update()
    cords_file.write(f"Page-{page_num}: {label} : {list(rects)}\n")

  def mark_codes(page, page_num, codes, stroke, cords_file):
    """Highlight each code, falling back to its alternate spellings."""
    for code in codes:
      rects = page.search_for(code)
      if rects:
        mark(page, page_num, code, rects, stroke, cords_file)
        continue
      alternates = get_opt_pattern(code)
      if isinstance(alternates, str):
        # A bare string must be treated as one pattern, not as characters.
        alternates = [alternates]
      for alt_code in alternates:
        mark(page, page_num, alt_code, page.search_for(alt_code), stroke, cords_file)

  pdf_file = fitz.open(pdf_file_name)
  try:
    # Context manager: the log is flushed and closed even if a page fails.
    with open(cords_file_name, "a") as cords_file:
      for page_num, page in enumerate(pdf_file):
        # highlight ICD-10 code
        if page_num in icd10_code_dict:
          mark_codes(page, page_num, icd10_code_dict[page_num], None, cords_file)

        # highlight ICD-9 code
        if page_num in icd9_code_dict:
          mark_codes(page, page_num, icd9_code_dict[page_num], icd9_stroke, cords_file)

        # highlight ICD-10 keyword
        if page_keyword_dict is not None and page_num in page_keyword_dict:
          for keyword in page_keyword_dict[page_num]:
            mark(page, page_num, keyword, page.search_for(keyword), keyword_stroke, cords_file)

    # splitext only strips the final extension, so dots elsewhere in the path
    # (e.g. "./dir/file.v2.pdf") do not truncate the output name.
    pdf_output_file_name = f"{os.path.splitext(pdf_file_name)[0]}_output.pdf"
    pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)
  finally:
    pdf_file.close()

  return pdf_output_file_name, cords_file_name


def filter_unwanted_code(code_list, page_text):
    """Keep only the codes that appear inside an "ICD-<digit>...-..." mention.

    The page text is scanned for mentions shaped like ``IC?-<digit><letters>-<text>``
    followed by a space (for example ``ICD-9-CM 250.00 ``). A code is kept
    when it is a substring of at least one such mention.

    Args:
        code_list: Candidate code strings.
        page_text: Full text of the page.

    Returns:
        List of the kept codes, each listed once, in order of first match.
    """
    # Raw string: "\-" is not a valid escape in a normal string literal.
    # The character classes are left as they were ("A-z" also admits a few
    # punctuation characters) so the set of recognised mentions is unchanged.
    match_list = re.findall(r"(IC[(A-z)]-[0-9][a-zA-z]*\-.+)[ ]", page_text)

    filtered_code_list = []
    for found_code in match_list:
        for code in code_list:
            # A code seen in several mentions is still reported only once.
            if code in found_code and code not in filtered_code_list:
                filtered_code_list.append(code)
    return filtered_code_list


def search_icd_code(txt_list, nlp, code_type):
    """Collect the codes recognised on each page text file.

    Args:
        txt_list: Paths of page text files named ``page-<n>.txt``.
        nlp: Callable returning an object whose ``ents`` items have ``.text``.
        code_type: ``"ICD-9"`` additionally passes the recognised codes
            through ``filter_unwanted_code``; any other value keeps them as is.

    Returns:
        Dict mapping page number ``<n>`` (taken from the file name) to the
        list of code strings for that page. Pages whose text contains a
        "P <num>, L<num>" line reference, and pages on which nothing was
        recognised, get no entry.
    """
    line_reference = re.compile(r"(P[ ][0-9]+)(,\s)(L[0-9]+)")

    pdf_page_vocab = {}
    for txt_file in txt_list:
        with open(txt_file, "r") as f:
            page_txt = f.read()

        # filter the page that have line number instead of code
        if line_reference.search(page_txt):
            continue

        code_list = [ent.text for ent in nlp(page_txt).ents]
        if not code_list:
            # Nothing recognised: do not write an entry (an empty result used
            # to be stored under page 0 and overwrote that page's codes).
            continue

        if code_type == "ICD-9":
            code_list = filter_unwanted_code(code_list, page_txt)

        # Parse the page number from the file name itself so the directory
        # part of the path (depth, dots, dashes) cannot interfere.
        page_stem = os.path.splitext(os.path.basename(txt_file))[0]
        page_number = int(page_stem.split("-")[1])
        pdf_page_vocab[page_number] = code_list

    return pdf_page_vocab


def get_json_array_list(text_path):
  """Run ``call`` on ``text_path`` and return its result, printing progress.

  A message is printed before the call and another once it has returned.
  """
  print(f"Running '{text_path}'")
  extracted = call(text_path)
  print(f"Got json for '{text_path}'")
  return extracted


def get_wrong_keyword_dict_with_thread(text_path_list):
  """Thread-pool variant of ``get_wrong_keyword_dict``.

  Args:
    text_path_list: Text file paths, in page order.

  Returns:
    Dict mapping the position of a path in ``text_path_list`` to the set of
    first values of the elements returned by ``get_json_array_list`` for that
    path. Positions with no elements are omitted. An empty input gives ``{}``.
  """
  text_path_list = list(text_path_list)
  if not text_path_list:
    # A pool cannot be created with zero workers.
    return {}

  # take care so that unnecessary thread should not be created
  workers = min(MAX_WORKERS, len(text_path_list))
  with futures.ThreadPoolExecutor(max_workers=workers) as executor:
    # map() yields results in input order. The list is deliberately not
    # re-sorted: a lexicographic sort puts "page-10" before "page-2", which
    # would attach keywords to the wrong index.
    json_arr_list = list(executor.map(get_json_array_list, text_path_list))

  wrong_keyword_dict = {}
  for idx, json_arr in enumerate(json_arr_list):
    wrong_keyword_list = [list(element.values())[0] for element in json_arr]
    if wrong_keyword_list:
      wrong_keyword_dict[idx] = set(wrong_keyword_list)
  return wrong_keyword_dict

def get_wrong_keyword_dict_with_process(text_path_list):
  """Process-pool variant of ``get_wrong_keyword_dict``.

  Args:
    text_path_list: Text file paths, in page order.

  Returns:
    Dict mapping the position of a path in ``text_path_list`` to the set of
    first values of the elements returned by ``get_json_array_list`` for that
    path. Positions with no elements are omitted. An empty input gives ``{}``.
  """
  text_path_list = list(text_path_list)
  if not text_path_list:
    return {}

  # At most 4 processes (the cap this function has always used), and never
  # more than there are files to handle.
  workers = min(4, len(text_path_list))
  with futures.ProcessPoolExecutor(max_workers=workers) as executor:
    # map() yields results in input order. The list is deliberately not
    # re-sorted: a lexicographic sort puts "page-10" before "page-2", which
    # would attach keywords to the wrong index.
    json_arr_list = list(executor.map(get_json_array_list, text_path_list))

  wrong_keyword_dict = {}
  for idx, json_arr in enumerate(json_arr_list):
    wrong_keyword_list = [list(element.values())[0] for element in json_arr]
    if wrong_keyword_list:
      wrong_keyword_dict[idx] = set(wrong_keyword_list)
  return wrong_keyword_dict


def get_wrong_keyword_dict(text_path_list):
  """Sequentially gather the keyword set for each text file.

  Each path is handed to ``get_json_array_list``; the first value of every
  returned element is collected into a set. The result maps the position of
  the path in ``text_path_list`` to that set and leaves out positions whose
  set would be empty.
  """
  keywords_by_position = {}
  for position, path in enumerate(text_path_list):
    print(position, path)
    entries = get_json_array_list(path)
    keywords = {list(entry.values())[0] for entry in entries}
    if keywords:
      keywords_by_position[position] = keywords
  return keywords_by_position


def purge(file_path):
  """Delete every file that matches the glob pattern ``file_path``."""
  matched_paths = glob.glob(file_path)
  for matched_path in matched_paths:
    os.remove(matched_path)

In [4]:
# Step-0: Load the NLP pipelines used by the processing loop
# Pretrained small English pipeline (not referenced by the other cells visible in this notebook)
nlp_keyword = spacy.load('en_core_web_sm')

# Blank English pipeline with an entity ruler whose patterns are read from the
# ICD-10 pattern file; presumably its matches are what shows up in `doc.ents`
nlp_code10 = English()
nlp_code10.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v3.jsonl")

# Same set-up with the ICD-9 pattern file. This is the last expression of the
# cell, so the value returned by from_disk() is what the cell displays.
nlp_code9 = English()
nlp_code9.add_pipe("entity_ruler").from_disk("./icd9_code_patterns-v1.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x7f80a495adc0>

In [21]:
# Process every PDF in the OCR folder: split into pages, extract text, find
# codes and keywords, then write a highlighted copy plus a coordinate log.
for pdf_file in sorted(os.listdir(ocr_pdf_files_path)):
  # The coordinate logs are written into this same folder, so anything that is
  # not a PDF must be skipped rather than handed to the PDF reader.
  # NOTE(review): highlighted "*_output.pdf" files from an earlier run also live
  # here and are still treated as inputs; clear them first if that is unwanted.
  if not pdf_file.lower().endswith(".pdf"):
    continue

  pdf_file_name = f"{ocr_pdf_files_path}/{pdf_file}"
  # splitext strips only the final extension, so other dots in the name survive
  cords_file_name = f"{os.path.splitext(pdf_file_name)[0]}_cords.txt"

  try:
    # Step-1: splitting pdf file
    pdf_list = split_pdf(pdf_file_name)

    # Step-2: Extracting text from pdf
    txt_list = extract_text_from_pdf(pdf_list)

    # Step-3: Searching ICD-10 code
    icd10_code_dict = search_icd_code(txt_list, nlp_code10, code_type="ICD-10")

    # Step-4: Searching ICD-9 code
    page_code9_dict = search_icd_code(txt_list, nlp_code9, code_type="ICD-9")

    # Step-5: Searching ICD-10 keyword
    wrong_keyword_dict = get_wrong_keyword_dict(txt_list)

    # Step-6: Highlighting ICD-10 code and keyword into pdf
    pdf_output_file, txt_output_file = highlight_icd_code_and_keyword(icd10_code_dict, page_code9_dict, wrong_keyword_dict,
                                                                      pdf_file_name, cords_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code and keyword")
    print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")
  finally:
    # Always remove the per-page files, even when a step fails, so the next
    # document never picks up pages left over from this one.
    purge(f"{pdf_files_path}/*.pdf")
    purge(f"{txt_files_path}/*.txt")

0 txt-files/page-0.txt
Running 'txt-files/page-0.txt'
Got json for 'txt-files/page-0.txt'
1 txt-files/page-1.txt
Running 'txt-files/page-1.txt'
Got json for 'txt-files/page-1.txt'
2 txt-files/page-2.txt
Running 'txt-files/page-2.txt'
Got json for 'txt-files/page-2.txt'
3 txt-files/page-3.txt
Running 'txt-files/page-3.txt'
Got json for 'txt-files/page-3.txt'
4 txt-files/page-4.txt
Running 'txt-files/page-4.txt'
Got json for 'txt-files/page-4.txt'
File[ocr-pdf-files/9929_final_output.pdf] is saved after highlighting ICD-10 code and keyword
Highlighted coordinates are saved into [ocr-pdf-files/9929_final_cords.txt] file.


In [20]:
# Destructive reset: removes the OCR folder with everything in it, then recreates it empty
!rm -rf ocr-pdf-files
!mkdir -p ocr-pdf-files

In [27]:
# Clear the coordinate logs, highlighted PDFs and per-page temporary files
for stale_pattern in (
    "ocr-pdf-files/*.txt",
    "ocr-pdf-files/*_output.pdf",
    "pdf-files/*.pdf",
    "txt-files/*.txt",
):
  purge(stale_pattern)

In [None]:
# Bundle coordinate logs and highlighted PDFs into output.zip.
# NOTE(review): these globs only match files whose name already contains "_output"
# before the generated suffix (e.g. X_output_cords.txt, X_output_output.pdf). The run
# recorded in this notebook produced 9929_final_cords.txt and 9929_final_output.pdf,
# which these patterns would not match. Confirm the intended input naming.
!zip output.zip ocr-pdf-files/*_output_cords.txt ocr-pdf-files/*_output_output.pdf