<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/13_code_keyword_sentence_extraction_and_highlight_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

In [None]:
import pandas as pd
import re
import os
import glob
import difflib
import pickle

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

from keyword_extractor import call
from concurrent import futures

In [2]:
!mkdir -p pdf-files
!mkdir -p txt-files
!mkdir -p ocr-pdf-files

In [3]:
# Working directories (created by the `mkdir -p` cell above).
# pdf_files_path: where split_pdf writes the single-page PDFs.
pdf_files_path = "pdf-files"
# txt_files_path: where extract_text_from_pdf writes one text file per page.
txt_files_path = "txt-files"
# ocr_pdf_files_path: input directory iterated by the batch highlighting loop.
ocr_pdf_files_path = "ocr-pdf-files"

# Upper bound on the pool size used by get_wrong_keyword_dict_with_thread.
MAX_WORKERS = 20

## Core Functions

In [32]:
def split_pdf(pdf_path):
  """Split a PDF into single-page PDF files stored under ``pdf_files_path``.

  Args:
    pdf_path: Path of the PDF document to split.

  Returns:
    The generated file names (``page-<n>.pdf``, zero-based, without the
    directory part) in page order.
  """
  pdf_list = []
  # The context manager releases the input handle even if writing a page fails.
  with open(pdf_path, "rb") as pdf_in_file:
    # Parse the document once and reuse the reader for every page.
    reader = PdfFileReader(pdf_in_file)
    for page in range(reader.numPages):
      writer = PdfFileWriter()
      writer.addPage(reader.getPage(page))
      page_file_name = f"page-{page}.pdf"
      with open(f"{pdf_files_path}/{page_file_name}", "wb") as output_stream:
        writer.write(output_stream)
      pdf_list.append(page_file_name)
  return pdf_list


def extract_text_from_pdf(pdf_list):
    """Extract the text of each single-page PDF into its own text file.

    Args:
        pdf_list: PDF file names located under ``pdf_files_path``, in page order.

    Returns:
        Paths of the written text files (``<txt_files_path>/page-<i>.txt``),
        where ``i`` is the position of the PDF inside ``pdf_list``.
    """
    txt_file_list = []
    for index, pdf_file in enumerate(pdf_list):
        with open(os.path.join(pdf_files_path, pdf_file), "rb") as pdf_handle:
            pdf = pdftotext.PDF(pdf_handle)

        # Read all the text into one string
        pdf_text = "\n\n".join(pdf)

        txt_file_path = f"{txt_files_path}/page-{index}.txt"
        # "w" rather than "a": a leftover file from an earlier run must be
        # replaced, otherwise the page text would be duplicated.
        with open(txt_file_path, "w") as txt_handle:
            txt_handle.write(pdf_text)
        txt_file_list.append(txt_file_path)
    return txt_file_list


def get_opt_pattern(icd_10_code):
  """Build whitespace-tolerant spellings of a dotted code.

  The alternates cover a space after, before, or on both sides of the dot
  (``"E11. 9"``, ``"E11 .9"``, ``"E11 . 9"``). Only the first two
  dot-separated parts of the code are used.

  Args:
    icd_10_code: Code string such as ``"E11.9"``.

  Returns:
    A list of alternate spellings; for a code without a dot, a one-element
    list holding the code itself.
  """
  code_arr = icd_10_code.split(".")
  if len(code_arr) > 1:
    head, tail = code_arr[0], code_arr[1]
    return [f"{head}. {tail}", f"{head} .{tail}", f"{head} . {tail}"]
  # Always hand back a list: callers iterate the result, and iterating a bare
  # string would yield its individual characters.
  return [icd_10_code]


def isExactMatch(page, term, clip, fullMatch=False, caseSensitive=False):
  """Check whether the text block around a search hit contains ``term``.

  Args:
    page: Object exposing ``get_text("blocks", clip=..., flags=...)``.
    term: Literal text to look for (it is regex-escaped before matching).
    clip: Hit rectangle; must expose ``height`` and ``width`` and support
      ``clip + (dx0, dy0, dx1, dy1)``. Presumably an item returned by
      ``page.search_for(term)`` -- confirm against the caller.
    fullMatch: When true, ``term`` must be delimited by word boundaries.
    caseSensitive: When false (default), letter case is ignored.

  Returns:
    True if the first text block inside the enlarged clip contains ``term``,
    otherwise False (also for an empty term or when no block is found).
  """
  if not term:
    # Nothing to match, and the per-character size below would divide by zero.
    return False

  # Approximate per-character size of the hit, used to grow the clip a little
  # so the surrounding text block is captured.
  term_bbox_len = max(clip.height, clip.width)
  margin = (term_bbox_len / len(term)) * 2

  blocks = page.get_text("blocks", clip=clip + (-margin, -margin, margin, margin), flags=0)
  if not blocks:
    return False
  validate = blocks[0][4]

  flags = 0 if caseSensitive else re.IGNORECASE
  # Escape the term so characters such as "(" or "." are matched literally.
  pattern = re.escape(term)
  if fullMatch:
    pattern = rf"\b{pattern}\b"
  # The case flag applies to both the plain and the whole-word search.
  return re.search(pattern, validate, flags=flags) is not None

def highlight_icd_code_and_keyword(icd10_code_dict,
                                   icd9_code_dict=None,
                                   icd_keywords_dict=None,
                                   pdf_file_name=None,
                                   cords_file_name=None):
  """Highlight ICD codes and key-phrase sentences in a PDF and log the hits.

  Args:
    icd10_code_dict: ``{page_index: [code, ...]}`` of ICD-10 codes; highlighted
      with the default annotation colour.
    icd9_code_dict: Optional ``{page_index: [code, ...]}`` of ICD-9 codes;
      highlighted in light red.
    icd_keywords_dict: Optional ``{page_index: {key_phrase: [sentence, ...]}}``;
      each sentence found on the page is highlighted in pale red.
    pdf_file_name: Path of the PDF to annotate.
    cords_file_name: Path of the log file; hit lines are appended to it.

  Returns:
    ``(pdf_output_file_name, cords_file_name)`` where the first item is the
    input path with its extension replaced by ``_output.pdf``.
  """
  # Missing dictionaries simply mean "nothing to highlight of that kind".
  icd10_code_dict = icd10_code_dict or {}
  icd9_code_dict = icd9_code_dict or {}
  icd_keywords_dict = icd_keywords_dict or {}

  icd9_stroke = [1, 0.5, 0.8]     # light red color (r, g, b)
  keyword_stroke = [1, 0.8, 0.8]  # pale red color (r, g, b)

  def annotate(page, rects, stroke=None):
    # Add one highlight annotation per hit rectangle.
    for rect in rects:
      annot = page.add_highlight_annot(rect)
      if stroke is not None:
        annot.set_colors(stroke=stroke)
      annot.update()

  def highlight_codes(page, page_num, codes, stroke, cords_file):
    # Highlight each code, falling back to its spaced spellings when the
    # exact spelling is not present on the page.
    for code in codes:
      rects = page.search_for(code)
      if rects:
        annotate(page, rects, stroke)
        cords_file.write(f"Page-{page_num + 1} | {code}\n")
        continue

      alternates = get_opt_pattern(code)
      if isinstance(alternates, str):
        # Guard against a bare string, which would be iterated per character.
        alternates = [alternates]
      for alt_code in alternates:
        rects = page.search_for(alt_code)
        if rects:
          # Each alternate is annotated exactly once.
          annotate(page, rects, stroke)
          cords_file.write(f"Page-{page_num + 1} | {alt_code}\n")

  # splitext only strips the final extension, so dots in directory or file
  # names do not truncate the output path.
  pdf_output_file_name = f"{os.path.splitext(pdf_file_name)[0]}_output.pdf"

  pdf_file = fitz.open(pdf_file_name)
  try:
    with open(cords_file_name, "a") as cords_file:
      for page_num, page in enumerate(pdf_file):
        # highlight ICD-10 code
        highlight_codes(page, page_num, icd10_code_dict.get(page_num, ()), None, cords_file)

        # highlight ICD-9 code
        highlight_codes(page, page_num, icd9_code_dict.get(page_num, ()), icd9_stroke, cords_file)

        # highlight ICD key phrase
        for key_phrase, key_phrase_sents in icd_keywords_dict.get(page_num, {}).items():
          last_found_sent = None
          for key_phrase_sent in key_phrase_sents:
            rects = page.search_for(key_phrase_sent)
            if rects:
              annotate(page, rects, keyword_stroke)
              last_found_sent = key_phrase_sent

          # One log line per key phrase, naming the last sentence that was found.
          if last_found_sent is not None:
            cords_file.write(f"Page-{page_num + 1} | {key_phrase} | {last_found_sent}\n")

    pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)
  finally:
    pdf_file.close()

  return pdf_output_file_name, cords_file_name


def filter_unwanted_code(code_list, page_text):
    """Keep only the codes that occur inside an ``ICx-<digit>...-...`` mention.

    A mention is text such as ``ICD-9-CM 250.00`` followed by a space. Every
    code from ``code_list`` that is a substring of a mention is kept, once per
    mention that contains it.

    Args:
        code_list: Candidate code strings.
        page_text: Full text of the page.

    Returns:
        The retained codes, ordered by mention and then by ``code_list`` order.
    """
    # Raw string: "\-" in a normal literal is an invalid escape sequence.
    # Explicit A-Za-z ranges: "A-z" would also accept the punctuation that
    # sits between "Z" and "a" in ASCII.
    mentions = re.findall(r"(IC[A-Za-z]-[0-9][a-zA-Z]*-.+)[ ]", page_text)
    return [code for mention in mentions for code in code_list if code in mention]


def search_icd_code(txt_list, nlp, code_type):
    """Collect the codes recognised on each page text file.

    Args:
        txt_list: Paths of page text files named ``...-<page_number>.<ext>``.
        nlp: Callable that takes the page text and returns an object whose
            ``ents`` items expose ``text`` (e.g. a pipeline with an entity ruler).
        code_type: ``"ICD-9"`` additionally filters the codes through
            ``filter_unwanted_code``; any other value keeps all entities.

    Returns:
        ``{page_number: [code, ...]}`` for the pages that yielded codes.
    """
    pdf_page_vocab = {}
    for txt_file in txt_list:
        with open(txt_file, "r") as f:
            page_txt = f.read()

        # filter the page that have line number instead of code
        if re.search(r"(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
            continue

        code_list = [ent.text for ent in nlp(page_txt).ents]

        # filter the page that dont have ICD string into it
        if code_type == "ICD-9":
            code_list = filter_unwanted_code(code_list, page_txt)

        if not code_list:
            # Nothing to record; in particular never write an empty list under
            # a placeholder page number, which would clobber page 0.
            continue

        # Derive the page number from the file name only, so the directory
        # depth of the path does not matter.
        page_stem = os.path.splitext(os.path.basename(txt_file))[0]
        page_number = int(page_stem.rpartition("-")[2])
        pdf_page_vocab[page_number] = code_list

    return pdf_page_vocab


def get_json_array_list(text_path):
  """Run the keyword extractor on one text file, tolerating failures.

  Args:
    text_path: Path handed to ``call``.

  Returns:
    Whatever ``call`` returns, or None when it raised (the error is printed).
  """
  try:
    return call(text_path)
  except Exception as err:
    print(f"Error for file[{text_path}] is:\n{err}")
    return None


def get_wrong_keyword_dict_with_thread(text_path_list):
  """Extract keywords for every page text file using a thread pool.

  Args:
    text_path_list: Page text file paths, in page order.

  Returns:
    ``{index: set_of_keywords}`` where ``index`` is the position of the file
    in ``text_path_list``; pages without keywords, or whose extraction failed,
    are omitted.
  """
  if not text_path_list:
    # ThreadPoolExecutor rejects max_workers=0.
    return {}

  # take care so that unnecessary thread should not be created
  workers = min(MAX_WORKERS, len(text_path_list))
  with futures.ThreadPoolExecutor(max_workers=workers) as executor:
    # map() yields results in input order. The list is deliberately not
    # sorted: lexicographic order puts "page-10" before "page-2", which would
    # detach the index from the page it belongs to.
    json_arr_list = list(executor.map(get_json_array_list, text_path_list))

  wrong_keyword_dict = {}
  for idx, json_arr in enumerate(json_arr_list):
    if not json_arr:
      # None (extraction failed) or empty: nothing to record for this page.
      continue
    wrong_keyword_dict[idx] = {list(element.values())[0] for element in json_arr}
  return wrong_keyword_dict

def get_wrong_keyword_dict_with_process(text_path_list):
  """Extract keywords for every page text file using a process pool.

  Args:
    text_path_list: Page text file paths, in page order.

  Returns:
    ``{index: set_of_keywords}`` where ``index`` is the position of the file
    in ``text_path_list``; pages without keywords, or whose extraction failed,
    are omitted.
  """
  if not text_path_list:
    # ProcessPoolExecutor rejects max_workers=0.
    return {}

  # At most 4 worker processes, and never more than there are files.
  max_processes = 4
  workers = min(max_processes, len(text_path_list))
  with futures.ProcessPoolExecutor(max_workers=workers) as executor:
    # map() yields results in input order. The list is deliberately not
    # sorted: lexicographic order puts "page-10" before "page-2", which would
    # detach the index from the page it belongs to.
    json_arr_list = list(executor.map(get_json_array_list, text_path_list))

  wrong_keyword_dict = {}
  for idx, json_arr in enumerate(json_arr_list):
    if not json_arr:
      # None (extraction failed) or empty: nothing to record for this page.
      continue
    wrong_keyword_dict[idx] = {list(element.values())[0] for element in json_arr}
  return wrong_keyword_dict


def get_wrong_keyword_dict(text_path_list):
  """Extract keywords for every page text file, one file at a time.

  Args:
    text_path_list: Page text file paths, in page order.

  Returns:
    ``{index: set_of_keywords}`` where ``index`` is the position of the file
    in ``text_path_list``; pages without keywords, or whose extraction failed,
    are omitted.
  """
  wrong_keyword_dict = {}
  for idx, file_path in enumerate(text_path_list):
    json_arr = get_json_array_list(file_path)
    if not json_arr:
      # Extraction failed (None) or found nothing. Skip the page instead of
      # recording the keyword list left over from the previous page.
      continue
    wrong_keyword_dict[idx] = {list(element.values())[0] for element in json_arr}
  return wrong_keyword_dict


def extract_sentence2(wrong_kerword_list, sample_text_list):
  """Find, for each key phrase, the sentence-like spans that contain it.

  A span runs from the start of the line holding the phrase (matched
  case-insensitively) up to and including the next full stop; newlines inside
  the span are removed.

  Args:
    wrong_kerword_list: ``{page_index: iterable_of_key_phrases}``.
    sample_text_list: Page text file paths, indexed by ``page_index``.

  Returns:
    ``{key_phrase: [span, ...]}`` for the phrases that were found. A phrase
    found on several pages keeps the spans of the last page processed.
  """
  match_dicts = {}
  for key, kerword_set in wrong_kerword_list.items():
    if not kerword_set:
      continue
    # The page text does not depend on the phrase, so read it once per page.
    with open(sample_text_list[key], "r") as f:
      file_txt = f.read()

    for key_phrase in kerword_set:
      # Escape the phrase so "(", ".", "+" etc. are matched literally, and
      # pass IGNORECASE as a flag: an inline "(?i)" in the middle of a pattern
      # is rejected by Python 3.11+.
      pattern = rf"([^\n]*?{re.escape(key_phrase)}[^.]*\.)"
      match_list = re.findall(pattern, file_txt, flags=re.IGNORECASE)
      if match_list:
        match_dicts[key_phrase] = [match.replace("\n", "") for match in match_list]
  return match_dicts


def extract_sentence(wrong_kerword_list, sample_text_list):
  """Find, for each page and key phrase, the text lines containing the phrase.

  Two log files are (re)written in the current directory:
  ``icd_keyword_match.txt`` gets one entry per page/phrase pair and
  ``icd_keyword_found.txt`` one entry per page with the matches found.

  Args:
    wrong_kerword_list: ``{page_index: iterable_of_key_phrases}``.
    sample_text_list: Page text file paths, indexed by ``page_index``.

  Returns:
    ``{page_index: {key_phrase: [line, ...]}}``; a page without any match maps
    to an empty dict. Matching is case-sensitive and lines are returned
    without their newline.
  """
  match_keyword_dict = {}
  # Context managers make sure both logs are flushed and closed on errors too.
  with open("icd_keyword_found.txt", "w") as found_log, \
       open("icd_keyword_match.txt", "w") as match_log:
    for key, kerword_set in wrong_kerword_list.items():
      match_dicts = {}
      # Stays empty in the per-page log line when the page has no phrases.
      key_phrase = ""
      if kerword_set:
        # The page text does not depend on the phrase: read it once per page.
        with open(sample_text_list[key], "r") as f:
          file_txt = f.read()

      for key_phrase in kerword_set:
        match_log.write(f"Page-{key} | {key_phrase} |\n\n")

        # Escape the phrase so "(", ".", "+" etc. are matched literally.
        # "(?:\n|$)" also accepts the last line of a file that has no
        # trailing newline.
        pattern = rf"([^\n]*{re.escape(key_phrase)}[^\n]*(?:\n|$))"
        match_list = re.findall(pattern, file_txt)
        if match_list:
          match_dicts[key_phrase] = [match.replace("\n", "") for match in match_list]

      match_keyword_dict[key] = match_dicts
      # Per-page summary; the phrase shown is the last one processed.
      found_log.write(f"Page-{key} | {key_phrase} | {match_dicts}|\n\n")

  return match_keyword_dict


def purge(file_path):
  """Delete every file whose path matches the glob pattern ``file_path``."""
  matched_paths = glob.glob(file_path)
  for matched_path in matched_paths:
    os.remove(matched_path)

## Single Searching & Highlighting

In [5]:
# Step-0: Load the prerequisite pipelines
# General English pipeline (not referenced by the steps below in this notebook)
nlp_keyword = spacy.load('en_core_web_sm')

# Blank English pipeline with an entity ruler whose patterns are loaded from
# the ICD-10 pattern file; its entities are the codes found by search_icd_code
nlp_code10 = English()
nlp_code10.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v3.jsonl")

# Same setup for ICD-9 codes (currently disabled)
#nlp_code9 = English()
#nlp_code9.add_pipe("entity_ruler").from_disk("./icd9_code_patterns-v1.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x7f4456fba740>

In [33]:
# Remove the per-page PDFs and text files left over from a previous run
purge("pdf-files/*.pdf")
purge("txt-files/*.txt")

In [34]:
# Step-1: Splitting the pdf file into one pdf per page
pdf_file_name = "APS_24680000R_final.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: Extracting text from each single-page pdf
txt_list = extract_text_from_pdf(pdf_list)

In [35]:
# Step-3: Searching ICD-10 codes; the result maps page number -> list of codes
page_code10_dict = search_icd_code(txt_list, nlp_code10, code_type="ICD-10")

In [36]:
%%time

# Step-4: Get the closest match of ICD-10 keywords for every page
# (the result maps list index of the page text file -> set of keywords)
wrong_keyword_dict = get_wrong_keyword_dict(txt_list)

Error for file[txt-files/page-48.txt] is:
unbalanced parenthesis at position 4
CPU times: user 3min 51s, sys: 902 ms, total: 3min 52s
Wall time: 3min 52s


In [None]:
wrong_keyword_dict

In [29]:
%%time

# Step-5: Extract the text lines that contain each ICD-10 keyword
# (the result maps page index -> {keyword: [matching lines]})
icd_keywords_dict = extract_sentence(wrong_keyword_dict, txt_list)

CPU times: user 10.2 s, sys: 58 ms, total: 10.2 s
Wall time: 10.2 s


In [None]:
icd_keywords_dict[21]

In [31]:
# Step-6: Highlighting ICD-10 codes and keyword lines in the pdf.
# ICD-9 highlighting is switched off by passing icd9_code_dict=None.
pdf_output_file, txt_output_file = highlight_icd_code_and_keyword(page_code10_dict, 
                                                                  icd9_code_dict=None, 
                                                                  icd_keywords_dict=icd_keywords_dict,
                                                                  pdf_file_name="APS_24680000R_final.pdf", 
                                                                  cords_file_name="APS_24680000R_final_cords.txt")
print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code and keyword")
print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

File[APS_24680000R_final_output.pdf] is saved after highlighting ICD-10 code and keyword
Highlighted coordinates are saved into [APS_24680000R_final_cords.txt] file.


## Multiple Searching & Highlighting

In [6]:
# Step-0: Load the prerequisite pipelines
# General English pipeline (not referenced by the batch loop below)
nlp_keyword = spacy.load('en_core_web_sm')

# Blank English pipeline with an entity ruler whose patterns are loaded from
# the ICD-10 pattern file; its entities are the codes found by search_icd_code
nlp_code10 = English()
nlp_code10.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v3.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x7f93095124b0>

In [None]:
%%time

for pdf_file in os.listdir(ocr_pdf_files_path):
  # Process source PDFs only: the directory may also hold the *_cords.txt logs
  # and *_output.pdf results written by an earlier run of this loop.
  if not pdf_file.lower().endswith(".pdf") or pdf_file.endswith("_output.pdf"):
    continue

  pdf_file_name = f"{ocr_pdf_files_path}/{pdf_file}"
  cords_file_name = f"{pdf_file_name.split('.')[0]}_cords.txt"

  try:
    # Step-1: splitting pdf file
    pdf_list = split_pdf(pdf_file_name)

    # Step-2: Extracting text from pdf
    txt_list = extract_text_from_pdf(pdf_list)

    # Step-3: Searching ICD-10 code
    icd10_code_dict = search_icd_code(txt_list, nlp_code10, code_type="ICD-10")

    # Step-4: Get closest match of ICD-10 keyword
    wrong_keyword_dict = get_wrong_keyword_dict(txt_list)

    # Step-5: Extract sentence of ICD-10 keyword
    icd_keywords_dict = extract_sentence(wrong_keyword_dict, txt_list)

    # Step-6: Highlighting ICD-10 code and keyword into pdf
    pdf_output_file, txt_output_file = highlight_icd_code_and_keyword(icd10_code_dict,
                                                                      icd9_code_dict=None,
                                                                      icd_keywords_dict=icd_keywords_dict,
                                                                      pdf_file_name=pdf_file_name,
                                                                      cords_file_name=cords_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code and keyword")
    print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")
  finally:
    # remove all pdf and text files, even when a step failed, so the pages of
    # this document never leak into the next one
    purge("pdf-files/*.pdf")
    purge("txt-files/*.txt")
    pdf_list = []
    txt_list = []

In [None]:
!mv ocr-pdf-files ocr-pdf-files2

In [8]:
!rm -rf ocr-pdf-files
!mkdir -p ocr-pdf-files

In [None]:
!cp -r ocr-pdf-files2/*.pdf ocr-pdf-files/

In [7]:
!mkdir -p ocr-pdf-files2

In [9]:
# Clear generated artefacts: coordinate logs and highlighted PDFs in the input
# directory, plus the per-page working files
purge("ocr-pdf-files/*.txt")
purge("ocr-pdf-files/*_output.pdf")
purge("pdf-files/*.pdf")
purge("txt-files/*.txt")

In [None]:
!zip output.zip ocr-pdf-files/*_cords.txt ocr-pdf-files/*_output.pdf