<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/19_icd_10_code_common_keyword_and_keyword_impairment_highlighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

In [None]:
!pip install fuzzywuzzy

In [1]:
import re
import os
import glob
import json

import numpy as np
import pandas as pd

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfReader, PdfFileWriter, PdfWriter

from spacy.lang.en import English

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz

from concurrent import futures
from keyword_extraction import call
from filter_dict import filter_duplicate_coordinate



In [None]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')

In [5]:
!mkdir -p input_files

##Core Classes

In [2]:
class Highlighter:
    def __init__(self):
      # loading and updating patterns for ICD-10 code
      self.nlp_code10 = English()
      self.nlp_code10.add_pipe("entity_ruler").from_disk("icd10_code_patterns-v5.jsonl")

      # loading stop words
      self.stop_words = set(stopwords.words('english'))

      # define icd-10 code dataset
      self.code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")

      # define required directory path
      self.PDF_FILES_PATH = "pdf-files"
      self.TXT_FILES_PATH = "txt-files"
      self.OUTPUT_FILES_PATH = "output"
      create_directory(self.PDF_FILES_PATH)
      create_directory(self.TXT_FILES_PATH)
      create_directory(self.OUTPUT_FILES_PATH)

    def set_impairment_keyword_dict(self, imp_keyword_dict):
      self.impairment_keyword_dict = imp_keyword_dict

    def split_pdf(self, pdf_path):
        pdf_in_file = open(pdf_path, "rb")
        pdf = PdfReader(pdf_in_file)
        pdf_list = []
        for page in range(len(pdf.pages)):
            input_pdf = PdfReader(pdf_in_file)
            output = PdfWriter()
            output.add_page(input_pdf.pages[page])
            with open(f"{self.PDF_FILES_PATH}/page-{page}.pdf", "wb") as outputStream:
                output.write(outputStream)
                pdf_list.append(f"page-{page}.pdf")
        return pdf_list

    def extract_text_from_pdf(self, pdf_list):
        txt_file_list = []
        i = 0
        for pdf_file in pdf_list:
            with open(os.path.join(self.PDF_FILES_PATH, pdf_file), "rb") as f:
                pdf = pdftotext.PDF(f)

            # Read all the text into one string
            pdf_text = "\n\n".join(pdf)

            # write text into file
            with open(f"{self.TXT_FILES_PATH}/page-{str(i)}.txt", "a") as f:
                f.write(pdf_text)
            txt_file_list.append(f"{self.TXT_FILES_PATH}/page-{str(i)}.txt")
            i += 1
        self.text_list = txt_file_list
        return txt_file_list

    def highlight_icd_code(self, icd10_code_dict, match_threshold=30, coordinate=False, pdf_file_name=None, cords_file_name=None):
        pdf_file = fitz.open(pdf_file_name)
        # create file to write coordinate
        txt_output_file_name = open(f"{self.OUTPUT_FILES_PATH}/{cords_file_name}", "a")
        json_data = []
        json_imp_data = []
        # add file header
        txt_output_file_name.write("| Page | Found Code | Actual ICD10-Code | ICD 10 description | Common Words | Matched paragraph | Confidence |\n ")

        def highlight_code(page_obj, p_highlight, match_score=0):
            # highlight code if threshold is more than match threshold and common words exists
            if match_score >= match_threshold:
                page_highlight = page_obj.add_highlight_annot(p_highlight)
                page_highlight.set_colors(stroke=[0.66, 1, 0.07])  # light green
                page_highlight.update()
            else:
                # highlight and set color coding dont have common words
                page_highlight = page.add_highlight_annot(p_highlight)
                page_highlight.set_colors(stroke=[0.92, 0.59, 0.48])  # dark salmon
                page_highlight.update()

        def highlight_alternate_code(page_obj, p_code):
          for alt_code in self.get_alternate_code_pattern(p_code):
            highlight_list = page.search_for(alt_code)
            for highlight in highlight_list:
              if len(highlight) > 0:
                #print(f"alternate-code:{code}, Coordinate: {highlight}")
                self.code_y2_coords_list.extend([highlight[3]])
                #print(f"[highlight[3]]: {[highlight[3]]}")
                # highlight pdf for option pattern
                highlight_code(page_obj, highlight)

        def highlight_common_words(page_obj, common_words_coord_dict):
            highlight_coords_dict = {}
            highlighted_word_list = []
            prev_common_word = ""
            for common_word, common_word_coord in common_words_coord_dict.items():
                highlight = page_obj.add_highlight_annot(common_word_coord)
                highlight.update()
                if prev_common_word.lower() != common_word.lower():
                  highlight_coords_dict[common_word] = common_word_coord
                  highlighted_word_list.append(common_word)
                  prev_common_word = common_word
            return highlight_coords_dict, highlighted_word_list

        def highlight_keyword_impairment(page_obj, imp_keywords_coord_dict):
            highlight_coords_dict = {}
            for keyword_impairment, imp_keywords_coords in imp_keywords_coord_dict.items():
              #print(f"keyword_impairment-1: {keyword_impairment}")
              for imp_keywords_coord in imp_keywords_coords:
                if self.is_exact_match(page_obj, keyword_impairment, imp_keywords_coord):
                  #print(f"keyword_impairment: {keyword_impairment}")
                  highlight = page_obj.add_highlight_annot(imp_keywords_coord)
                  highlight.set_colors(stroke=[0.529, 0.807, 0.921])  # sky blue
                  highlight.update()
                  highlight_coords_dict[f"{keyword_impairment}"] = f"{imp_keywords_coords}"
            return highlight_coords_dict

        def filter_overlaping_coordinate(highlight_list):
          filtered_coord_list = []
          #if len(highlight_list) > 1:
          for i in range(len(highlight_list)):
            for j in range(i + 1, len(highlight_list)):
              if abs(highlight_list[i][2] - highlight_list[j][2]) > 10:
                filtered_coord_list.append(highlight_list[i])
              else:
                print("#" * 20)
                print(highlight_list[i])
                print("#" * 20)
          return list(set(filtered_coord_list))

        for page_num, page in enumerate(pdf_file):
          already_done_page_list = []
          self.code_y2_coords_list = []
          self.already_found_word_list = []
          code_highlight_dict = {}
          
          # highlight ICD-10 code
          if page_num in icd10_code_dict:
            # let's store code y2 coords 
            for code in icd10_code_dict[page_num]:
              highlight_list = page.search_for(code)
              if len(highlight_list) > 0:
                code_highlight_dict[code] = highlight_list
                self.code_y2_coords_list.extend([highlight[3] for highlight in highlight_list])
              else:
                for alt_code in self.get_alternate_code_pattern(code):
                  highlight_list = page.search_for(alt_code)
                  for highlight in highlight_list:
                    if len(highlight) > 0:
                      code_highlight_dict[code] = [highlight]
                      #print(f"alternate-code:{code}, Coordinate: {highlight}")
                      self.code_y2_coords_list.extend([highlight[3]])
                      #print(f"[highlight[3]]: {[highlight[3]]}")

            filtered_code_highlight_dict = filter_duplicate_coordinate(code_highlight_dict)
            # now, let highlight every code and its common words
            for code, highlight_list in filtered_code_highlight_dict.items():
                num_page = page_num + 1
                print(f"Page-{num_page}-{code}, Coordinate: {highlight_list}")
                # highlight_list = page.search_for(code)
                #print(f"Page-{num_page}-{code}, Coordinate: {highlight_list}")
                
                # highlight_list = fn(highlight_list)
                # highlight code that dont have coordinate using alternate pattern
                if len(highlight_list) == 0:
                  highlight_alternate_code(page, code)
                
                for highlight in highlight_list:
                    json_data_object = {}
                    keyword = self.get_keyword(code)
                    # get match score and common words coordinate
                    all_match_keyword_list, all_match_imp_keyword_list = self.get_best_token_match(page_num, code, page, highlight[3], match_threshold)

                    # Step-1: highlight and set color coding dont have common words
                    if not all_match_keyword_list:
                        page_highlight = page.add_highlight_annot(highlight)
                        page_highlight.set_colors(stroke=[0.92, 0.59, 0.48])  # dark salmon
                        page_highlight.update()
                        # prepare json object
                        json_data_object[f"page-{num_page}"] = {
                          "icd10_code": reverse_code_pattern(code),
                          "page_icd10_code": code,
                          "page_icd10_code_coord": f"{highlight}",
                          "icd10_code_desc": keyword if keyword else "UNVALIDATED",
                          "page_icd_common_words": "UNVALIDATED",
                          "page_icd_keywords_coord": {},
                          "page_icd_keywords_found": False,
                          "match_confidence": 0
                        }
                        json_data.append(json_data_object)
                        # log data to file
                        code_cors_output = (f"|Page-{num_page} | {code} | {reverse_code_pattern(code)}"
                                            f"| {keyword if keyword else 'No keyword found'} | 'UNVALIDATED' | 0 |")
                        txt_output_file_name.write("%s\n" % code_cors_output)
                        if coordinate:
                            txt_output_file_name.write("%s\n" % f"|{reverse_code_pattern(code)}:{highlight}|")

                    # Step-2: highlight and set color coding that have common words
                    for match_keyword_dict in all_match_keyword_list:
                      # highlight ICD-10 code
                      highlight_code(page, highlight, match_keyword_dict["score"])
                      # highlight common words
                      highlight_coords_dict = {}
                      highlight_coords_dict, highlighted_word_list = highlight_common_words(page, match_keyword_dict["common_words_coords"])
                      # write all info into text file
                      if match_keyword_dict["score"] >= match_threshold:
                          # | Page | Found Code | Actual ICD10-Code | ICD 10 description | Common Words | confidence |
                          code_cors_output = (f"|Page-{num_page} | {code} | {reverse_code_pattern(code)}"
                                              f" | {keyword if keyword else 'No keyword found'}"
                                              f" | {match_keyword_dict['common_words'] if len(match_keyword_dict['common_words']) > 0 else 'UNVALIDATED'}"
                                              f" | {match_keyword_dict['paragraph']} | {match_keyword_dict['score']} |")
                          txt_output_file_name.write("%s\n" % code_cors_output)
                      else:
                          code_cors_output = (
                              f"|Page-{num_page} | {code} | {reverse_code_pattern(code)} | {keyword if keyword else 'No keyword found'}"
                              f" | {match_keyword_dict['common_words'] if len(match_keyword_dict['common_words']) > 0 else 'UNVALIDATED'}"
                              f" | {match_keyword_dict['paragraph']} | {match_keyword_dict['score']} |")
                          txt_output_file_name.write("%s\n" % code_cors_output)
                      if coordinate:
                          txt_output_file_name.write("%s\n" % f"|{reverse_code_pattern(code)}:{highlight}|")
                          txt_output_file_name.write("%s\n" % f"|{highlight_coords_dict}|")

                      # prepare json object
                      json_data_object[f"page-{num_page}"] = {
                          "icd10_code": reverse_code_pattern(code),
                          "page_icd10_code": code,
                          "page_icd10_code_coord": f"{highlight}",
                          "icd10_code_desc": keyword if keyword else "UNVALIDATED",
                          "page_icd_common_words": list(set([w.lower() for w in match_keyword_dict["common_words"]])) if len(match_keyword_dict["common_words"]) > 0 else 'UNVALIDATED',
                          "page_icd_keywords_coord": f"{highlight_coords_dict}",
                          "page_icd_keywords_found": True if len(match_keyword_dict["common_words"]) > 0 else False,
                          "match_confidence": match_keyword_dict['score']
                      }
                      json_data.append(json_data_object)
                    
                    # Step-3: highlight and set color coding for impairment keyword common words
                    json_imp_data_list = []
                    for match_imp_keyword_dict in all_match_imp_keyword_list:
                      json_imp_data_object = {}
                      # highlight common words
                      highlight_coords_dict = {}
                      if "imp_common_keyword_coords" in match_imp_keyword_dict:
                        highlight_coords_dict = highlight_keyword_impairment(page, match_imp_keyword_dict["imp_common_keyword_coords"])
                        already_done_page_list.append(page_num)
                        if coordinate:
                          txt_output_file_name.write("%s\n" % f"|{highlight_coords_dict}|")
                        # prepare json object
                        # json_imp_data_object[f"page-{num_page}"] = {"impairment_keyword_coords": highlight_coords_dict}
                        if highlight_coords_dict:
                          json_imp_data_list.append(highlight_coords_dict)
                    if len(json_imp_data_list) > 0:
                      json_imp_data.append({
                        f"page-{num_page}": json_imp_data_list
                      })

                    txt_output_file_name.write("\n")  # add extra line on every match code
          
          # highlight ICD key phrase
          if page_num in self.impairment_keyword_dict and page_num not in already_done_page_list:
            json_imp_data_list = []
            num_page = page_num + 1
            key_phrase_sents = self.impairment_keyword_dict[page_num]
            for key_phrase_sent in key_phrase_sents:
              coordinates = page.search_for(key_phrase_sent)
              cords_list = []
              for inst in coordinates:
                # ALL, all, All, Authorization
                if self.is_exact_match(page, key_phrase_sent, inst):

                  highlight = page.add_highlight_annot(inst)
                  highlight.set_colors(stroke=[0.529, 0.807, 0.921])  # sky blue
                  highlight.update()
                  cords_list.append(inst)
              if len(cords_list) > 0:
                json_imp_data_list.append({key_phrase_sent: f"{cords_list}"})
            if len(json_imp_data_list) > 0:
              json_imp_data.append({
                f"page-{num_page}": json_imp_data_list
              })

        # build and write json file
        json_data_dump_file = f"{self.OUTPUT_FILES_PATH}/{pdf_file_name.split('/')[1].split('.')[0]}.json"
        with open(json_data_dump_file, "w") as json_out_file:
          json.dump(json_data, json_out_file)

        json_imp_data_dump_file = f"{self.OUTPUT_FILES_PATH}/{pdf_file_name.split('/')[1].split('.')[0]}_impairment.json"
        with open(json_imp_data_dump_file, "w") as json_out_file:
          json.dump(json_imp_data, json_out_file)

        # close log file
        txt_output_file_name.close()
        # create highlighted pdf file
        pdf_output_file_name = f"{self.OUTPUT_FILES_PATH}/{pdf_file_name.split('/')[1].split('.')[0]}_output.pdf"
        pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)

        return pdf_output_file_name, cords_file_name

    def get_best_token_match(self, page_num, p_code, page_obj, code_y2_coord, match_threshold):
      all_match_keyword_list = []
      all_match_imp_keyword_list = []

      # Step 1: reverse code pattern
      reversed_icd_code = reverse_code_pattern(p_code)
      # Step 2: fetch keyword based on code
      keyword = self.get_keyword(reversed_icd_code)
      # Step 3: get code paragraph
      code_paragraph_list, non_code_paragraph_list = self.get_paragraph(page_obj, code_y2_coord)
      #print(f"code_paragraph_list: {non_code_paragraph_list}")
      
      # Step 4: prepare code_paragraph for common words
      for code_paragraph in code_paragraph_list:
        match_keyword_dict = {}
        common_words = self.get_common_words(keyword, code_paragraph)
        if len(common_words) > 0:
          match_keyword_dict["common_words"] = common_words
          # Step 4: get best token match ratio
          clean_paragraph = " ".join(self.clean_text(code_paragraph))
          match_keyword_dict["paragraph"] = clean_paragraph
          # match_score = fuzz.token_set_ratio(keyword, clean_paragraph) 
          # if fuzz.token_set_ratio(keyword, clean_paragraph) >= match_threshold else 0
          match_keyword_dict["score"] = fuzz.token_set_ratio(keyword, clean_paragraph) 
          # Step 5: build common words coordinate dict
          common_words_coord_dict = {}
          for common_word in common_words:
            highlight_list = page_obj.search_for(common_word)
            for highlight in highlight_list:
              # get common word y2 coord value
              common_word_y2_coords = highlight[3]
              if (code_y2_coord - 20) <= common_word_y2_coords <= (code_y2_coord + 20):
                  common_words_coord_dict[common_word] = highlight
                  #self.already_found_word_list.append(common_word)
          match_keyword_dict["common_words_coords"] = common_words_coord_dict
          all_match_keyword_list.append(match_keyword_dict)

      def get_keyword_impairment_list(p_paragraph):
        keyword_impairments = []
        for keyword_impairment in self.impairment_keyword_dict[page_num]:
          keyword_impairment_found = self.get_common_words(keyword_impairment, p_paragraph)
          if len(keyword_impairment_found) > 0:
            keyword_impairments.append(keyword_impairment)
        return keyword_impairments

      # Step 5: prepare non code_paragraph for common words
      for non_code_paragraph in non_code_paragraph_list:
        match_imp_keyword_dict = {}
        imp_common_words_coord_dict = {}
        keyword_impairment_list = get_keyword_impairment_list(non_code_paragraph)
        for keyword_impairment in keyword_impairment_list:
          #print(f"keyword_impairment11: {keyword_impairment}")
          highlight_list = page_obj.search_for(keyword_impairment)
          #print(f"highlight_list: {highlight_list}")
          #print(f"code_y2_coords_list: {self.code_y2_coords_list}")
          highlight_list = [highlight for highlight in highlight_list if highlight[3] not in self.code_y2_coords_list]
          #print(f"already_found_word_list: {self.already_found_word_list}")
          #print(f"highlight_list: {highlight_list}")
          if keyword_impairment not in self.already_found_word_list and len(highlight_list) > 0:
            self.already_found_word_list.append(keyword_impairment)
            imp_common_words_coord_dict[f"{keyword_impairment}"] = highlight_list
            #print(f"keyword_impairment22: {keyword_impairment}, coords: {highlight_list}")
            continue
        match_imp_keyword_dict["imp_common_keyword_coords"] = imp_common_words_coord_dict
        all_match_imp_keyword_list.append(match_imp_keyword_dict)

      return all_match_keyword_list, all_match_imp_keyword_list

    def search_icd_code(self, txt_list):
      pdf_page_vocab = {}
      for txt_file in txt_list:
          with open(txt_file, "r") as f:
              page_txt = f.read()

              # check the page that have line number instead of code
              index_page = False
              if re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
                  index_page = True

              doc = self.nlp_code10(page_txt)
              code_list = []
              for ent in doc.ents:
                  if index_page:
                      # check the code contain letter "L"
                      if re.search("(L[0-9]+)", ent.text):
                          continue
                      else:
                          code_list.append(ent.text)
                  else:
                      code_list.append(ent.text)

              # code_list = [ent.text for ent in doc.ents if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", ent.text)]
              if len(code_list) != 0:
                  page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
                  pdf_page_vocab[page_number] = list(set(code_list))
                  # print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
      return pdf_page_vocab

    def get_keyword(self, p_code):
        keyword = ""
        # reverse code if required
        code = reverse_code_pattern(p_code)
        # get keyword from dataset
        keyword_list = list(self.code_df.loc[self.code_df["Code"] == code]["Keyword"])
        if len(keyword_list) > 0:
            keyword = keyword_list[0]
        return keyword

    def get_paragraph(self, page_obj, code_y2_coord):
        code_paragraph_list = []
        non_code_paragraph_list = []
        page_content = page_obj.get_text("blocks", sort=False)
        for content in page_content:
          #print(f"content found: {content[4]}")
          if content[1] <= code_y2_coord <= content[3]:
              if len(self.clean_text(content[4])) > 0:
                  code_paragraph = content[4]
                  code_paragraph_list.append(code_paragraph)
          else:
            if len(self.clean_text(content[4])) > 0:
              non_code_paragraph_list.append(content[4])
        return code_paragraph_list, non_code_paragraph_list

    def get_common_words(self, sent1, sent2):
        clean_token1 = self.clean_text(sent1)
        clean_token2 = self.clean_text(sent2)
        token_set1 = set(clean_token1)
        token_set2 = set(clean_token2)

        common_word_set = set()

        def get_common(token_set1, token_set2):
            for w1 in token_set1:
                for w2 in token_set2:
                    if w1.lower() == w2.lower():
                        common_word_set.add(w1)

        get_common(token_set1, token_set2)
        get_common(token_set2, token_set1)
        return list(common_word_set)

    def clean_text(self, sent):
        # tokenize sentence
        sent1 = word_tokenize(sent)
        # filter stop words
        filtered_sent = [w for w in sent1 if not w.lower() in self.stop_words]
        filtered_sent = [w for w in filtered_sent if re.sub(re.compile('\W'), '', w)]
        clean_tokens = []
        for token in filtered_sent:
            if token.find("-"):
                tokens = token.split("-")
                clean_tokens.extend(tokens)
            else:
                clean_tokens.append(token)
        return clean_tokens

    def is_exact_match(self, page, term, clip):
      # clip is an item from page.search_for(term, quads=True)
      termLen = len(term)
      termBboxLen = max(clip.height, clip.width)
      termfontSize = termBboxLen/termLen
      f = termfontSize*2
      #clip = clip.rect
      text_block = page.get_text("blocks", clip = clip + (-f, -f, f, f), flags=0)[0][4]
      #re.sub(r"[^a-zA-Z\d\s:]", "", "(HIV]")
      # if re.sub(r"[^a-zA-Z\d\s:]", "", text_block.strip()) in [t for t in term.split()]:
      if re.sub(r"[^a-zA-Z\d\s:]", "", text_block.lower().strip()) in [t.lower() for t in term.split()]:
        return True
      else:
        return False

    def get_alternate_code_pattern(self, p_code):
      # create alternate pattern
      code_patterns = []
      code_arr = p_code.split(".")
      if len(code_arr) > 1:
        code1 = f"{code_arr[0]}. {code_arr[1]}"
        code2 = f"{code_arr[0]} .{code_arr[1]}"
        code3 = f"{code_arr[0]} . {code_arr[1]}"
        code4 = f"{code_arr[0]} {code_arr[1]}"
        code44 = f"{code_arr[0]},{code_arr[1]}"
        code45 = f"{code_arr[0]}, {code_arr[1]}"
        code46 = f"{code_arr[0]} ,{code_arr[1]}"
        code47 = f"{code_arr[0]} , {code_arr[1]}"
        code_patterns.extend([code1, code2, code3, code4, code44, code45, code46, code47])
        # handle if the first char of code is missing
        alphabats = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6", "o": "9", "i": "1"}
        for key, val in alphabats.items():
          if p_code.startswith(key):
            code5 = p_code.replace(key, val)
            code_patterns.extend([code5])
          # replcae char on 1 index if it is not present in icd9 code dataset
          if p_code.find(val) == 1:
            code6 = replacer(p_code, key, 1)
            code_patterns.extend([code6])
            # replcae char on 2 index
            if p_code.find(val) == 2:
              code7 = replacer(code6, key, 2)
              code_patterns.extend([code7])
      return code_patterns

def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx]


def reverse_code_pattern(p_code):
    orig_code = p_code

    # check for code contains space(" ")
    tmp_code = orig_code.split(" ")
    if len(tmp_code) > 1:
        orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

    # check for code contains dot(".")
    tmp_code = p_code.split(".")
    if len(tmp_code) > 1:
        orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

    # check for code contains comma(",")
    tmp_code = p_code.split(",")
    if len(tmp_code) == 2:
        orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"
    elif len(tmp_code) == 2:
        orig_code = f"{tmp_code[0].strip()}.{tmp_code[2].strip()}"

    # handle if the first char of code is missing
    alphabats = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6", "o": "9", "i": "1"}
    for key, val in alphabats.items():
        # replcae char on 0 index
        if orig_code.find(val) == 0:
            # orig_code = orig_code.replace(val, key)
            orig_code = replacer(orig_code, key, 0)
        # replcae char on 1 index
        if orig_code.find(key) == 1:
            orig_code = replacer(orig_code, val, 1)
            # replcae char on 2 index
            if orig_code.find(key) == 2:
                orig_code = replacer(orig_code, val, 2)
            break

    return orig_code


def replacer(s, newstring, index, nofail=False):
    # raise an error if index is outside of the string
    if not nofail and index not in range(len(s)):
        raise ValueError("index outside given string")

    # if not erroring, but the index is still not in the correct range..
    if index < 0:  # add it to the beginning
        return newstring + s
    if index > len(s):  # add it to the end
        return s + newstring

    # insert the new string between "slices" of the original
    return s[:index] + newstring + s[index + 1:]


def create_directory(dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

In [None]:
re.sub(r"[^a-zA-Z\d\s:]", "", "back".strip()) in ["Back", "Pain"]

False

##Sentence Extractor

In [3]:
class SentenceExtractor:
  def __init__(self):
    self.MAX_WORKERS = 20

  def get_json_array_list(self, text_path):
    json_arr = None
    try:
      # print(f"Running '{text_path}'")
      json_arr = call(text_path)
      #print(f"Got json for '{json_arr}'")
    except Exception as err:
      print(f"Error for file[{text_path}] is:\n{err}")
    return json_arr

  def get_wrong_keyword_dict(self, text_files_list, with_thread=False, with_process=False):
      def get_sorted_dict(p_json_arr_list):
          wrong_keyword_dict = {
              idx: set([list(element.values())[0] for element in json_arr if json_arr])
              for idx, json_arr in enumerate(p_json_arr_list)
          }
          return dict(sorted(wrong_keyword_dict.items(), key=lambda item: item[0]))

      if with_thread:
          # take care so that unnecessary thread should not be created
          workers = min(self.MAX_WORKERS, len(text_files_list))
          with futures.ThreadPoolExecutor(max_workers=workers) as executor:
              json_arr_list = executor.map(self.get_json_array_list, text_files_list)
          return get_sorted_dict(json_arr_list)
      if with_process:
          with futures.ProcessPoolExecutor(max_workers=4) as executor:
              json_arr_list = executor.map(self.get_json_array_list, text_files_list)
          return get_sorted_dict(json_arr_list)
      else:
          json_arr_list = list(map(self.get_json_array_list, text_files_list))
          tmp_wrong_keyword_dict = {
              idx: set([list(element.values())[0] for element in json_arr if json_arr is not None])
              for idx, json_arr in enumerate(json_arr_list)
          }
          return tmp_wrong_keyword_dict

  def extract_sentence(self, wrong_keyword_list, sample_text_list):
      match_keyword_dict = {}
      for key, keyword_set in wrong_keyword_list.items():
          match_dicts = {}
          for key_phrase in keyword_set:
              # print(key, key_phrase)
              with open(sample_text_list[key], "r") as f:
                  file_txt = f.read()
              # match_list = re.findall(f"([^\n]*?(?i){key_phrase}[^.]*\.)", file_txt)
              match_list = re.findall(f"([^\n]*{key_phrase}[^\n]*\n)", file_txt)
              if match_list:
                  match_dicts[key_phrase] = [match.replace("\n", "") for match in match_list]
          match_keyword_dict[key] = match_dicts
      return match_keyword_dict

##Keyword Matching & Highlighting 

In [4]:
!rm -rf input_files
!mkdir -p input_files
!rm -rf output
!mkdir -p output

In [5]:
def purge(file_path):
  for f in glob.glob(file_path):
    os.remove(f)

In [6]:
# Step-0: create highlighter instance
INPUT_PDF_FILES_PATH = "input_files"

highlighter = Highlighter()
sent_extractor = SentenceExtractor()

In [7]:
%%time
wrong_keyword_dict1 = None
icd10_code_dict1 = None
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"
  cords_file_name = f"{pdf_file_name.split('/')[1].split('.')[0]}_cords.txt"

  # Step-1: splitting pdf file
  pdf_list = highlighter.split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = highlighter.extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  icd10_code_dict = highlighter.search_icd_code(txt_list)
  icd10_code_dict1 = icd10_code_dict

  # Step-4: Get closet match of ICD-10 keyword
  wrong_keyword_dict = sent_extractor.get_wrong_keyword_dict(txt_list)
  wrong_keyword_dict1 = wrong_keyword_dict
  highlighter.set_impairment_keyword_dict(wrong_keyword_dict)

  # Step-4: Highlighting ICD-10 code into pdf
  pdf_output_file, txt_output_file = highlighter.highlight_icd_code(icd10_code_dict,
                                                                    match_threshold=35,
                                                                    coordinate=True,
                                                                    pdf_file_name=pdf_file_name,
                                                                    cords_file_name=cords_file_name)
  print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
  print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

  # remove all pdf and text files
  purge("pdf-files/*.pdf")
  purge("txt-files/*.txt")
  pdf_list = []
  txt_list = []

Page-8-Z20.822, Coordinate: [Rect(311.13006591796875, 114.6400146484375, 337.1675720214844, 122.6439208984375)]
Page-8-R05.9, Coordinate: [Rect(135.0170440673828, 307.5999450683594, 152.56533813476562, 315.6038513183594)]
Page-8-E78.3, Coordinate: [Rect(226.35824584960938, 520.47998046875, 243.45660400390625, 528.48388671875)]
Page-9-E78.3, Coordinate: [Rect(226.3582305908203, 610.0399780273438, 243.45651245117188, 617.0433959960938)]
Page-10-E78.3, Coordinate: [Rect(226.35824584960938, 492.6400146484375, 243.45660400390625, 500.6439208984375)]
Page-11-R05.9, Coordinate: [Rect(135.01699829101562, 426.8800048828125, 152.56529235839844, 434.8839111328125)]
Page-11-Z12.5, Coordinate: [Rect(127.9977035522461, 609.760009765625, 145.24603271484375, 617.763916015625)]
Page-11-200.00, Coordinate: [Rect(344.1510009765625, 599.4400024414062, 364.45306396484375, 607.4439086914062)]
Page-12-Z00.00, Coordinate: [Rect(344.9269104003906, 74.320068359375, 365.5649108886719, 82.323974609375), Rect(344.

In [None]:
purge("pdf-files/*.pdf")
purge("txt-files/*.txt")

In [8]:
!zip output.zip output/*.*

  adding: output/APS_38600000R_final_cords.txt (deflated 79%)
  adding: output/APS_38600000R_final_impairment.json (deflated 71%)
  adding: output/APS_38600000R_final.json (deflated 84%)
  adding: output/APS_38600000R_final_output.pdf (deflated 2%)


In [None]:
icd10_code_dict1[28]

['R03.0', '268.25', 'R53.83', '278.9', 'E78.3']

In [9]:
wrong_keyword_dict1[1]

{'AH',
 'AIDS',
 'ALL',
 'HHS',
 'HIV',
 'Human Immunodeficiency Virus',
 'Infectious Disease',
 'TEN'}

In [None]:
wrong_keyword_dict1[60]

{'Atrial fibrillation',
 'Back Pain',
 'HTN',
 'Headache',
 'Hypertension',
 'Hypertriglyceridemia',
 'Neurological symptoms',
 'Overweight',
 'Sciatica',
 'WAS'}

In [None]:
wrong_keyword_dict1[0]

{'Atrial Fibrillation',
 'Headache',
 'Hypertension',
 'Hypertriglyceridemia',
 'MD',
 'MI',
 'WAS'}

In [None]:
data_df = pd.read_csv("data.csv")
print(data_df.shape)
data_df.head()

(83, 5)


Unnamed: 0,code,x1,y1,x2,y2
0,287.5,251.34584,460.959991,268.594238,468.963898
1,L73.9,116.202209,400.959991,132.06749,408.963898
2,S93.411A,328.746582,614.559998,359.288788,622.563904
3,K46.9,321.942444,554.799988,338.207672,562.803894
4,R50.9,100.550339,350.079987,117.348732,358.083893


In [None]:
with open("kcaps.txt", "r") as f:
  all_text = f.read()
keyword_caps_list = all_text.split("\n")

In [None]:
def is_upper_keyword(p_keyword):
  if len(p_keyword.split()) == 1 and p_keyword.isupper():
    return True
  else:
    return False

In [None]:
is_upper_keyword("Authorazation")

##ICD 10 Code

In [None]:
!mkdir -p pdf-files
!mkdir -p txt-files

In [None]:
# define directory path after creating it
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"

# create nlp instance
nlp = English()


def split_pdf(pdf_path):
  pdf_in_file = open(pdf_path, "rb")
  pdf = PdfReader(pdf_in_file)
  pdf_list = []
  for page in range(len(pdf.pages)):
      inputpdf = PdfReader(pdf_in_file)
      output = PdfWriter()
      output.add_page(inputpdf.pages[page])
      with open(f"{pdf_files_path}/page-{page}.pdf", "wb") as outputStream:
          output.write(outputStream)
          pdf_list.append(f"page-{page}.pdf")
  return pdf_list


def extract_text_from_pdf(pdf_list):
    txt_file_list = []
    i = 0
    for pdf_file in pdf_list:
        with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
            pdf = pdftotext.PDF(f)

        # Read all the text into one string
        pdf_text = "\n\n".join(pdf)

        # write text into file
        with open(f"{txt_files_path}/page-{str(i)}.txt", "a") as f:
            f.write(pdf_text)
        txt_file_list.append(f"{txt_files_path}/page-{str(i)}.txt")
        i += 1
    return txt_file_list


def get_opt_pattern(icd_10_code):
  # create alternate pattern
  code_arr = icd_10_code.split(".")
  if len(code_arr) > 1:
    code1 = f"{code_arr[0]}. {code_arr[1]}"
    code2 = f"{code_arr[0]} .{code_arr[1]}"
    code3 = f"{code_arr[0]} . {code_arr[1]}"
    return [code1, code2, code3]
  else:
    return icd_10_code


def highlight_icd10_code(pdf_page_dict: dict, pdf_file_name: str):
    pdf_file = fitz.open(pdf_file_name)

    def highlight_pdf(highlight):
        for inst in highlight:
          highlight = page.add_highlight_annot(inst)
          highlight.update()
          highlight = page.search_for(text_to_be_highlighted)
          print(f"Page-{page_num}: ", code, highlight, end='\n')

    for page_num, page in enumerate(pdf_file):
        if page_num in pdf_page_dict:
          for code in pdf_page_dict[page_num]:
            text_to_be_highlighted = code
            highlight = page.search_for(text_to_be_highlighted)
            print(f"Page-{page_num}: ", code, highlight, end='\n')
            if len(highlight) == 0:
                alternate_code_list = get_opt_pattern(code)
                for alt_code in alternate_code_list:
                  text_to_be_highlighted = alt_code
                  highlight = page.search_for(text_to_be_highlighted)
                  # highlight pdf for option pattern
                  highlight_pdf(highlight)
            # highlight pdf for main pattern
            highlight_pdf(highlight)

    output_pdf_file_name = f"{pdf_file_name.split('.')[0]}_output.pdf"
    pdf_file.save(output_pdf_file_name, garbage=4, deflate=True, clean=True)
    return output_pdf_file_name


def search_icd_10_code(txt_list):
  pdf_page_vocab = {}
  for txt_file in txt_list:
    with open(txt_file, "r") as f:
      page_txt = f.read()
      # filter the page that have line number instead of code
      if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
        doc = nlp(page_txt)
        code_list = [ent.text for ent in doc.ents]
        if len(code_list) != 0:
          #print(txt_file)
          page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
          pdf_page_vocab[page_number] = code_list
          # print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
  return pdf_page_vocab

In [None]:
# Step-1: splitting pdf file
pdf_file_name = "28page.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf
txt_list = extract_text_from_pdf(pdf_list)

# Step-3: loading and updating patterns to Spacy
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v1.jsonl")

# Step-4: Searching ICD-10 code
#print (txt_list)
pdf_page_vocab = search_icd_10_code(txt_list)

# Step-5: Highlighting ICD-10 code into pdf
output_file_name = highlight_icd10_code(pdf_page_vocab, pdf_file_name)
print(f"File[{output_file_name}] is saved after highlighting ICD-10 code")

Page-0:  R53.83 [Rect(447.8690185546875, 54.60003662109375, 471.3388671875, 63.60443115234375)]
Page-0:  R53.83 [Rect(447.8690185546875, 54.60003662109375, 471.3388671875, 63.60443115234375)]
Page-0:  R03.0 [Rect(287.4623718261719, 223.8399658203125, 305.3106994628906, 231.8438720703125)]
Page-0:  R03.0 [Rect(287.4623718261719, 223.8399658203125, 305.3106994628906, 231.8438720703125)]
Page-0:  E78.3 []
Page-0:  E78.3 [Rect(151.10546875, 232.47998046875, 173.5032958984375, 240.48388671875)]
Page-0:  R53.83 [Rect(447.8690185546875, 54.60003662109375, 471.3388671875, 63.60443115234375)]
Page-0:  R53.83 [Rect(447.8690185546875, 54.60003662109375, 471.3388671875, 63.60443115234375)]
Page-0:  R03.0 [Rect(287.4623718261719, 223.8399658203125, 305.3106994628906, 231.8438720703125)]
Page-0:  R03.0 [Rect(287.4623718261719, 223.8399658203125, 305.3106994628906, 231.8438720703125)]
Page-0:  E78.3 []
Page-0:  E78.3 [Rect(151.10546875, 232.47998046875, 173.5032958984375, 240.48388671875)]
Page-0:  R