<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/19_icd_10_code_common_keyword_and_keyword_impairment_highlighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

In [None]:
!pip install fuzzywuzzy

In [1]:
!unzip data.zip

Archive:  data.zip
   creating: data/csv_files/
  inflating: data/csv_files/icd_10_code_and_keywords_v2.csv  
  inflating: data/csv_files/icd_10_code_and_keywords_v3.csv  
  inflating: data/csv_files/synid_and_keywords_impairment.csv  
   creating: data/pattern_files/
  inflating: data/pattern_files/icd10_code_patterns-v6.jsonl  
   creating: data/pkl_files/
  inflating: data/pkl_files/dict_words.pkl  
  inflating: data/pkl_files/knocaps_dict_index.pickle  
   creating: data/txt_files/
  inflating: data/txt_files/kcaps.txt  
  inflating: data/txt_files/knocaps.txt  


In [1]:
import re
import os
import glob
import json
import logging
from functools import lru_cache

import numpy as np
import pandas as pd

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfReader, PdfFileWriter, PdfWriter

from spacy.lang.en import English

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz

from dataclasses import dataclass
from array import *

from concurrent import futures
from keyword_matcher import KeywordMatcher
import config as cfg



In [None]:
import nltk

# Fetch the NLTK data used by this notebook: the stop-word corpus read via
# `stopwords.words('english')`, and "punkt" (presumably for `word_tokenize`).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

## Core Classes

In [2]:
class Highlighter:
    def __init__(self, match_threshold=30):
        """Load the matching resources and create the working directories.

        Args:
            match_threshold: Score at or above which ``highlight_code`` uses
                the "matched" colour; stored as ``self.match_threshold``.
        """
        # loading and updating patterns for ICD-10 code
        self.nlp_code10 = English()
        self.nlp_code10.add_pipe("entity_ruler").from_disk(cfg.pattern_files["jsonl"])

        # loading stop words
        self.stop_words = set(stopwords.words('english'))
        self.match_threshold = match_threshold

        # define icd-10 code dataset; NaN cells are replaced by None
        core_df = pd.read_csv(cfg.csv_files["CODE_CSV"])
        self.code_df = core_df.where(pd.notnull(core_df), None)
        self.synid_df = pd.read_csv(cfg.csv_files["IMP_CSV"])

        # State that the highlight_* methods read. It used to be created only
        # by set_impairment_keyword_dict() or inside a highlight_* run, so
        # calling e.g. highlight_keyword_impairment() or highlight_impairment()
        # first raised AttributeError. Start from empty containers instead.
        self.impairment_keyword_dict = {}
        self.code_y2_coords_list = []
        self.already_found_word_list = []
        self.highlighted_coord_list = []

        # define required directory path
        self.PDF_FILES_PATH = "pdf-files"
        self.TXT_FILES_PATH = "txt-files"
        self.OUTPUT_FILES_PATH = cfg.output_path["OUT"]
        create_directory(self.PDF_FILES_PATH)
        create_directory(self.TXT_FILES_PATH)
        create_directory(self.OUTPUT_FILES_PATH)

    def set_impairment_keyword_dict(self, imp_keyword_dict):
        self.impairment_keyword_dict = imp_keyword_dict

    def set_match_threshold(self, match_val):
        self.match_threshold = match_val

    def split_pdf(self, pdf_path):
        """Split a PDF into one single-page PDF per page.

        The pages are written to ``self.PDF_FILES_PATH`` as ``page-0.pdf``,
        ``page-1.pdf``, ... (zero-based), overwriting files of the same name.

        Args:
            pdf_path: Path of the PDF file to split.

        Returns:
            The written file names in page order. These are bare names
            without the ``self.PDF_FILES_PATH`` directory.
        """
        pdf_list = []
        # The source handle was previously left open; keep it open only while
        # pages are being copied out of it.
        with open(pdf_path, "rb") as pdf_in_file:
            # Parse the document once instead of once per page.
            reader = PdfReader(pdf_in_file)
            for page_index, source_page in enumerate(reader.pages):
                writer = PdfWriter()
                writer.add_page(source_page)
                page_file_name = f"page-{page_index}.pdf"
                with open(os.path.join(self.PDF_FILES_PATH, page_file_name), "wb") as output_stream:
                    writer.write(output_stream)
                pdf_list.append(page_file_name)
        return pdf_list

    def extract_text_from_pdf(self, pdf_list):
        """Extract the text of each PDF in ``pdf_list`` into its own text file.

        The n-th entry of ``pdf_list`` is read from ``self.PDF_FILES_PATH`` and
        its text is written to ``{self.TXT_FILES_PATH}/page-{n}.txt``.

        Args:
            pdf_list: PDF file names relative to ``self.PDF_FILES_PATH``.

        Returns:
            The text file paths in the same order as ``pdf_list``. The same
            list is also stored on ``self.text_list``.
        """
        txt_file_list = []
        for index, pdf_file in enumerate(pdf_list):
            with open(os.path.join(self.PDF_FILES_PATH, pdf_file), "rb") as pdf_stream:
                pdf = pdftotext.PDF(pdf_stream)

            # Read all the text into one string
            pdf_text = "\n\n".join(pdf)

            txt_file_path = f"{self.TXT_FILES_PATH}/page-{index}.txt"
            # Open with "w", not "a": output names depend only on the position
            # in pdf_list, so appending would stack the text of a later run
            # (or another document) on top of the text left by an earlier one.
            with open(txt_file_path, "w") as txt_stream:
                txt_stream.write(pdf_text)
            txt_file_list.append(txt_file_path)
        self.text_list = txt_file_list
        return txt_file_list

    def highlight_all(self, icd10_code_dict, pdf_file_name=None, highlight_all=False):
        """Highlight ICD-10 codes, their common words and impairment keywords.

        For each page, every code listed for it in ``icd10_code_dict`` is
        searched (falling back to the ``get_alternate_code_pattern`` variants
        when the literal code is not found), highlighted together with its
        matched common words, and the impairment keywords returned by the
        matcher are boxed. If no matcher result carried impairment
        coordinates for the page, the phrases from
        ``self.impairment_keyword_dict`` for that page are searched and boxed
        instead. The annotated PDF and a JSON summary are written under
        ``self.OUTPUT_FILES_PATH``.

        Args:
            icd10_code_dict: Container looked up by zero-based page index,
                giving the codes to search for on that page.
            pdf_file_name: Path of the PDF to annotate. The ``None`` default
                is not usable: the output names are derived from this path.
            highlight_all: When true, ``get_best_all_token_match`` and the
                ``OUT_4`` output names are used; otherwise
                ``get_best_token_match`` and ``OUT_1``.

        Returns:
            Tuple ``(pdf_output_file_name, json_data_dump_file)``.
        """
        json_data = []
        json_imp_data = []

        # Both matchers are called with identical arguments, so choose once.
        if highlight_all:
            find_matches = self.get_best_all_token_match
            output_key = "OUT_4"
        else:
            find_matches = self.get_best_token_match
            output_key = "OUT_1"

        # The document used to be left open; the context manager closes it on
        # every exit path.
        with fitz.open(pdf_file_name) as pdf_file:
            for page_num, page in enumerate(pdf_file):
                num_page = page_num + 1  # 1-based page number used in the JSON output
                already_done_page_list = []
                self.code_y2_coords_list = []
                self.already_found_word_list = []
                self.highlighted_coord_list = []
                code_highlight_dict = {}

                # highlight ICD-10 code
                if page_num in icd10_code_dict:
                    # let's store code y2 coords
                    for code in icd10_code_dict[page_num]:
                        highlight_list = page.search_for(code)
                        if len(highlight_list) > 0:
                            code_highlight_dict[code] = highlight_list
                            self.code_y2_coords_list.extend([highlight[3] for highlight in highlight_list])
                        else:
                            # literal code not found on the page: try its alternate patterns
                            for alt_code in self.get_alternate_code_pattern(code):
                                highlight_list = page.search_for(alt_code)
                                for highlight in highlight_list:
                                    if len(highlight) > 0:
                                        # each hit replaces the previous one, so only the
                                        # last alternate hit is kept for this code
                                        code_highlight_dict[code] = [highlight]
                                        self.code_y2_coords_list.extend([highlight[3]])

                    filtered_code_highlight_dict = filter_duplicate_coordinate(code_highlight_dict)
                    # now, let highlight every code and its common words
                    for code, highlight_list in filtered_code_highlight_dict.items():
                        prev_highlight = fitz.Rect(0, 0, 0, 0)
                        for highlight in highlight_list:
                            keyword = self.get_keyword(code)
                            # get match score and common words coordinate
                            all_match_keyword_list, all_match_imp_keyword_list = find_matches(
                                page_num, code, page, highlight[3], self.match_threshold)

                            # Step-1: highlight and set color coding dont have common words
                            if not all_match_keyword_list:
                                page_highlight = page.add_highlight_annot(highlight)
                                page_highlight.set_colors(stroke=[0.92, 0.59, 0.48])  # dark salmon
                                page_highlight.update()
                                # prepare json object
                                json_data.append(self.prepare_json_object(num_page, code, highlight, keyword))

                            # Step-2: highlight and set color coding that have common words
                            for match_keyword_dict in all_match_keyword_list:
                                # highlight the code once per rectangle, even with several matches
                                if not self.is_coord_equal(prev_highlight, highlight):
                                    self.highlight_code(page, highlight, match_keyword_dict["score"])
                                    prev_highlight = highlight
                                # highlight common words
                                highlight_coords_dict, _ = self.highlight_common_words(
                                    page, match_keyword_dict["common_words_coords"])
                                # prepare json object
                                json_data.append(self.prepare_json_object(
                                    num_page, code, highlight, keyword, match_keyword_dict, highlight_coords_dict))

                            # Step-3: highlight and set color coding for impairment keyword common words
                            json_imp_data_list = []
                            for match_imp_keyword_dict in all_match_imp_keyword_list:
                                if "imp_common_keyword_coords" in match_imp_keyword_dict:
                                    highlight_coords_list = self.highlight_impairment(
                                        page, match_imp_keyword_dict["imp_common_keyword_coords"])
                                    # marks the page so the key-phrase pass below is skipped
                                    already_done_page_list.append(page_num)
                                    if highlight_coords_list:
                                        json_imp_data_list.extend(highlight_coords_list)
                            if len(json_imp_data_list) > 0:
                                json_imp_data.append({
                                    f"page-{num_page}": json_imp_data_list
                                })

                # highlight ICD key phrase
                if page_num in self.impairment_keyword_dict and page_num not in already_done_page_list:
                    json_imp_data_list = []
                    key_phrase_sents = self.impairment_keyword_dict[page_num]
                    for key_phrase_sent in key_phrase_sents:
                        coordinates = page.search_for(key_phrase_sent)
                        cords_list = []
                        for inst in coordinates:
                            for key_phrase in key_phrase_sent.split():
                                if (self.is_exact_match(page, key_phrase, inst, full_match=True, case_sensitive=True)
                                        and not self.is_coord_already_highlighted(inst)):
                                    self.draw_keyword_box(page, inst)
                                    cords_list.append(inst)
                                    self.highlighted_coord_list.append(inst)
                        if len(cords_list) > 0:
                            json_imp_data_list.append(self.prepare_imp_json_object(key_phrase_sent, cords_list))
                    if len(json_imp_data_list) > 0:
                        json_imp_data.append({
                            f"page-{num_page}": json_imp_data_list
                        })

            # Name the outputs after the file name up to its first dot. The
            # former `split('/')[1]` only worked for paths with exactly one
            # directory component (it picked a directory for deeper paths and
            # raised IndexError for bare file names).
            base_name = os.path.basename(pdf_file_name).split('.')[0]

            # combine both json object and write into json file
            json_data_dump_file = f"{self.OUTPUT_FILES_PATH}/{base_name}_{cfg.json_output[output_key]}"
            with open(json_data_dump_file, "w") as json_out_file:
                json.dump({
                    "icd10_code_and_description": json_data,
                    "keyword_impairment": json_imp_data
                }, json_out_file)

            # create highlighted pdf file
            pdf_output_file_name = f"{self.OUTPUT_FILES_PATH}/{base_name}_{cfg.pdf_output[output_key]}"
            pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)

        return pdf_output_file_name, json_data_dump_file

    def highlight_icd_code_and_common_words(self, icd10_code_dict, pdf_file_name=None):
        """Highlight ICD-10 codes and the common words matched for each code.

        For each page present in ``icd10_code_dict``, every listed code is
        searched (falling back to the ``get_alternate_code_pattern`` variants
        when the literal code is not found) and highlighted along with the
        common words returned by ``get_best_token_match``. The annotated PDF
        and a JSON list describing the highlights are written under
        ``self.OUTPUT_FILES_PATH`` using the ``OUT_2`` output names.

        Args:
            icd10_code_dict: Container looked up by zero-based page index,
                giving the codes to search for on that page.
            pdf_file_name: Path of the PDF to annotate. The ``None`` default
                is not usable: the output names are derived from this path.

        Returns:
            Tuple ``(pdf_output_file_name, json_data_dump_file)``.
        """
        json_data = []

        # The document used to be left open; the context manager closes it on
        # every exit path.
        with fitz.open(pdf_file_name) as pdf_file:
            for page_num, page in enumerate(pdf_file):
                self.code_y2_coords_list = []
                self.already_found_word_list = []
                code_highlight_dict = {}

                # highlight ICD-10 code
                if page_num not in icd10_code_dict:
                    continue
                num_page = page_num + 1  # 1-based page number used in the JSON output

                # let's store code y2 coords
                for code in icd10_code_dict[page_num]:
                    highlight_list = page.search_for(code)
                    if len(highlight_list) > 0:
                        code_highlight_dict[code] = highlight_list
                        self.code_y2_coords_list.extend([highlight[3] for highlight in highlight_list])
                    else:
                        # literal code not found on the page: try its alternate patterns
                        for alt_code in self.get_alternate_code_pattern(code):
                            highlight_list = page.search_for(alt_code)
                            for highlight in highlight_list:
                                if len(highlight) > 0:
                                    # each hit replaces the previous one, so only the
                                    # last alternate hit is kept for this code
                                    code_highlight_dict[code] = [highlight]
                                    self.code_y2_coords_list.extend([highlight[3]])

                filtered_code_highlight_dict = filter_duplicate_coordinate(code_highlight_dict)
                # now, let highlight every code and its common words
                for code, highlight_list in filtered_code_highlight_dict.items():
                    prev_highlight = fitz.Rect(0, 0, 0, 0)
                    for highlight in highlight_list:
                        keyword = self.get_keyword(code)
                        # get match score and common words coordinate
                        # (the impairment matches are not used by this method)
                        all_match_keyword_list, _ = self.get_best_token_match(
                            page_num, code, page, highlight[3], self.match_threshold)

                        # Step-1: highlight and set color coding dont have common words
                        if not all_match_keyword_list:
                            page_highlight = page.add_highlight_annot(highlight)
                            page_highlight.set_colors(stroke=[0.92, 0.59, 0.48])  # dark salmon
                            page_highlight.update()
                            # prepare json object
                            json_data.append(self.prepare_json_object(num_page, code, highlight, keyword))

                        # Step-2: highlight and set color coding that have common words
                        for match_keyword_dict in all_match_keyword_list:
                            # highlight the code once per rectangle, even with several matches
                            if not self.is_coord_equal(prev_highlight, highlight):
                                self.highlight_code(page, highlight, match_keyword_dict["score"])
                                prev_highlight = highlight
                            # highlight common words
                            highlight_coords_dict, _ = self.highlight_common_words(
                                page, match_keyword_dict["common_words_coords"])
                            # prepare json object
                            json_data.append(self.prepare_json_object(
                                num_page, code, highlight, keyword, match_keyword_dict, highlight_coords_dict))

            # Name the outputs after the file name up to its first dot. The
            # former `split('/')[1]` only worked for paths with exactly one
            # directory component (it picked a directory for deeper paths and
            # raised IndexError for bare file names).
            base_name = os.path.basename(pdf_file_name).split('.')[0]

            # build and write json file
            json_data_dump_file = f"{self.OUTPUT_FILES_PATH}/{base_name}_{cfg.json_output['OUT_2']}"
            with open(json_data_dump_file, "w") as json_out_file:
                json.dump(json_data, json_out_file)

            # create highlighted pdf file
            pdf_output_file_name = f"{self.OUTPUT_FILES_PATH}/{base_name}_{cfg.pdf_output['OUT_2']}"
            pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)

        return pdf_output_file_name, json_data_dump_file

    def highlight_icd_code_and_keyword_impairment(self, icd10_code_dict, pdf_file_name=None):
        """Highlight ICD-10 codes and box impairment keywords (no common words).

        For each page, every code listed for it in ``icd10_code_dict`` is
        searched (falling back to the ``get_alternate_code_pattern`` variants
        when the literal code is not found) and highlighted using the score
        from ``get_best_all_token_match``; impairment keywords returned by the
        matcher are boxed. If no matcher result carried impairment
        coordinates for the page, the phrases from
        ``self.impairment_keyword_dict`` for that page are searched and boxed
        instead. Outputs use the ``OUT_5`` names under
        ``self.OUTPUT_FILES_PATH``.

        Args:
            icd10_code_dict: Container looked up by zero-based page index,
                giving the codes to search for on that page.
            pdf_file_name: Path of the PDF to annotate.

        Returns:
            Tuple ``(pdf_output_file_name, json_data_dump_file)``. Writing the
            outputs is best-effort: an error is printed rather than raised,
            and the path of any file that was not written is ``None``.
        """
        json_data = []
        json_imp_data = []
        # Pre-set so the final `return` can no longer raise UnboundLocalError
        # (hiding the real error) when writing fails part-way.
        pdf_output_file_name = None
        json_data_dump_file = None

        # The document used to be left open; the context manager closes it on
        # every exit path.
        with fitz.open(pdf_file_name) as pdf_file:
            for page_num, page in enumerate(pdf_file):
                num_page = page_num + 1  # 1-based page number used in the JSON output
                already_done_page_list = []
                self.code_y2_coords_list = []
                self.already_found_word_list = []
                self.highlighted_coord_list = []
                code_highlight_dict = {}

                # highlight ICD-10 code
                if page_num in icd10_code_dict:
                    # let's store code y2 coords
                    for code in icd10_code_dict[page_num]:
                        highlight_list = page.search_for(code)
                        if len(highlight_list) > 0:
                            code_highlight_dict[code] = highlight_list
                            self.code_y2_coords_list.extend([highlight[3] for highlight in highlight_list])
                        else:
                            # literal code not found on the page: try its alternate patterns
                            for alt_code in self.get_alternate_code_pattern(code):
                                highlight_list = page.search_for(alt_code)
                                for highlight in highlight_list:
                                    if len(highlight) > 0:
                                        # each hit replaces the previous one, so only the
                                        # last alternate hit is kept for this code
                                        code_highlight_dict[code] = [highlight]
                                        self.code_y2_coords_list.extend([highlight[3]])

                    filtered_code_highlight_dict = filter_duplicate_coordinate(code_highlight_dict)
                    # now, let highlight every code and its common words
                    for code, highlight_list in filtered_code_highlight_dict.items():
                        prev_highlight = fitz.Rect(0, 0, 0, 0)
                        for highlight in highlight_list:
                            # kept from the original flow although the result is unused here
                            keyword = self.get_keyword(code)

                            all_match_keyword_list, all_match_imp_keyword_list = self.get_best_all_token_match(
                                page_num, code, page, highlight[3], self.match_threshold)

                            # Step-1: highlight ICD-10 code
                            # NOTE(review): a code with an empty match list is neither
                            # highlighted nor recorded here, unlike highlight_all -- confirm intent.
                            for match_keyword_dict in all_match_keyword_list:
                                if not self.is_coord_equal(prev_highlight, highlight):
                                    self.highlight_code(page, highlight, match_keyword_dict["score"])
                                    prev_highlight = highlight
                                # prepare json object
                                json_data.append(self.prepare_code_json_object(num_page, code, highlight))

                            # Step-2: highlight and set color coding for impairment keyword common words
                            json_imp_data_list = []
                            for match_imp_keyword_dict in all_match_imp_keyword_list:
                                if "imp_common_keyword_coords" in match_imp_keyword_dict:
                                    highlight_coords_list = self.highlight_impairment(
                                        page, match_imp_keyword_dict["imp_common_keyword_coords"])
                                    # marks the page so the key-phrase pass below is skipped
                                    already_done_page_list.append(page_num)
                                    if highlight_coords_list:
                                        json_imp_data_list.extend(highlight_coords_list)
                            if len(json_imp_data_list) > 0:
                                json_imp_data.append({
                                    f"page-{num_page}": json_imp_data_list
                                })

                # highlight ICD key phrase
                if page_num in self.impairment_keyword_dict and page_num not in already_done_page_list:
                    json_imp_data_list = []
                    key_phrase_sents = self.impairment_keyword_dict[page_num]
                    for key_phrase_sent in key_phrase_sents:
                        coordinates = page.search_for(key_phrase_sent)
                        cords_list = []
                        for inst in coordinates:
                            for key_phrase in key_phrase_sent.split():
                                if (self.is_exact_match(page, key_phrase, inst, full_match=True, case_sensitive=True)
                                        and not self.is_coord_already_highlighted(inst)):
                                    self.draw_keyword_box(page, inst)
                                    cords_list.append(inst)
                                    self.highlighted_coord_list.append(inst)
                        if len(cords_list) > 0:
                            json_imp_data_list.append(self.prepare_imp_json_object(key_phrase_sent, cords_list))
                    if len(json_imp_data_list) > 0:
                        json_imp_data.append({
                            f"page-{num_page}": json_imp_data_list
                        })

            try:
                # Name the outputs after the file name up to its first dot. The
                # former `split('/')[1]` only worked for paths with exactly one
                # directory component.
                base_name = os.path.basename(pdf_file_name).split('.')[0]

                # combine both json object and write into json file
                json_path = f"{self.OUTPUT_FILES_PATH}/{base_name}_{cfg.json_output['OUT_5']}"
                with open(json_path, "w") as json_out_file:
                    json.dump({
                        "icd10_code": json_data,
                        "keyword_impairment": json_imp_data
                    }, json_out_file)
                json_data_dump_file = json_path

                # create highlighted pdf file
                pdf_path = f"{self.OUTPUT_FILES_PATH}/{base_name}_{cfg.pdf_output['OUT_5']}"
                pdf_file.save(pdf_path, garbage=4, deflate=True, clean=True)
                pdf_output_file_name = pdf_path
            except Exception as ex:
                # best-effort output: report the failure and return what was written
                print(f"ERROR: {ex}")

        return pdf_output_file_name, json_data_dump_file

    def highlight_keyword_impairment(self, pdf_file_name=None):
        """Box the impairment key phrases of every page and export them as JSON.

        For each page present in ``self.impairment_keyword_dict``, every key
        phrase is searched on the page and each hit that passes
        ``is_exact_match`` and is not already highlighted gets a keyword box.
        Outputs use the ``OUT_3`` names under ``self.OUTPUT_FILES_PATH``.

        Args:
            pdf_file_name: Path of the PDF to annotate. The ``None`` default
                is not usable: the output names are derived from this path.

        Returns:
            Tuple ``(pdf_output_file_name, json_imp_data_dump_file)``.
        """
        json_imp_data = []

        # The document used to be left open; the context manager closes it on
        # every exit path.
        with fitz.open(pdf_file_name) as pdf_file:
            for page_num, page in enumerate(pdf_file):
                self.highlighted_coord_list = []
                # highlight ICD key phrase
                if page_num not in self.impairment_keyword_dict:
                    continue
                json_imp_data_list = []
                num_page = page_num + 1  # 1-based page number used in the JSON output
                for key_phrase_sent in self.impairment_keyword_dict[page_num]:
                    coordinates = page.search_for(key_phrase_sent)
                    cords_list = []
                    for inst in coordinates:
                        for key_phrase in key_phrase_sent.split():
                            if (self.is_exact_match(page, key_phrase, inst, full_match=True, case_sensitive=True)
                                    and not self.is_coord_already_highlighted(inst)):
                                self.draw_keyword_box(page, inst)
                                cords_list.append(inst)
                                self.highlighted_coord_list.append(inst)
                    if len(cords_list) > 0:
                        json_imp_data_list.append(self.prepare_imp_json_object(key_phrase_sent, cords_list))
                if len(json_imp_data_list) > 0:
                    json_imp_data.append({
                        f"page-{num_page}": json_imp_data_list
                    })

            # Name the outputs after the file name up to its first dot. The
            # former `split('/')[1]` only worked for paths with exactly one
            # directory component (it picked a directory for deeper paths and
            # raised IndexError for bare file names).
            base_name = os.path.basename(pdf_file_name).split('.')[0]

            json_imp_data_dump_file = f"{self.OUTPUT_FILES_PATH}/{base_name}_{cfg.json_output['OUT_3']}"
            with open(json_imp_data_dump_file, "w") as json_out_file:
                json.dump(json_imp_data, json_out_file)

            # create highlighted pdf file
            pdf_output_file_name = f"{self.OUTPUT_FILES_PATH}/{base_name}_{cfg.pdf_output['OUT_3']}"
            pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)

        return pdf_output_file_name, json_imp_data_dump_file

    def highlight_code(self, page_obj, p_highlight, match_score=0):
        # highlight code if threshold is more than match threshold and common words exists
        if match_score >= self.match_threshold:
            page_highlight = page_obj.add_highlight_annot(p_highlight)
            page_highlight.set_colors(stroke=[0.66, 1, 0.07])  # light green
            page_highlight.update()
        else:
            # highlight and set color coding dont have common words
            page_highlight = page_obj.add_highlight_annot(p_highlight)
            page_highlight.set_colors(stroke=[0.92, 0.59, 0.48])  # dark salmon
            page_highlight.update()

    def highlight_alternate_code(self, page_obj, p_code):
        for alt_code in self.get_alternate_code_pattern(p_code):
            highlight_list = page_obj.search_for(alt_code)
            for highlight in highlight_list:
                if len(highlight) > 0:
                    #print(f"alternate-code:{p_code}, Coordinate: {highlight}")
                    self.code_y2_coords_list.extend([highlight[3]])
                    # highlight pdf for option pattern
                    self.highlight_code(page_obj, highlight)

    def draw_keyword_box(self, p_page, pinst):
        """Draw an outline slightly larger than a keyword hit.

        Args:
            p_page: Page object providing ``draw_rect``.
            pinst: Indexable hit whose first four items are handed to
                ``fitz.Rect`` in order, after padding.
        """
        # Padding: 2 to the left, 5 to the right, 2 below; the top edge is unchanged.
        left = pinst[0] - 2
        top = pinst[1]
        right = pinst[2] + 5
        bottom = pinst[3] + 2
        outline_color = (0.439, 0.160, 0.388)
        p_page.draw_rect(fitz.Rect(left, top, right, bottom), color=outline_color)

    def draw_keyword_box2(self, p_page, p_coordinates, p_key_phrase):
        """Draw one outline per occurrence of a key phrase.

        A single-word phrase gets one box per rectangle. For a multi-word
        phrase the rectangles are taken in consecutive groups of
        ``len(p_key_phrase.split())`` and each group gets one bounding box.

        Args:
            p_page: Page object providing ``draw_rect``.
            p_coordinates: Sliceable sequence of indexable rectangles;
                presumably one per word, ordered occurrence by occurrence --
                verify against the caller.
            p_key_phrase: The phrase the rectangles belong to.
        """
        print(f"p_key_phrase: {p_key_phrase}")
        key_phrase_len = len(p_key_phrase.split())

        # Nothing to draw. This also avoids a zero step in range() below and
        # min()/max() over an empty group for an empty phrase.
        if key_phrase_len == 0 or len(p_coordinates) == 0:
            return

        def draw_coord_rect(coordinate, p_page):
            # bounding box around all rectangles of one occurrence
            x1_min = min([coord[0] for coord in coordinate])
            y1_min = min([coord[1] for coord in coordinate])

            x2_max = max([coord[2] for coord in coordinate])
            y2_max = max([coord[3] for coord in coordinate])
            print(x1_min, y1_min, x2_max, y2_max)

            rect = fitz.Rect(x1_min - 2, y1_min, x2_max + 5, y2_max + 2)
            p_page.draw_rect(rect, color=(0.439, 0.160, 0.388))

        # draw rec for single phrase with 1 or more occurence
        if key_phrase_len == 1:
            for coord in p_coordinates:
                print(coord)
                rect = fitz.Rect(coord[0] - 2, coord[1], coord[2] + 5, coord[3] + 2)
                p_page.draw_rect(rect, color=(0.439, 0.160, 0.388))
        else:
            coord_len = len(p_coordinates)
            # draw rec for multiple phrase with 1 occurence
            if coord_len == key_phrase_len:
                draw_coord_rect(p_coordinates, p_page)
            else:  # draw rec for multiple phrase with more then 1 occurence
                for i in range(0, coord_len, key_phrase_len):
                    # Take the whole group. The former `[i], [i + 1]` pair was
                    # only right for two-word phrases and raised IndexError
                    # when the last group was incomplete.
                    coordinates = p_coordinates[i:i + key_phrase_len]
                    if len(coordinates) > 0:
                        draw_coord_rect(coordinates, p_page)

    def highlight_common_words(self, page_obj, common_words_coord_dict):
        highlight_coords_dict = {}
        highlighted_word_list = []
        prev_common_word = ""
        for common_word, common_word_coord in common_words_coord_dict.items():
            highlight = page_obj.add_highlight_annot(common_word_coord)
            highlight.update()
            if prev_common_word.lower() != common_word.lower():
                highlight_coords_dict[common_word] = common_word_coord
                highlighted_word_list.append(common_word)
                prev_common_word = common_word
        return highlight_coords_dict, highlighted_word_list

    def highlight_impairment(self, page_obj, imp_keywords_coord_dict):
        """Box impairment keyword hits and build their JSON descriptions.

        For each phrase, every candidate area is tested against each
        whitespace-separated word of the phrase; an area that passes
        ``is_exact_match`` and is not already highlighted gets a keyword box
        and is remembered in ``self.highlighted_coord_list``.

        Args:
            page_obj: Page object handed to ``is_exact_match`` and
                ``draw_keyword_box``.
            imp_keywords_coord_dict: Mapping of impairment phrase to an
                iterable of candidate areas.

        Returns:
            One ``prepare_imp_json_object`` result per phrase that had at
            least one boxed area.
        """
        json_objects = []
        for phrase, candidate_areas in imp_keywords_coord_dict.items():
            boxed_areas = []
            for area in candidate_areas:
                for word in phrase.split():
                    if not self.is_exact_match(page_obj, word, area, full_match=True, case_sensitive=True):
                        continue
                    if self.is_coord_already_highlighted(area):
                        continue
                    self.draw_keyword_box(page_obj, area)
                    self.highlighted_coord_list.append(area)
                    boxed_areas.append(area)
            if boxed_areas:
                json_objects.append(self.prepare_imp_json_object(phrase, boxed_areas))
        return json_objects

    def get_best_token_match(self, page_num, p_code, page_obj, code_y2_coord, match_threshold):
        """Match a code's keyword against nearby text and find impairment keywords elsewhere.

        The page's text blocks are split by ``self.get_paragraph`` into blocks
        whose vertical span contains ``code_y2_coord`` ("code paragraphs") and
        all other blocks. For each code paragraph that shares words with the
        code's keyword, a dict with the keys ``common_words``, ``paragraph``,
        ``score`` and ``common_words_coords`` is produced. For each remaining
        block, impairment keywords registered for ``page_num`` are searched on
        the page and their hit lists are collected.

        Args:
            page_num: key into ``self.impairment_keyword_dict``.
            p_code: code text as found on the page.
            page_obj: page object exposing ``search_for`` and ``get_text``
                (presumably a PyMuPDF page -- confirm against the caller).
            code_y2_coord: y2 coordinate of the code's bounding box.
            match_threshold: accepted but not used by this method.

        Returns:
            Tuple ``(all_match_keyword_list, all_match_imp_keyword_list)``.

        Side effects:
            Appends to ``self.already_found_word_list`` and prints
            ``code_y2_coord`` once per matching code paragraph.
        """
        all_match_keyword_list = []
        all_match_imp_keyword_list = []

        # Step 1: normalise the code text into its canonical form
        reversed_icd_code = reverse_code_pattern(p_code)
        # Step 2: fetch keyword based on code
        keyword = self.get_keyword(reversed_icd_code)
        # Step 3: split page blocks into code / non-code paragraphs
        code_paragraph_list, non_code_paragraph_list = self.get_paragraph(page_obj, code_y2_coord)
        # print(f"code_paragraph_list: {non_code_paragraph_list}")

        # Step 4: look for words shared between the keyword and each code paragraph
        for code_paragraph in code_paragraph_list:
            match_keyword_dict = {}
            common_words = self.get_common_words(keyword, code_paragraph)
            if len(common_words) > 0:
                match_keyword_dict["common_words"] = common_words
                # fuzzy score between the keyword and the cleaned paragraph
                clean_paragraph = " ".join(self.clean_text(code_paragraph))
                match_keyword_dict["paragraph"] = clean_paragraph
                # match_score = fuzz.token_set_ratio(keyword, clean_paragraph)
                # if fuzz.token_set_ratio(keyword, clean_paragraph) >= match_threshold else 0
                match_keyword_dict["score"] = fuzz.token_set_ratio(keyword, clean_paragraph)
                # Step 5: build common words coordinate dict
                common_words_coord_dict = {}
                print(f"code_y2_coord: {code_y2_coord}")
                for common_word in common_words:
                    highlight_list = page_obj.search_for(common_word)
                    #print(f"highlight_list: {highlight_list}")
                    found_coord = False
                    for highlight in highlight_list:
                        # get common word y2 coord value
                        common_word_y2_coords = highlight[3]
                        # NOTE(review): y1 is read but never used below
                        common_word_y1_coords = highlight[1]
                        # same line as the code: y2 within +/-2 units
                        if (code_y2_coord - 2) <= common_word_y2_coords <= (code_y2_coord + 2):
                          common_words_coord_dict[common_word] = highlight
                          found_coord = True
                        # until a same-line hit has been seen, accept hits within
                        # +/-20 units; later candidates overwrite earlier ones
                        if not found_coord:
                          if (code_y2_coord - 20) <= common_word_y2_coords <= (code_y2_coord + 20):
                            common_words_coord_dict[common_word] = highlight
                            # self.already_found_word_list.append(common_word)
                match_keyword_dict["common_words_coords"] = common_words_coord_dict
                all_match_keyword_list.append(match_keyword_dict)

        def get_keyword_impairment_list(p_paragraph):
            # impairment keywords of this page sharing at least one word with the paragraph
            keyword_impairments = []
            for keyword_impairment in self.impairment_keyword_dict[page_num]:
                keyword_impairment_found = self.get_common_words(keyword_impairment, p_paragraph)
                if len(keyword_impairment_found) > 0:
                    keyword_impairments.append(keyword_impairment)
            return keyword_impairments

        # Step 6: collect impairment keyword hits for the non-code paragraphs
        for non_code_paragraph in non_code_paragraph_list:
            match_imp_keyword_dict = {}
            imp_common_words_coord_dict = {}
            keyword_impairment_list = get_keyword_impairment_list(non_code_paragraph)
            for keyword_impairment in keyword_impairment_list:
                highlight_list = page_obj.search_for(keyword_impairment)
                # drop hits whose y2 equals a y2 recorded in self.code_y2_coords_list
                highlight_list = [highlight for highlight in highlight_list if highlight[3] not in self.code_y2_coords_list]
                # each impairment keyword is recorded only the first time it is seen
                if keyword_impairment not in self.already_found_word_list and len(highlight_list) > 0:
                    self.already_found_word_list.append(keyword_impairment)
                    imp_common_words_coord_dict[f"{keyword_impairment}"] = highlight_list
                    continue
            match_imp_keyword_dict["imp_common_keyword_coords"] = imp_common_words_coord_dict
            all_match_imp_keyword_list.append(match_imp_keyword_dict)

        return all_match_keyword_list, all_match_imp_keyword_list

    def get_best_all_token_match(self, page_num, p_code, page_obj, code_y2_coord, match_threshold):
        """Match a code's keyword against the whole page text, else look for impairment keywords.

        ``self.get_all_paragraph`` supplies the page text. When the text shares
        words with the code's keyword, a dict with the keys ``common_words``,
        ``paragraph``, ``score`` and ``common_words_coords`` is produced;
        otherwise impairment keywords registered for ``page_num`` are searched
        on the page and their hit lists are collected.

        Args:
            page_num: key into ``self.impairment_keyword_dict``.
            p_code: code text as found on the page.
            page_obj: page object exposing ``search_for`` and ``get_text``
                (presumably a PyMuPDF page -- confirm against the caller).
            code_y2_coord: y2 coordinate of the code's bounding box.
            match_threshold: accepted but not used by this method.

        Returns:
            Tuple ``(all_match_keyword_list, all_match_imp_keyword_list)``.

        Side effects:
            Appends to ``self.already_found_word_list``.
        """
        all_match_keyword_list = []
        all_match_imp_keyword_list = []

        # Step 1: normalise the code text into its canonical form
        reversed_icd_code = reverse_code_pattern(p_code)
        # Step 2: fetch keyword based on code
        keyword = self.get_keyword(reversed_icd_code)
        # Step 3: get all paragraph
        all_paragraph_list = self.get_all_paragraph(page_obj)
        # print(f"code_paragraph_list: {non_code_paragraph_list}")

        def get_keyword_impairment_list(p_paragraph):
          # impairment keywords of this page sharing at least one word with the paragraph
          keyword_impairments = []
          for keyword_impairment in self.impairment_keyword_dict[page_num]:
              keyword_impairment_found = self.get_common_words(keyword_impairment, p_paragraph)
              if len(keyword_impairment_found) > 0:
                  keyword_impairments.append(keyword_impairment)
          return keyword_impairments

        # Step 4: prepare all_paragraph for common words and impairment
        for all_paragraph in all_paragraph_list:
          match_keyword_dict = {}
          common_words = self.get_common_words(keyword, all_paragraph)
          if len(common_words) > 0:
              match_keyword_dict["common_words"] = common_words
              # fuzzy score between the keyword and the cleaned text
              clean_paragraph = " ".join(self.clean_text(all_paragraph))
              match_keyword_dict["paragraph"] = clean_paragraph
              # match_score = fuzz.token_set_ratio(keyword, clean_paragraph)
              # if fuzz.token_set_ratio(keyword, clean_paragraph) >= match_threshold else 0
              match_keyword_dict["score"] = fuzz.token_set_ratio(keyword, clean_paragraph)
              # Step 5: build common words coordinate dict
              common_words_coord_dict = {}
              for common_word in common_words:
                  highlight_list = page_obj.search_for(common_word)
                  found_coord = False
                  for highlight in highlight_list:
                      # get common word y2 coord value
                      common_word_y2_coords = highlight[3]
                      # same line as the code: y2 within +/-2 units
                      if (code_y2_coord - 2) <= common_word_y2_coords <= (code_y2_coord + 2):
                        common_words_coord_dict[common_word] = highlight
                        found_coord = True
                      # until a same-line hit has been seen, accept hits within
                      # +/-20 units; later candidates overwrite earlier ones
                      if not found_coord:
                        if (code_y2_coord - 20) <= common_word_y2_coords <= (code_y2_coord + 20):
                          common_words_coord_dict[common_word] = highlight
                          # self.already_found_word_list.append(common_word)
              match_keyword_dict["common_words_coords"] = common_words_coord_dict
              all_match_keyword_list.append(match_keyword_dict)
          else:
            # Step 6: no keyword overlap -- collect impairment keyword hits instead
            match_imp_keyword_dict = {}
            imp_common_words_coord_dict = {}
            keyword_impairment_list = get_keyword_impairment_list(all_paragraph)
            for keyword_impairment in keyword_impairment_list:
              # print(f"keyword_impairment11: {keyword_impairment}")
              highlight_list = page_obj.search_for(keyword_impairment)
              # each impairment keyword is recorded only the first time it is seen
              if keyword_impairment not in self.already_found_word_list and len(highlight_list) > 0:
                self.already_found_word_list.append(keyword_impairment)
                imp_common_words_coord_dict[f"{keyword_impairment}"] = highlight_list
                # print(f"keyword_impairment22: {keyword_impairment}, coords: {highlight_list}")
                continue
            match_imp_keyword_dict["imp_common_keyword_coords"] = imp_common_words_coord_dict
            all_match_imp_keyword_list.append(match_imp_keyword_dict)

        return all_match_keyword_list, all_match_imp_keyword_list

    def prepare_code_json_object(self, num_page, code, highlight):
        """Build the JSON record for a highlighted code, keyed by ``page-<num_page>``.

        ``actual_code`` is the normalised form of ``code``; ``code_coord`` is
        the string rendering of ``highlight``.
        """
        page_entry = {
          "code_type": "ICD-10",
          "actual_code": reverse_code_pattern(code),
          "code_found": code,
          "code_coord": f"{highlight}"
        }
        return {f"page-{num_page}": page_entry}

    def prepare_json_object(self, num_page, code, highlight, keyword, match_keyword_dict=None, highlight_coords_dict=None):
        """Build the full JSON record for a code and its keyword match.

        When ``match_keyword_dict`` is None the record is flagged as not found:
        ``common_keywords`` is ``'UNVALIDATED'`` and ``match_confidence`` is 0.0.
        An empty ``keyword`` yields ``code_desc`` ``"UNVALIDATED"``.
        """
        actual_code = reverse_code_pattern(code)
        keyword_found = match_keyword_dict is not None

        if keyword_found:
            # lower-cased, de-duplicated common words
            common_keywords = list({w.lower() for w in match_keyword_dict["common_words"]})
            match_confidence = match_keyword_dict['score']
        else:
            common_keywords = 'UNVALIDATED'
            match_confidence = 0.0

        keyword_coords = '' if highlight_coords_dict is None else highlight_coords_dict

        page_entry = {
          "code_type": "ICD-10",
          "actual_code": actual_code,
          "code_found": code,
          "code_coord": f"{highlight}",
          "code_desc": keyword if keyword else "UNVALIDATED",
          "common_keywords": common_keywords,
          "keyword_coords": f"{keyword_coords}",
          "keyword_found": keyword_found,
          "match_confidence": match_confidence,
          "synid": self.get_code_synid(actual_code),
          "field_target": self.get_field_target(actual_code)
        }
        return {f"page-{num_page}": page_entry}

    def prepare_imp_json_object(self, key_phrase_sent, cords_list):
      json_data_object = {
        "keywords": key_phrase_sent,
        "coordinate": f"{cords_list}",
        "synid": self.get_synid(key_phrase_sent)
      }
      return json_data_object

    def is_coord_equal(self, highlight1, highlight2):
      """Return True when the first four components of both coordinates are equal."""
      return all(highlight1[i] == highlight2[i] for i in range(4))

    def is_coord_already_highlighted(self, highlight2):
      is_coord_already_highlighted = False
      for highlight1 in self.highlighted_coord_list:
        if highlight1[0] == highlight2[0] and highlight1[1] == highlight2[1] and highlight1[2] == highlight2[2] and highlight1[3] == highlight2[3]:
          is_coord_already_highlighted = True
          break
      return is_coord_already_highlighted

    def search_icd_code(self, txt_list):
      """Run the ICD-10 entity ruler over page text files and group codes by page.

      Args:
          txt_list: paths of text files named ``page-<n>.<ext>``.

      Returns:
          Dict mapping page number ``n`` to the de-duplicated list of entity
          texts found on that page. Pages without any code are omitted.
      """
      pdf_page_vocab = {}
      for txt_file in txt_list:
        with open(txt_file, "r") as f:
          page_txt = f.read()

        # pages containing "P <n>, L<n>" references carry line numbers that
        # look like codes; entities containing "L<digits>" are dropped there
        index_page = re.search(r"(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt) is not None

        doc = self.nlp_code10(page_txt)
        if index_page:
          code_list = [ent.text for ent in doc.ents if not re.search(r"(L[0-9]+)", ent.text)]
        else:
          code_list = [ent.text for ent in doc.ents]

        if code_list:
          # derive <n> from the file name only, so any directory depth works
          file_stem = os.path.splitext(os.path.basename(txt_file))[0]
          page_number = int(file_stem.split("-")[1])
          pdf_page_vocab[page_number] = list(set(code_list))
      return pdf_page_vocab

    def get_keyword(self, p_code):
        """Return the first ``Keyword`` in ``self.code_df`` for the code, or ``""`` if none."""
        # normalise the code before the lookup
        code = reverse_code_pattern(p_code)
        matching_rows = self.code_df.loc[self.code_df["Code"] == code]
        keyword_list = list(matching_rows["Keyword"])
        if len(keyword_list) == 0:
            return ""
        return keyword_list[0]

    def get_code_synid(self, code):
      synid = "No_SynId"
      # get synid from dataset
      synid_list = list(self.code_df.loc[self.code_df["Code"] == code]["SynId"])
      if len(synid_list) > 0:
        synid = "No_SynId" if synid_list[0] is None else synid_list[0]
      return synid

    def get_field_target(self, code):
      field_target = "No_Field_Target"
      # get synid from dataset
      field_target_list = list(self.code_df.loc[self.code_df["Code"] == code]["Field_Target"])
      if len(field_target_list) > 0:
        field_target = "No_Field_Target" if field_target_list[0] is None else field_target_list[0]
      return field_target

    def get_synid(self, keyword_impairment):
      synid = "No_SynId"
      # get synid from dataset
      synid_list = list(self.synid_df.loc[self.synid_df["Short_Description"].str.lower() == keyword_impairment.lower()]["SynId"])
      if len(synid_list) > 0:
        synid = synid_list[0]
      return synid

    def get_paragraph(self, page_obj, code_y2_coord):
        """Split the page's text blocks by whether they vertically contain ``code_y2_coord``.

        Blocks whose text is empty after ``clean_text`` are dropped.

        Returns:
            Tuple ``(code_paragraph_list, non_code_paragraph_list)`` of block texts.
        """
        code_paragraph_list = []
        non_code_paragraph_list = []
        for block in page_obj.get_text("blocks", sort=False):
            block_top, block_bottom, block_text = block[1], block[3], block[4]
            contains_code = block_top <= code_y2_coord <= block_bottom
            if len(self.clean_text(block_text)) == 0:
                continue
            target = code_paragraph_list if contains_code else non_code_paragraph_list
            target.append(block_text)
        return code_paragraph_list, non_code_paragraph_list

    def get_all_paragraph(self, page_obj):
        """Return the page's plain text as a one-element list, or ``[]`` when it cleans to nothing."""
        page_text = page_obj.get_text("text", sort=False)
        if len(self.clean_text(page_text)) > 0:
          return [page_text]
        return []

    def get_common_words(self, sent1, sent2):
        """Return the tokens of both sentences that occur in the other one, ignoring case.

        Both sentences are tokenised with ``clean_text``. A token is kept when
        its lower-cased form appears among the other sentence's tokens, so
        different spellings such as ``"Heart"`` and ``"heart"`` are both
        returned. The result has no duplicates and no guaranteed order.
        """
        token_set1 = set(self.clean_text(sent1))
        token_set2 = set(self.clean_text(sent2))

        # one set intersection instead of comparing every pair of tokens twice
        shared_lower = {w.lower() for w in token_set1} & {w.lower() for w in token_set2}
        return [w for w in token_set1 | token_set2 if w.lower() in shared_lower]

    def clean_text(self, sent):
        """Tokenise ``sent`` and return its content tokens.

        Stop words (``self.stop_words``, compared in lower case) and tokens
        made only of non-word characters are removed. Hyphenated tokens are
        split on ``"-"`` and empty fragments (from leading, trailing or doubled
        hyphens) are discarded.
        """
        clean_tokens = []
        for token in word_tokenize(sent):
            if token.lower() in self.stop_words:
                continue
            # skip pure punctuation such as "," or "--"
            if not re.sub(r'\W', '', token):
                continue
            # str.find returns -1 (truthy) when absent and 0 (falsy) for a
            # leading hyphen, so membership is the correct test here
            if "-" in token:
                clean_tokens.extend(part for part in token.split("-") if part)
            else:
                clean_tokens.append(token)
        return clean_tokens

    def is_exact_match(self, page, term, clip, full_match=False, case_sensitive=False):
      """Check that the text around a search hit really contains ``term``.

      Args:
          page: page object exposing ``get_text``.
          term: literal text that was searched for.
          clip: hit rectangle, an item from ``page.search_for(term)``.
          full_match: require word boundaries around ``term``.
          case_sensitive: compare with exact case; applies to both modes.

      Returns:
          True when the first text block inside the enlarged clip contains
          ``term``; False when it does not, when ``term`` is empty, or when
          no text block is found.
      """
      if not term:
          return False

      # enlarge the clip by twice the estimated glyph size so neighbouring
      # characters are captured and word boundaries can be checked
      pad = (max(clip.height, clip.width) / len(term)) * 2
      blocks = page.get_text("blocks", clip=clip + (-pad, -pad, pad, pad), flags=0)
      if not blocks:
          return False
      validate = blocks[0][4]

      flags = 0 if case_sensitive else re.IGNORECASE
      # the term is literal text, so regex metacharacters must be escaped
      pattern = re.escape(term)
      if full_match:
          pattern = rf'\b{pattern}\b'
      return re.search(pattern, validate, flags=flags) is not None

    def is_exact_match2(self, page, term, clip):
        """Check whether the text around a hit equals one of the words of ``term``.

        The first text block inside the enlarged clip is lower-cased, stripped
        and reduced to letters, digits, whitespace and ``:`` before being
        compared with the lower-cased whitespace-separated words of ``term``.
        ``clip`` is an item from ``page.search_for(term)``.
        """
        # enlarge the clip by twice the estimated glyph size
        pad = (max(clip.height, clip.width) / len(term)) * 2
        text_block = page.get_text("blocks", clip=clip + (-pad, -pad, pad, pad), flags=0)[0][4]
        normalized_block = re.sub(r"[^a-zA-Z\d\s:]", "", text_block.lower().strip())
        term_words = [t.lower() for t in term.split()]
        return normalized_block in term_words

    def get_alternate_code_pattern(self, p_code):
        """Return alternative spellings of a dotted code to search for on a page.

        For a code ``<head>.<tail>`` the result holds variants where the dot is
        surrounded by spaces or replaced by a space or comma, followed by
        variants with look-alike characters swapped (``Z``/``2``, ``B``/``8``,
        ``O``/``0``, ``S``/``5``, ``l``/``1``, ``G``/``6``, ``o``/``9``,
        ``i``/``1``). Codes without a dot yield an empty list. Only the first
        two dot-separated parts are used.
        """
        code_arr = p_code.split(".")
        if len(code_arr) < 2:
            return []

        head, tail = code_arr[0], code_arr[1]
        separators = [". ", " .", " . ", " ", ",", ", ", " ,", " , "]
        code_patterns = [f"{head}{separator}{tail}" for separator in separators]

        alphabats = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6", "o": "9", "i": "1"}
        for key, val in alphabats.items():
            # leading letter read as its look-alike digit: swap the first character only
            if p_code.startswith(key):
                code_patterns.append(val + p_code[1:])
            # first occurrence of the digit at index 1: try the look-alike letter there
            if p_code.find(val) == 1:
                code6 = p_code[:1] + key + p_code[2:]
                code_patterns.append(code6)
                # same digit at index 2 as well: swap both positions
                if p_code[2:3] == val:
                    code_patterns.append(code6[:2] + key + code6[3:])
        return code_patterns


def find_nearest(array, value):
    """Return the element of ``array`` closest to ``value`` (first one on ties)."""
    candidates = np.asarray(array)
    distances = np.abs(candidates - value)
    return candidates[distances.argmin()]


def reverse_code_pattern(p_code):
    """Normalise a code as found on a page into ``<head>.<tail>`` form.

    The separator between the two parts may be a space, a dot or a comma,
    optionally padded with spaces. A three-part comma split keeps the first
    and last parts. Afterwards look-alike characters are corrected: a leading
    digit becomes its look-alike letter, and a look-alike letter at index 1
    (and then index 2) becomes the digit.
    """
    orig_code = p_code

    # code contains space(" ")
    parts = p_code.split(" ")
    if len(parts) > 1:
        orig_code = f"{parts[0].strip()}.{parts[1].strip()}"

    # code contains dot("."); overrides the space result
    parts = p_code.split(".")
    if len(parts) > 1:
        orig_code = f"{parts[0].strip()}.{parts[1].strip()}"

    # code contains comma(",")
    parts = p_code.split(",")
    if len(parts) == 2:
        orig_code = f"{parts[0].strip()}.{parts[1].strip()}"
    elif len(parts) == 3:
        # this branch previously repeated the ``== 2`` test and never ran
        orig_code = f"{parts[0].strip()}.{parts[2].strip()}"

    # handle look-alike characters
    alphabats = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6", "o": "9", "i": "1"}
    for key, val in alphabats.items():
        # digit at index 0 -> letter
        if orig_code.find(val) == 0:
            orig_code = key + orig_code[1:]
        # letter at index 1 -> digit
        if orig_code.find(key) == 1:
            orig_code = orig_code[:1] + val + orig_code[2:]
            # letter at index 2 -> digit
            if orig_code.find(key) == 2:
                orig_code = orig_code[:2] + val + orig_code[3:]
            break

    return orig_code


def replacer(s, newstring, index, nofail=False):
    """Replace the character of ``s`` at ``index`` with ``newstring``.

    With ``nofail=False`` an index outside the string raises ``ValueError``.
    With ``nofail=True`` a negative index prepends ``newstring`` and an index
    past the end appends it.
    """
    in_bounds = index in range(len(s))
    if not (nofail or in_bounds):
        raise ValueError("index outside given string")

    if index < 0:
        # before the start: prepend
        return newstring + s
    if index > len(s):
        # past the end: append
        return s + newstring

    prefix, suffix = s[:index], s[index + 1:]
    return prefix + newstring + suffix


def create_directory(dir_name):
    """Create ``dir_name`` (including parents) unless the path already exists."""
    if os.path.exists(dir_name):
        return
    os.makedirs(dir_name)

## Sentence Extractor

In [3]:
class SentenceExtractor:
  """Runs the keyword matcher over page text files and pulls out matching lines."""

  def __init__(self):
    # upper bound for the thread pool used by get_wrong_keyword_dict
    self.MAX_WORKERS = 20
    self.keyword_matcher = KeywordMatcher()

  def get_json_array_list(self, text_path):
    """Return the keyword matcher's result for ``text_path``, or None on failure.

    Best effort: any exception is printed and swallowed so one bad page does
    not abort the whole document.
    """
    json_arr = None
    try:
      json_arr = self.keyword_matcher.call(text_path)
    except Exception as err:
      print(f"Error for file[{text_path}] is:\n{err}")
    return json_arr

  def get_wrong_keyword_dict(self, text_files_list, with_thread=False, with_process=False):
      """Map each file's position in ``text_files_list`` to a set of matched keywords.

      Files whose matching failed (result None) map to an empty set.

      NOTE(review): the threaded and process paths collect the first *value*
      of every result element while the serial path collects the first *key*;
      this is kept from the original code -- confirm which one is intended.
      """
      def collect(p_json_arr_list, use_values):
          collected = {}
          for idx, json_arr in enumerate(p_json_arr_list):
              # a failed page yields None; treat it as "no keywords"
              elements = json_arr or []
              if use_values:
                  collected[idx] = {list(element.values())[0] for element in elements}
              else:
                  collected[idx] = {list(element.keys())[0] for element in elements}
          return collected

      if with_thread:
          # no more threads than files, but at least one (the pool rejects 0)
          workers = max(1, min(self.MAX_WORKERS, len(text_files_list)))
          with futures.ThreadPoolExecutor(max_workers=workers) as executor:
              json_arr_list = list(executor.map(self.get_json_array_list, text_files_list))
          return collect(json_arr_list, use_values=True)
      if with_process:
          with futures.ProcessPoolExecutor(max_workers=4) as executor:
              json_arr_list = list(executor.map(self.get_json_array_list, text_files_list))
          return collect(json_arr_list, use_values=True)

      json_arr_list = [self.get_json_array_list(text_path) for text_path in text_files_list]
      return collect(json_arr_list, use_values=False)

  def extract_sentence(self, wrong_keyword_list, sample_text_list):
      """Return, per file index, the lines that contain each key phrase.

      Args:
          wrong_keyword_list: dict mapping an index into ``sample_text_list``
              to a collection of key phrases.
          sample_text_list: text file paths.

      Returns:
          Dict ``{index: {key_phrase: [line, ...]}}``; phrases without a hit
          are omitted. Only newline-terminated lines are matched.
      """
      match_keyword_dict = {}
      for key, keyword_set in wrong_keyword_list.items():
          match_dicts = {}
          if keyword_set:
              # read the page once instead of once per key phrase
              with open(sample_text_list[key], "r") as f:
                  file_txt = f.read()
              for key_phrase in keyword_set:
                  # key phrases are literal text; escape regex metacharacters
                  pattern = rf"([^\n]*{re.escape(key_phrase)}[^\n]*\n)"
                  match_list = re.findall(pattern, file_txt)
                  if match_list:
                      match_dicts[key_phrase] = [match.replace("\n", "") for match in match_list]
          match_keyword_dict[key] = match_dicts
      return match_keyword_dict

## Utility

In [4]:
def get_fifth_value(row):
    # Sort key: returns the element at index 5 of ``row``. Despite the name this
    # is the sixth item; in the tuples built by filter_duplicate_coordinate that
    # slot holds ``diffXvals``.
    return row[5]


def get_second_value(row):
    # Sort key: returns the element at index 2 of ``row``. Despite the name this
    # is the third item; in the tuples built by filter_duplicate_coordinate that
    # slot holds ``highlight[1]``.
    return row[2]

def check_append(out_dict, item):
    """Append ``fitz.Rect(x1, y1, x2, y2)`` from ``item`` to ``out_dict[key]``.

    ``item`` is ``(key, x1, y1, x2, y2, ...)``; trailing elements are ignored.
    The list for ``key`` is created on first use.
    """
    key, x1, y1, x2, y2, *_ = item
    rect = fitz.Rect(x1, y1, x2, y2)
    out_dict.setdefault(key, []).append(rect)


def consolidate_and_change_out_format(final_out):
    """Group rows by key into ``{key: [fitz.Rect, ...]}``.

    Each entry of ``final_out`` is either a single row or a list/array of
    rows; every row is forwarded to ``check_append``.
    """
    consolidated = {}
    for entry in final_out:
        rows = entry if isinstance(entry, (list, array)) else [entry]
        for row in rows:
            check_append(consolidated, row)
    return consolidated


def remove_duplicates(sorted_array):
    """Reduce rows sharing the same ``row[1]`` and return ``{key: [fitz.Rect, ...]}``.

    Rows are tuples ``(key, v1, v2, v3, v4, v5)`` as built by
    ``filter_duplicate_coordinate``. Rows are grouped by ``row[1]``; a group
    with a single row is kept as is. For larger groups every pair of rows is
    compared on ``row[2]`` and one final row per group is appended. The
    collected rows are converted by ``consolidate_and_change_out_format``.
    """
    data = {}
    final_output = []
    # group the rows by row[1]
    for row in sorted_array:
        if row[1] not in data:
            data[row[1]] = [row]
        # If row[1] is already in the dictionary, append the row to the existing value
        else:
            data[row[1]].append(row)

    # print (*sorted_rows,sep='\n')
    # Iterate over the groups; groups with more than one row need reducing
    for key, value in data.items():
        if len(value) > 1:
            y_values = sorted(value, key=get_second_value)
            if len(y_values) > 1:
                sorted_value = y_values
                for i in range(len(y_values)):
                    for j in range(i + 1, len(y_values)):
                        if y_values[i][2] != y_values[j][2]:
                            # NOTE(review): the whole group is appended once per
                            # differing pair, so rows can be emitted repeatedly
                            # -- confirm this is intended
                            final_output.append(y_values)
                            # print(y_values)
                        elif y_values[i][2] == y_values[j][2]:
                            # same row[2]: re-order the group by row[5]
                            sorted_value = sorted(value, key=get_fifth_value)
                # keep the last row of the (possibly re-ordered) group
                last_element = sorted_value[-1]
                final_output.append(last_element)
                # print ('[',last_element,']')
        else:
            # single-row group: appended as a one-element list
            final_output.append(value)
    return consolidate_and_change_out_format(final_output)


def filter_duplicate_coordinate(code_dict):
    """Flatten ``{key: [highlight, ...]}`` into rows and de-duplicate them.

    Each highlight becomes ``(key, h[0], h[1], h[2], h[3], h[2] - h[0])`` with
    the last value computed on floats. The rows are passed to
    ``remove_duplicates``, whose result is returned.
    """
    rows = []
    for key, highlights in code_dict.items():
        for highlight in highlights:
            diff_x_vals = float(highlight[2]) - float(highlight[0])
            # named ``row`` rather than ``str`` to avoid shadowing the builtin
            row = (key, highlight[0], highlight[1], highlight[2], highlight[3], diff_x_vals)
            rows.append(row)

    return remove_duplicates(rows)

## Keyword Matching & Highlighting

In [5]:
!rm -rf input_files
!mkdir -p input_files

In [6]:
!rm -rf outputs
!mkdir -p outputs
#!rm -rf output*.zip

In [7]:
def purge(file_path):
  """Delete every file matching the glob pattern ``file_path``."""
  matching_paths = glob.glob(file_path)
  for matching_path in matching_paths:
    os.remove(matching_path)

In [8]:
# Step-0: create the highlighter and sentence extractor instances
# directory scanned for input PDF files by the processing loop
INPUT_PDF_FILES_PATH = "input_files"

# both objects are created once and reused for every PDF
highlighter = Highlighter(match_threshold=35)
sent_extractor = SentenceExtractor()

```log
CPU times: user 1min 7s, sys: 6.54 s, total: 1min 14s
Wall time: 1min 15s
```

In [9]:
# Selects the highlighting mode used by the processing loop:
#   1 = codes, descriptions and keyword impairments (highlight_all)
#   2 = codes and descriptions only
#   3 = keyword impairments only
#   4 = codes, descriptions and all keyword impairments (highlight_all=True)
#   5 = codes and all keyword impairments
output_type = 5

In [10]:
%%time
# Results of the last processed PDF, kept for inspection in later cells
wrong_keyword_dict1 = None
icd10_code_dict1 = None
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"

  # Step-1: splitting pdf file
  print("Step-1: Splitting pdf file............")
  pdf_list = highlighter.split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  print("Step-2: Extracting text from pdf............")
  txt_list = highlighter.extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  print("Step-3: Searching ICD-10 code into text file..........")
  icd10_code_dict = highlighter.search_icd_code(txt_list)
  icd10_code_dict1 = icd10_code_dict

  # Step-4: Get closest match of ICD-10 keyword
  print("Step-4: Get closet match of ICD-10 keyword..........")
  wrong_keyword_dict = sent_extractor.get_wrong_keyword_dict(txt_list)
  wrong_keyword_dict1 = wrong_keyword_dict
  highlighter.set_impairment_keyword_dict(wrong_keyword_dict)

  # Step-5: highlight according to the selected output type
  if output_type == 1:
    # ICD-10 code, its description and keyword impairment
    print("Step-5: Highlighting ICD-10 code, its description and keyword impairment into PDF file............")
    pdf_output_file, json_code_output_file = highlighter.highlight_all(icd10_code_dict, pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
    print(f"Highlighted code and impairment coordinates are saved into [{json_code_output_file}] file.")
  elif output_type == 2:
    # ICD-10 code and its description
    print("Step-5: Highlighting ICD-10 code and its description into pdf............")
    pdf_output_file, json_output_file = highlighter.highlight_icd_code_and_common_words(icd10_code_dict, pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
    print(f"Highlighted coordinates are saved into [{json_output_file}] file.")
  elif output_type == 3:
    # keyword impairment only
    print("Step-5: Highlighting keyword impairment into pdf............")
    pdf_output_file, json_output_file = highlighter.highlight_keyword_impairment(pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting keyword impairment.")
    print(f"Highlighted coordinates are saved into [{json_output_file}] file.")
  elif output_type == 4:
    # ICD-10 code, its description and all keyword impairment
    print("Step-5: Highlighting ICD-10 code, its description and all keyword impairment into pdf............")
    pdf_output_file, json_code_output_file = highlighter.highlight_all(icd10_code_dict, pdf_file_name=pdf_file_name, highlight_all=True)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
    print(f"Highlighted code and impairment coordinates are saved into [{json_code_output_file}] file.")
  elif output_type == 5:
    # ICD-10 code and all keyword impairment
    print("Step-5: Highlighting ICD-10 code and all keyword impairment into pdf............")
    pdf_output_file, json_code_output_file = highlighter.highlight_icd_code_and_keyword_impairment(icd10_code_dict, pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code and all keyword impairment ")
    print(f"Highlighted code and impairment coordinates are saved into [{json_code_output_file}] file.")
  else:
    # five output types are handled above, not three
    print("Please pass a value from 1 to 5 for output type.")

  # remove the per-page pdf and text files before the next document
  purge("pdf-files/*.pdf")
  purge("txt-files/*.txt")
  pdf_list = []
  txt_list = []

Step-1: Splitting pdf file............
Step-2: Extracting text from pdf............
Step-3: Searching ICD-10 code into text file..........
Step-4: Get closet match of ICD-10 keyword..........
Step-5: Highlighting ICD-10 code and all keyword impairment into pdf............
File[outputs/RP10503341_20230418090748_X_Redacted_codes_keywords.pdf] is saved after highlighting ICD-10 code and all keyword impairment 
Highlighted code and impairment coordinates are saved into [outputs/RP10503341_20230418090748_X_Redacted_codes_keywords.json] file.
CPU times: user 1min 7s, sys: 783 ms, total: 1min 8s
Wall time: 1min 10s


In [None]:
purge("pdf-files/*.pdf")
purge("txt-files/*.txt")

In [11]:
!zip output5.zip outputs/*.*

  adding: outputs/RP10503341_20230418090748_X_Redacted_codes_keywords.json (deflated 81%)
  adding: outputs/RP10503341_20230418090748_X_Redacted_codes_keywords.pdf (deflated 2%)


In [None]:
icd10_code_dict1

In [None]:
wrong_keyword_dict1[6]

In [None]:
wrong_keyword_dict1[60]

In [None]:
wrong_keyword_dict1[0]

## ICD 10 Code

In [None]:
!mkdir -p pdf-files
!mkdir -p txt-files

In [None]:
# Working directories for the per-page files; they must already exist.
# pdf_files_path receives the single-page PDFs written by split_pdf,
# txt_files_path receives the text files written by extract_text_from_pdf.
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"

# Blank English pipeline; the entity ruler with the code patterns is added later
nlp = English()


def split_pdf(pdf_path):
  """Write every page of ``pdf_path`` to ``<pdf_files_path>/page-<n>.pdf``.

  Returns:
      List of the written file names (without directory), in page order.
  """
  pdf_list = []
  # the context manager closes the input file, which was previously leaked
  with open(pdf_path, "rb") as pdf_in_file:
    # parse the document once instead of once per page
    reader = PdfReader(pdf_in_file)
    for page_index, page in enumerate(reader.pages):
      writer = PdfWriter()
      writer.add_page(page)
      page_file_name = f"page-{page_index}.pdf"
      with open(f"{pdf_files_path}/{page_file_name}", "wb") as output_stream:
        writer.write(output_stream)
      pdf_list.append(page_file_name)
  return pdf_list


def extract_text_from_pdf(pdf_list):
    """Extract the text of each single-page PDF into ``<txt_files_path>/page-<i>.txt``.

    Args:
        pdf_list: PDF file names inside ``pdf_files_path``; ``i`` is the
            position of the file in this list.

    Returns:
        List of the written text file paths, in input order.
    """
    txt_file_list = []
    for i, pdf_file in enumerate(pdf_list):
        with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
            pdf = pdftotext.PDF(f)

        # Read all the text into one string
        pdf_text = "\n\n".join(pdf)

        # "w" instead of "a": a re-run must replace the page text, not
        # append a second copy to a file left over from a previous run
        txt_file = f"{txt_files_path}/page-{str(i)}.txt"
        with open(txt_file, "w") as f:
            f.write(pdf_text)
        txt_file_list.append(txt_file)
    return txt_file_list


def get_opt_pattern(icd_10_code):
  """Return alternative spellings of a code with spaces around the dot.

  For ``<head>.<tail>`` the variants ``"<head>. <tail>"``, ``"<head> .<tail>"``
  and ``"<head> . <tail>"`` are returned. A code without a dot is returned
  as a one-element list, so callers can always iterate over whole patterns
  rather than over the characters of a string.
  """
  code_arr = icd_10_code.split(".")
  if len(code_arr) < 2:
    return [icd_10_code]
  head, tail = code_arr[0], code_arr[1]
  return [f"{head}. {tail}", f"{head} .{tail}", f"{head} . {tail}"]


def highlight_icd10_code(pdf_page_dict: dict, pdf_file_name: str):
    """Highlight the given codes in a PDF and save it next to the input.

    Args:
        pdf_page_dict: maps a zero-based page number to the codes to highlight.
        pdf_file_name: path of the PDF to annotate.

    Returns:
        Path of the saved copy: the input path with ``_output`` inserted
        before the extension.
    """
    pdf_file = fitz.open(pdf_file_name)

    def highlight_hits(page, hits):
        # add one highlight annotation per hit rectangle
        for hit in hits:
            annot = page.add_highlight_annot(hit)
            annot.update()

    for page_num, page in enumerate(pdf_file):
        for code in pdf_page_dict.get(page_num, []):
            hits = page.search_for(code)
            print(f"Page-{page_num}: ", code, hits, end='\n')
            if len(hits) > 0:
                highlight_hits(page, hits)
                continue

            # exact spelling not found: try the alternate spellings, each
            # highlighted exactly once
            alternate_code_list = get_opt_pattern(code)
            if isinstance(alternate_code_list, str):
                # a bare string would otherwise be iterated character by character
                alternate_code_list = [alternate_code_list]
            for alt_code in alternate_code_list:
                alt_hits = page.search_for(alt_code)
                print(f"Page-{page_num}: ", alt_code, alt_hits, end='\n')
                highlight_hits(page, alt_hits)

    # splitext keeps directory names and dots inside the path intact
    # (e.g. "./28page.pdf" -> "./28page_output.pdf")
    root, _ = os.path.splitext(pdf_file_name)
    output_pdf_file_name = f"{root}_output.pdf"
    pdf_file.save(output_pdf_file_name, garbage=4, deflate=True, clean=True)
    pdf_file.close()
    return output_pdf_file_name


def search_icd_10_code(txt_list):
    """Collect the entities found on each page text file.

    Each file is read and run through the module-level ``nlp`` pipeline; the
    texts of ``doc.ents`` are stored under the page number parsed from the
    file name.

    Args:
        txt_list: Paths of per-page text files whose base name has the form
            ``page-<number>.txt``.

    Returns:
        Dict mapping page number (int) to the list of entity texts found on
        that page. Pages with no entities, and pages matching the line-number
        pattern below, are omitted.
    """
    # Pages containing a "P <n>, L<n>" reference carry line numbers instead of
    # codes and are skipped. Raw string avoids the invalid "\s" escape.
    line_ref_pattern = re.compile(r"(P[ ][0-9]+)(,\s)(L[0-9]+)")

    pdf_page_vocab = {}
    for txt_file in txt_list:
        with open(txt_file, "r") as f:
            page_txt = f.read()

        if line_ref_pattern.search(page_txt):
            continue

        code_list = [ent.text for ent in nlp(page_txt).ents]
        if code_list:
            # Parse the number from the base name so any directory depth works
            # (e.g. "./txt/page-3.txt"), not just "dir/page-3.txt".
            base_name = os.path.basename(txt_file)
            page_number = int(base_name.split(".")[0].split("-")[1])
            pdf_page_vocab[page_number] = code_list
    return pdf_page_vocab

In [None]:
# End-to-end run: split the PDF, extract text per page, find the codes with
# the spaCy entity ruler, then highlight them in a copy of the PDF.

# Step-1: split the input PDF (split_pdf is defined earlier in the notebook)
pdf_file_name = "28page.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: extract the text of each split PDF into a text file
txt_list = extract_text_from_pdf(pdf_list)

# Step-3: add an entity ruler to the spaCy pipeline and load its patterns
# NOTE(review): re-running this cell on the same `nlp` object presumably fails
# because the "entity_ruler" component already exists — confirm.
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v1.jsonl")

# Step-4: search every page text for ICD-10 codes -> {page number: [codes]}
#print (txt_list)
pdf_page_vocab = search_icd_10_code(txt_list)

# Step-5: highlight the found codes and save the annotated PDF
output_file_name = highlight_icd10_code(pdf_page_vocab, pdf_file_name)
print(f"File[{output_file_name}] is saved after highlighting ICD-10 code")

##Phrase matching

In [None]:
import spacy
from spacy.matcher import PhraseMatcher

In [None]:
# Load the keyword table from the CSV path configured under "IMP_CSV" and
# preview its first rows.
synid_df = pd.read_csv(cfg.csv_files["IMP_CSV"])
synid_df.head()

Unnamed: 0,SynId,Short_Description
0,KW10356,0157:H7
1,KW12576,17q11.2 mutation
2,KW9796,21-Hydroxylase Deficiency
3,KW12084,3-Day Measles
4,KW11051,47 XXY Chromosome Defect


In [None]:
def make_icd_10_keyword_pattern(synid_df):
    """Return the keyword phrases stored in the ``Short_Description`` column.

    Args:
        synid_df: DataFrame with a ``Short_Description`` column.

    Returns:
        List of the column's values in row order. Missing values (NaN/None)
        are dropped, because the result is fed to ``nlp.tokenizer.pipe``,
        which needs strings.

    Raises:
        KeyError: If the ``Short_Description`` column is absent.
    """
    # Column access replaces a row-by-row iterrows() copy of a single column.
    return synid_df["Short_Description"].dropna().tolist()

In [None]:
# Build the keyword list from the table and preview the first ten entries.
keywords = make_icd_10_keyword_pattern(synid_df)
keywords[:10]

['0157:H7',
 '17q11.2 mutation',
 '21-Hydroxylase Deficiency',
 '3-Day Measles',
 '47 XXY Chromosome Defect',
 '5-Aminolevulinic Acid Dehydratase Porphyria',
 'A. cantonensis',
 'AA',
 'Aaortic Incompetence',
 'Aarskog Syndrome']

In [None]:
# Create a PhraseMatcher on the pipeline's vocabulary.
phrase_matcher = PhraseMatcher(nlp.vocab)
# Run only the tokenizer over the keywords to get the pattern docs.
patterns = list(nlp.tokenizer.pipe(keywords))
# Register every pattern under the single key 'keywords'.
phrase_matcher.add('keywords', patterns)

In [None]:
# Run the phrase matcher over the text file page-23.txt and collect the
# matched phrases in keyword_list.
with open("page-23.txt", "r") as f:
  page_txt = f.read()
  # NOTE: the line-number filter below is commented out, so the page is
  # always processed.
  #if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
  doc = nlp(page_txt)
  matches = phrase_matcher(doc)

  # Each match is (match_id, start, end); start/end slice `doc`.
  keyword_list = []
  for match_id, start, end in matches:
    span = doc[start: end]
    # Store the matched span as a plain string.
    keyword_list.append(f"{span}")

In [None]:
# Display the phrases collected by the previous cell.
keyword_list

['MD',
 'Hypertriglyceridemia',
 'Hypertension',
 'Atrial Fibrillation',
 'MD',
 'Headache',
 'MD']

In [None]:
# Run the phrase matcher over the text file page-25.txt and collect the
# matched phrases in keyword_list.
with open("page-25.txt", "r") as f:
  page_txt = f.read()
  # NOTE: the line-number filter below is commented out, so the page is
  # always processed.
  #if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
  doc = nlp(page_txt)
  matches = phrase_matcher(doc)

  # Each match is (match_id, start, end); start/end slice `doc`.
  keyword_list = []
  for match_id, start, end in matches:
    span = doc[start: end]
    # Store the matched span as a plain string.
    keyword_list.append(f"{span}")

In [None]:
# Display the phrases collected by the previous cell.
keyword_list

['Hypertriglyceridemia',
 'Hypertriglyceridemia',
 'EKG',
 'Hypertension',
 'Atrial Fibrillation',
 'Headache']

In [None]:
# Run the phrase matcher over the text file page-36.txt and collect the
# matched phrases in keyword_list.
with open("page-36.txt", "r") as f:
  page_txt = f.read()
  # NOTE: the line-number filter below is commented out, so the page is
  # always processed.
  #if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
  doc = nlp(page_txt)
  matches = phrase_matcher(doc)

  # Each match is (match_id, start, end); start/end slice `doc`.
  keyword_list = []
  for match_id, start, end in matches:
    span = doc[start: end]
    # Store the matched span as a plain string.
    keyword_list.append(f"{span}")

In [None]:
# Display the phrases collected by the previous cell.
keyword_list

['Hypertriglyceridemia',
 'Hypertriglyceridemia',
 'EKG',
 'Hypertension',
 'Atrial Fibrillation',
 'Diplopia',
 'Headache',
 'Visual Loss',
 'Gynecomastia',
 'Dysphagia',
 'Hematuria']

In [None]:
# Reload the keyword table, this time from a CSV in the working directory
# rather than the configured path, and rebuild the keyword list.
# NOTE(review): presumably an updated copy of the keyword file — confirm.
synid_df = pd.read_csv("synid_and_keywords_impairment.csv")
keywords = make_icd_10_keyword_pattern(synid_df)

In [None]:
# Rebuild the PhraseMatcher from scratch so it uses the reloaded keywords.
phrase_matcher = PhraseMatcher(nlp.vocab)
# Run only the tokenizer over the keywords to get the pattern docs.
patterns = list(nlp.tokenizer.pipe(keywords))
# Register every pattern under the single key 'keywords'.
phrase_matcher.add('keywords', patterns)

In [None]:
# Re-run the phrase matcher over page-36.txt with the rebuilt matcher and
# collect the matched phrases in keyword_list.
with open("page-36.txt", "r") as f:
  page_txt = f.read()
  # NOTE: the line-number filter below is commented out, so the page is
  # always processed.
  #if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
  doc = nlp(page_txt)
  matches = phrase_matcher(doc)

  # Each match is (match_id, start, end); start/end slice `doc`.
  keyword_list = []
  for match_id, start, end in matches:
    span = doc[start: end]
    # Store the matched span as a plain string.
    keyword_list.append(f"{span}")

In [None]:
# Display the phrases collected by the previous cell.
keyword_list

['Hypertriglyceridemia',
 'Hypertriglyceridemia',
 'EKG',
 'Hypertension',
 'Atrial Fibrillation',
 'Diplopia',
 'Headache',
 'Visual Loss',
 'Gynecomastia',
 'Dysphagia',
 'Hematuria',
 'Headaches']