<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/icd10-code-highlighting/13_3_icd_10_9_code_and_keyword_impairment_highlighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

#!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install more-itertools
!pip install PyPDF2
#!pip install fitz
!pip install pymupdf==1.22.0
!pip install fuzzywuzzy
#!pip install pikepdf

In [None]:
import fitz
print(fitz.__doc__)


PyMuPDF 1.22.0: Python bindings for the MuPDF 1.22.0 library.
Version date: 2023-04-14 00:00:01.
Built for Python 3.10 on linux (64-bit).



In [None]:
#!pip install pikepdf
#!apt install ocrmypdf

In [None]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')

In [None]:
!python --version

Python 3.10.12


In [None]:
!rm -rf data
!unzip data.zip

Archive:  data.zip
   creating: data/csv_files/
  inflating: data/csv_files/icd_10_code_and_keywords_v2.csv  
  inflating: data/csv_files/icd_10_code_and_keywords_v3.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v1.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v2.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v3.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v4.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v5.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v6.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v7.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v8.csv  
  inflating: data/csv_files/synid_and_keywords_impairment_v9.csv  
   creating: data/pattern_files/
  inflating: data/pattern_files/date_regex-v1.json  
  inflating: data/pattern_files/generic_matcher_config-v0.json  
  inflating: data/pattern_files/icd10_code_patterns-v6.jsonl  
   creating

## Imports

In [None]:
import re
import os
import shutil
import glob
import json
import logging
import more_itertools
from pathlib import Path

import numpy as np
import pandas as pd

import fitz

from spacy.lang.en import English

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz

from dataclasses import dataclass
from array import *

from concurrent import futures
from keyword_matcher import KeywordMatcher
from sentence_extractor import SentenceExtractor
from generic_matcher import GenericMatcher
import config as cfg
from utils import *



## Core Classes

In [None]:
(238/255, 210/255, 2/255)

(0.9333333333333333, 0.8235294117647058, 0.00784313725490196)

In [None]:
class Highlighter:
    def __init__(self, match_threshold=30):
        # loading and updating patterns for ICD-10 code
        self.nlp_code10 = English()
        self.nlp_code10.add_pipe("entity_ruler").from_disk(cfg.pattern_files["jsonl"])

        self.match_threshold = match_threshold
        self.text_list = None

        # define icd-10 code dataset
        core_df = pd.read_csv(cfg.csv_files["CODE_CSV"])
        self.code_df = core_df.where(pd.notnull(core_df), None)
        self.synid_df = pd.read_csv(cfg.csv_files["IMP_CSV"])

        # define required directory path
        self.PDF_FILES_PATH = cfg.file_path["TMP_PDF_FILES_PATH"]
        self.TXT_FILES_PATH = cfg.file_path["TMP_TXT_FILES_PATH"]
        self.OUTPUT_FILES_PATH = cfg.file_path["OUTPUT_PATH"]

        self.logger = logging.getLogger(__name__)

    def set_impairment_keyword_dict(self, imp_keyword_dict):
        self.impairment_keyword_dict = imp_keyword_dict

    def set_page_matched_date_dict(self, matched_date_dict):
        self.page_matched_date_dict = matched_date_dict

    def set_match_threshold(self, match_val):
        self.match_threshold = match_val

    def set_text_list(self, p_text_list):
        self.text_list = p_text_list

    def highlight_all(self, icd10_code_dict, pdf_file_name=None, highlight_all=False):
        pdf_file = fitz.open(pdf_file_name)
        json_data = []
        json_imp_data = []
        json_date_data = []

        for page_num, page in enumerate(pdf_file):
            already_done_page_list = []
            self.code_y2_coords_list = []
            self.already_found_word_list = []
            self.highlighted_coord_list = []
            code_highlight_dict = {}

            # highlight ICD-10 code
            if page_num in icd10_code_dict:
                num_page = page_num + 1

                # let's store code y2 coords
                for code in icd10_code_dict[page_num]:
                    highlight_list = page.search_for(code)
                    #print(f"Page-{num_page}-{code}, Coordinate: {highlight_list}")
                    if len(highlight_list) > 0:
                        code_highlight_dict[code] = highlight_list
                        self.code_y2_coords_list.extend([highlight[3] for highlight in highlight_list])
                    else:
                        for alt_code in get_alternate_code_pattern(code):
                            highlight_list = page.search_for(alt_code)
                            for highlight in highlight_list:
                                if len(highlight) > 0:
                                    code_highlight_dict[code] = [highlight]
                                    # print(f"alternate-code:{code}, Coordinate: {highlight}")
                                    self.code_y2_coords_list.extend([highlight[3]])
                                    # print(f"[highlight[3]]: {[highlight[3]]}")

                filtered_code_highlight_dict = filter_duplicate_coordinate(code_highlight_dict)
                # now, let highlight every code and its common words
                for code, highlight_list in filtered_code_highlight_dict.items():
                    prev_highlight = fitz.Rect(0, 0, 0, 0)
                    for highlight in highlight_list:
                        keyword = get_keyword(code, self.code_df)
                        # get match score and common words coordinate
                        if not highlight_all:
                          all_match_keyword_list, all_match_imp_keyword_list = self.get_best_token_match(page_num,
                                                                                                        code,
                                                                                                        page,
                                                                                                        highlight[3],
                                                                                                        self.match_threshold)
                        else:
                          all_match_keyword_list, all_match_imp_keyword_list = self.get_best_all_token_match(page_num,
                                                                                                        code,
                                                                                                        page,
                                                                                                        highlight[3],
                                                                                                        self.match_threshold)

                        # Step-1: highlight and set color coding dont have common words
                        if not all_match_keyword_list:
                            highlight_code(page, highlight)
                            # prepare json object
                            json_data.append(prepare_json_object(num_page, code, self.code_df, highlight, keyword))

                        # Step-2: highlight and set color coding that have common words
                        for match_keyword_dict in all_match_keyword_list:
                            # highlight ICD-10 code
                            if not is_coord_equal(prev_highlight, highlight):
                              highlight_code(page, highlight)
                              prev_highlight = highlight
                            # highlight common words
                            highlight_coords_dict = {}
                            highlight_coords_dict, highlighted_word_list = self.highlight_common_words(page, match_keyword_dict["common_words_coords"])
                            # prepare json object
                            json_data.append(prepare_json_object(num_page, code, self.code_df, highlight, keyword, match_keyword_dict, highlight_coords_dict))

                        # Step-3: highlight and set color coding for impairment keyword common words
                        json_imp_data_list = []
                        for match_imp_keyword_dict in all_match_imp_keyword_list:
                            json_imp_data_object = {}
                            # highlight common words
                            highlight_coords_dict = {}
                            if "imp_common_keyword_coords" in match_imp_keyword_dict:
                              if match_imp_keyword_dict["imp_common_keyword_coords"]:
                                highlight_coords_list = self.highlight_impairment(page, num_page, match_imp_keyword_dict["imp_common_keyword_coords"])
                                already_done_page_list.append(page_num)
                                # prepare json object
                                if highlight_coords_list:
                                  json_imp_data_list.extend(highlight_coords_list)
                        if len(json_imp_data_list) > 0:
                            json_imp_data.extend(json_imp_data_list)

            # highlight impairment key phrase
            if page_num in self.impairment_keyword_dict and page_num not in already_done_page_list:
              json_imp_data_list = self.highlight_keyword_impairment_data(page, page_num)
              if len(json_imp_data_list) > 0:
                json_imp_data.extend(json_imp_data_list)

            # highlight date and time in PDF
            if page_num in self.page_matched_date_dict:
                json_date_data_list = self.highlight_date_and_time(page, page_num)
                if len(json_date_data_list) > 0:
                  json_date_data.extend(json_date_data_list)

        # save PDF and JSON output file
        pdf_output_file_name, json_data_dump_file = save_output_file(pdf_object=pdf_file, cfg=cfg,
                                                                     output_type=2 if highlight_all else 1,
                                                                     output_path=self.OUTPUT_FILES_PATH,
                                                                     output_file_name=pdf_file_name,
                                                                     json_data=json_data,
                                                                     json_imp_data=json_imp_data,
                                                                     json_date_data=json_date_data)
        return pdf_output_file_name, json_data_dump_file

    def highlight_icd_code_and_common_words(self, icd10_code_dict, pdf_file_name=None):
        pdf_file = fitz.open(pdf_file_name)
        json_data = []

        for page_num, page in enumerate(pdf_file):
            self.code_y2_coords_list = []
            self.already_found_word_list = []
            code_highlight_dict = {}

            # highlight ICD-10 code
            if page_num in icd10_code_dict:
                num_page = page_num + 1
                # let's store code y2 coords
                for code in icd10_code_dict[page_num]:
                    highlight_list = page.search_for(code)
                    #print(f"Page-{num_page}-{code}, Coordinate: {highlight_list}")
                    if len(highlight_list) > 0:
                        code_highlight_dict[code] = highlight_list
                        self.code_y2_coords_list.extend([highlight[3] for highlight in highlight_list])
                    else:
                        for alt_code in get_alternate_code_pattern(code):
                            highlight_list = page.search_for(alt_code)
                            for highlight in highlight_list:
                                if len(highlight) > 0:
                                    code_highlight_dict[code] = [highlight]
                                    #print(f"alternate-code:{code}, Coordinate: {highlight}")
                                    self.code_y2_coords_list.extend([highlight[3]])

                filtered_code_highlight_dict = filter_duplicate_coordinate(code_highlight_dict)
                # now, let highlight every code and its common words
                for code, highlight_list in filtered_code_highlight_dict.items():
                    prev_highlight = fitz.Rect(0, 0, 0, 0)
                    for highlight in highlight_list:
                        keyword = get_keyword(code, self.code_df)
                        # get match score and common words coordinate
                        all_match_keyword_list, all_match_imp_keyword_list = self.get_best_token_match(page_num, code,
                                                                                                       page,
                                                                                                       highlight[3],
                                                                                                       self.match_threshold)

                        # Step-1: highlight and set color coding dont have common words
                        if not all_match_keyword_list:
                            page_highlight = page.add_highlight_annot(highlight)
                            page_highlight.set_colors(stroke=[0.92, 0.59, 0.48])  # dark salmon
                            page_highlight.update()
                            # prepare json object
                            json_data.append(prepare_json_object(num_page, code, self.code_df, highlight, keyword))

                        # Step-2: highlight and set color coding that have common words
                        for match_keyword_dict in all_match_keyword_list:
                            # highlight ICD-10 code
                            if not self.is_coord_equal(prev_highlight, highlight):
                              highlight_code(page, highlight)
                              prev_highlight = highlight
                            # highlight common words
                            highlight_coords_dict = {}
                            highlight_coords_dict, highlighted_word_list = self.highlight_common_words(page, match_keyword_dict["common_words_coords"])

                            # prepare json object
                            json_data.append(prepare_json_object(num_page, code, self.code_df, highlight, keyword, match_keyword_dict, highlight_coords_dict))

        # save PDF and JSON output file
        pdf_output_file_name, json_data_dump_file = save_output_file(pdf_file, cfg, 3, self.OUTPUT_FILES_PATH, pdf_file_name, json_data)
        return pdf_output_file_name, json_data_dump_file

    def highlight_icd_code_and_keyword_impairment(self, icd10_code_dict, pdf_file_name=None):
      pdf_file = fitz.open(pdf_file_name)
      json_data = []
      json_imp_data = []
      json_date_data = []

      for page_num, page in enumerate(pdf_file):
        already_done_page_list = []
        self.code_y2_coords_list = []
        self.already_found_word_list = []
        self.highlighted_coord_list = []
        self.highlighted_term_list = []
        code_highlight_dict = {}

        # highlight ICD-10 code
        if page_num in icd10_code_dict:
            num_page = page_num + 1

            # let's store code y2 coords
            for code in icd10_code_dict[page_num]:
                highlight_list = page.search_for(code)
                # print(f"Page-{num_page}-{code}, Coordinate: {highlight_list}")
                if len(highlight_list) > 0:
                    code_highlight_dict[code] = highlight_list
                    self.code_y2_coords_list.extend([highlight[3] for highlight in highlight_list])
                else:
                    for alt_code in get_alternate_code_pattern(code):
                        highlight_list = page.search_for(alt_code)
                        for highlight in highlight_list:
                            if len(highlight) > 0:
                                code_highlight_dict[code] = [highlight]
                                # print(f"alternate-code:{code}, Coordinate: {highlight}")
                                self.code_y2_coords_list.extend([highlight[3]])
                                # print(f"[highlight[3]]: {[highlight[3]]}")
            # print(f"code_highlight_dict: {code_highlight_dict}")
            filtered_code_highlight_dict = filter_duplicate_coordinate(code_highlight_dict)
            # now, let highlight every code and its common words
            for code, highlight_list in filtered_code_highlight_dict.items():
                # print(f"code-{code}, Coordinate: {highlight_list}")
                prev_highlight = fitz.Rect(0, 0, 0, 0)
                for highlight in highlight_list:
                    # Step-1: highlight ICD-10 code
                    #print(f"prev_highlight: {prev_highlight}")
                    if not is_coord_equal(prev_highlight, highlight):
                      #print(f"highlight: {highlight}")
                      highlight_code(page, highlight)
                      prev_highlight = highlight
                    # prepare json object
                    json_data.append(prepare_code_json_object(num_page, code, highlight))

        # highlight ICD key phrase
        # if page_num in self.impairment_keyword_dict and page_num not in already_done_page_list:
        if page_num in self.impairment_keyword_dict:
          json_imp_data_list = self.highlight_keyword_impairment_data(page, page_num)
          if len(json_imp_data_list) > 0:
            json_imp_data.extend(json_imp_data_list)

        # highlight date and time in PDF
        if page_num in self.page_matched_date_dict:
          json_date_data_list = self.highlight_date_and_time(page, page_num)
          if len(json_date_data_list) > 0:
            json_date_data.extend(json_date_data_list)

      # save PDF and JSON output file
      pdf_output_file_name, json_data_dump_file = save_output_file(pdf_object=pdf_file, cfg=cfg,
                                                                   output_type=4,
                                                                   output_path=self.OUTPUT_FILES_PATH,
                                                                   output_file_name=pdf_file_name,
                                                                   json_data=json_data,
                                                                   json_imp_data=json_imp_data,
                                                                   json_date_data=json_date_data)
      return pdf_output_file_name, json_data_dump_file

    def highlight_keyword_impairment(self, pdf_file_name=None):
        pdf_file = fitz.open(pdf_file_name)
        json_imp_data = []
        for page_num, page in enumerate(pdf_file):
          self.highlighted_coord_list = []
          # highlight ICD key phrase
          if page_num in self.impairment_keyword_dict:
            json_imp_data_list = []
            num_page = page_num + 1
            imp_keywords_coord_dict = {}
            key_phrase_sents = self.impairment_keyword_dict[page_num]
            for key_phrase_sent in key_phrase_sents:
              #print(f"key_phrase_sent: {key_phrase_sent}")
              coordinates = page.search_for(key_phrase_sent)
              imp_keywords_coord_dict[key_phrase_sent] = coordinates
            highlight_coords_list = self.highlight_impairment(page, num_page, imp_keywords_coord_dict)
            if len(highlight_coords_list) > 0:
              json_imp_data_list.extend(highlight_coords_list)
            if len(json_imp_data_list) > 0:
              json_imp_data.extend(json_imp_data_list)

        # save PDF and JSON output file
        pdf_output_file_name, json_data_dump_file = save_output_file(pdf_file, cfg, 5, self.OUTPUT_FILES_PATH, pdf_file_name, json_imp_data=json_imp_data)
        return pdf_output_file_name, json_data_dump_file

    def highlight_keyword_impairment_data(self, p_page, page_num):
      json_imp_data_list = []
      num_page = page_num + 1
      # print(f"num_page: {num_page}")
      imp_keywords_coord_dict = {}

      def highlight_multi_key_phrase(p_keyword_impairment, batch_coord):
        phrase_coords = []
        filter_coords = []
        try:
            f_coord = batch_coord[0]
            l_coord = batch_coord[-1]
            # first coord y1,y2 and second coord y1,y2 must be equal if it is same phrase
            if f_coord[1] == l_coord[1] and f_coord[3] == l_coord[3]:
              # check first coord x2 and second coord x1 is more than 100(means the same line phrase occurrence)
              if (l_coord[0] - f_coord[2]) > 100:
                  phrase_coords.extend([
                      (f_coord[0], f_coord[1], f_coord[2], f_coord[3]),
                      (l_coord[0], l_coord[1], l_coord[2], l_coord[3])
                  ])
              else:
                phrase_coords.extend([(f_coord[0], f_coord[1], l_coord[2], l_coord[3])])
            for phrase_coord in phrase_coords:
                # print(f"phrase_coord: {phrase_coord}")
                if is_exact_match(p_page, p_keyword_impairment, phrase_coord, full_match=True, case_sensitive=True) \
                  and not is_coord_already_highlighted(self.highlighted_coord_list, phrase_coord):
                  p_highlight = p_page.add_highlight_annot(phrase_coord)
                  p_highlight.update()
                  filter_coords.append(phrase_coord)
                  self.highlighted_coord_list.append(phrase_coord)
                # print(f"key_phrase: {p_keyword_impairment}, phrase_coord: {phrase_coord}")
        except ValueError as ve:
            #print(f"ERROR: coordinate not found>>{ve}")
            self.logger.error(f"ERROR: coordinate not found>>{ve}")
        return filter_coords

      def get_json_data(coords, k_impairment):
        return {
            "coords": coords,
            "synid": get_synid(k_impairment, self.synid_df),
            # "keyword_field": get_keyword_field(key_phrase_sent, synid_df)
          }

      keyword_impairments = self.impairment_keyword_dict[page_num]
      for keyword_impairment in keyword_impairments:
          # print(f"key_phrase_sent: {keyword_impairment}")
          coordinates = p_page.search_for(keyword_impairment)
          # print(f"coordinates: {coordinates}")

          phrase_len = len(keyword_impairment.split())
          if coordinates:
            # highlight word phrase that have equal word phrase and coord length(single phrase occurrence)
            if phrase_len == len(coordinates) and phrase_len != 1:
                imp_keywords_coord_dict[keyword_impairment] = get_json_data(highlight_multi_key_phrase(keyword_impairment, coordinates), keyword_impairment)
            # highlight word phrase that do not have equal word phrase and coord length(multiple phrase occurrence)
            elif int(len(coordinates) / phrase_len) * phrase_len == len(coordinates):
                phrase_coord_list = []
                # batch coord list by dividing its length with phrase length
                for batch_coord in more_itertools.batched(coordinates, phrase_len):
                    phrase_coord_list.extend(highlight_multi_key_phrase(keyword_impairment, batch_coord))
                imp_keywords_coord_dict[keyword_impairment] = get_json_data(phrase_coord_list, keyword_impairment)
            else:
                coords_list = []
                for coord in coordinates:
                    if is_exact_match(p_page, keyword_impairment, coord, full_match=True, case_sensitive=True) and not is_coord_already_highlighted(self.highlighted_coord_list, coord):
                        page_highlight = p_page.add_highlight_annot(coord)
                        page_highlight.update()
                        coords_list.extend((coord[0], coord[1], coord[2], coord[3]))
                        self.highlighted_coord_list.append(coord)
                        # print(f"key_phrase: {keyword_impairment}, keywords_coord: {coord}")
                # print(f"coords_list: {coords_list}")
                imp_keywords_coord_dict[keyword_impairment] = get_json_data(coords_list, keyword_impairment)
      # print(f"imp_keywords_coord_dict: {imp_keywords_coord_dict}")
      if imp_keywords_coord_dict:
        json_imp_data_list.extend(prepare_imp_json_object(num_page, imp_keywords_coord_dict))
      return json_imp_data_list

    def highlight_date_and_time(self, p_page, page_num):
        num_page = page_num + 1
        json_date_data_list = []
        highlight_coords_list = []
        page_date_coord_dict = {}

        def highlight_date(p_coordinate, p_date):
          page_highlight = p_page.add_highlight_annot(p_coordinate)
          page_highlight.set_colors(stroke=[0.529, 0.807, 0.921])  # sky blue
          page_highlight.update()
          page_date_coord_dict[p_date] = [p_coordinate[0], p_coordinate[1], p_coordinate[2], p_coordinate[3]]
          self.highlighted_term_list.append(p_date)

        page_date_list = self.page_matched_date_dict[page_num]
        for page_date in page_date_list:
            coordinates = p_page.search_for(page_date)
            for coordinate in coordinates:
              if len(re.findall(r"^\d+$", page_date.strip())) > 0:
                if is_exact_year_match(p_page, page_date, coordinate, self.highlighted_term_list):
                  highlight_date(coordinate, page_date)
              else:
                highlight_date(coordinate, page_date)

        if page_date_coord_dict:
            highlight_coords_list.append(prepare_page_date_json_object(num_page, page_date_coord_dict))
        if len(highlight_coords_list) > 0:
            json_date_data_list.extend(highlight_coords_list)
        return json_date_data_list

    def highlight_common_words(self, page_obj, common_words_coord_dict):
        highlight_coords_dict = {}
        highlighted_word_list = []

        prev_common_word = ""
        for common_word, common_word_coord in common_words_coord_dict.items():
            highlight = page_obj.add_highlight_annot(common_word_coord)
            highlight.update()
            if prev_common_word.lower() != common_word.lower():
                highlight_coords_dict[common_word] = common_word_coord
                highlighted_word_list.append(common_word)
                prev_common_word = common_word
        return highlight_coords_dict, highlighted_word_list

    def get_best_token_match(self, page_num, p_code, page_obj, code_y2_coord, match_threshold):
        all_match_keyword_list = []
        all_match_imp_keyword_list = []

        # Step 1: reverse code pattern
        reversed_icd_code = reverse_code_pattern(p_code)
        # Step 2: fetch keyword based on code
        keyword = get_keyword(reversed_icd_code, self.code_df)
        # Step 3: get code paragraph
        code_paragraph_list, non_code_paragraph_list = get_paragraph(page_obj, code_y2_coord)
        # print(f"code_paragraph_list: {non_code_paragraph_list}")

        # Step 4: prepare code_paragraph for common words
        for code_paragraph in code_paragraph_list:
            match_keyword_dict = {}
            common_words = get_common_words(keyword, code_paragraph)
            if len(common_words) > 0:
                match_keyword_dict["common_words"] = common_words
                # Step 4: get best token match ratio
                clean_paragraph = " ".join(clean_text(code_paragraph))
                match_keyword_dict["paragraph"] = clean_paragraph
                # match_score = fuzz.token_set_ratio(keyword, clean_paragraph)
                # if fuzz.token_set_ratio(keyword, clean_paragraph) >= match_threshold else 0
                match_keyword_dict["score"] = fuzz.token_set_ratio(keyword, clean_paragraph)
                # Step 5: build common words coordinate dict
                common_words_coord_dict = {}
                #print(f"code_y2_coord: {code_y2_coord}")
                for common_word in common_words:
                    highlight_list = page_obj.search_for(common_word)
                    #print(f"highlight_list: {highlight_list}")
                    found_coord = False
                    for highlight in highlight_list:
                        # get common word y2 coord value
                        common_word_y2_coords = highlight[3]
                        common_word_y1_coords = highlight[1]
                        if (code_y2_coord - 2) <= common_word_y2_coords <= (code_y2_coord + 2):
                          common_words_coord_dict[common_word] = highlight
                          found_coord = True
                        if not found_coord:
                          if (code_y2_coord - 20) <= common_word_y2_coords <= (code_y2_coord + 20):
                            common_words_coord_dict[common_word] = highlight
                            # self.already_found_word_list.append(common_word)
                match_keyword_dict["common_words_coords"] = common_words_coord_dict
                all_match_keyword_list.append(match_keyword_dict)

        def get_keyword_impairment_list(p_paragraph):
            keyword_impairments = []
            for keyword_impairment in self.impairment_keyword_dict[page_num]:
                keyword_impairment_found = get_common_words(keyword_impairment, p_paragraph)
                if len(keyword_impairment_found) > 0:
                    keyword_impairments.append(keyword_impairment)
            return keyword_impairments

        # Step 5: prepare non code_paragraph for common words
        for non_code_paragraph in non_code_paragraph_list:
            match_imp_keyword_dict = {}
            imp_common_words_coord_dict = {}
            keyword_impairment_list = get_keyword_impairment_list(non_code_paragraph)
            for keyword_impairment in keyword_impairment_list:
                highlight_list = page_obj.search_for(keyword_impairment)
                highlight_list = [highlight for highlight in highlight_list if highlight[3] not in self.code_y2_coords_list]
                if keyword_impairment not in self.already_found_word_list and len(highlight_list) > 0:
                    self.already_found_word_list.append(keyword_impairment)
                    imp_common_words_coord_dict[f"{keyword_impairment}"] = highlight_list
                    continue
            match_imp_keyword_dict["imp_common_keyword_coords"] = imp_common_words_coord_dict
            all_match_imp_keyword_list.append(match_imp_keyword_dict)

        return all_match_keyword_list, all_match_imp_keyword_list

    def get_best_all_token_match(self, page_num, p_code, page_obj, code_y2_coord, match_threshold):
        """Collect highlight coordinates for one ICD code across every paragraph of a page.

        For each paragraph returned by ``get_all_paragraph(page_obj)``:

        * if it shares words with the keyword looked up for ``p_code``, a dict with
          ``common_words``, the cleaned ``paragraph``, a fuzzy ``score`` and
          ``common_words_coords`` is appended to the first result list;
        * otherwise a dict ``{"imp_common_keyword_coords": {...}}`` is appended to the
          second result list (even when no impairment keyword was found).

        Args:
            page_num: key into ``self.impairment_keyword_dict``.
            p_code: code text as found on the page; passed to ``reverse_code_pattern``.
            page_obj: page object providing ``search_for``.
            code_y2_coord: y2 coordinate of the code hit, used to pick nearby words.
            match_threshold: currently unused; it only appears in commented-out code.

        Returns:
            Tuple ``(all_match_keyword_list, all_match_imp_keyword_list)``.

        Side effects:
            Appends newly reported impairment keywords to ``self.already_found_word_list``.
        """
        all_match_keyword_list = []
        all_match_imp_keyword_list = []

        # Step 1: reverse code pattern
        reversed_icd_code = reverse_code_pattern(p_code)
        # Step 2: fetch keyword based on code
        keyword = get_keyword(reversed_icd_code, self.code_df)
        # Step 3: get all paragraph
        all_paragraph_list = get_all_paragraph(page_obj)
        # print(f"code_paragraph_list: {non_code_paragraph_list}")

        # Helper: impairment keywords of this page that share at least one word with the paragraph.
        def get_keyword_impairment_list(p_paragraph):
          keyword_impairments = []
          for keyword_impairment in self.impairment_keyword_dict[page_num]:
              keyword_impairment_found = get_common_words(keyword_impairment, p_paragraph)
              if len(keyword_impairment_found) > 0:
                  keyword_impairments.append(keyword_impairment)
          return keyword_impairments

        # Step 4: prepare all_paragraph for common words and impairment
        for all_paragraph in all_paragraph_list:
          match_keyword_dict = {}
          common_words = get_common_words(keyword, all_paragraph)
          if len(common_words) > 0:
              match_keyword_dict["common_words"] = common_words
              # Step 4: get best token match ratio
              clean_paragraph = " ".join(clean_text(all_paragraph))
              match_keyword_dict["paragraph"] = clean_paragraph
              # The score is stored unfiltered; the threshold check below is disabled.
              # match_score = fuzz.token_set_ratio(keyword, clean_paragraph)
              # if fuzz.token_set_ratio(keyword, clean_paragraph) >= match_threshold else 0
              match_keyword_dict["score"] = fuzz.token_set_ratio(keyword, clean_paragraph)
              # Step 5: build common words coordinate dict
              common_words_coord_dict = {}
              for common_word in common_words:
                  highlight_list = page_obj.search_for(common_word)
                  found_coord = False
                  for highlight in highlight_list:
                      # get common word y2 coord value
                      common_word_y2_coords = highlight[3]
                      # Preferred: a hit whose y2 is within 2 units of the code's y2.
                      if (code_y2_coord - 2) <= common_word_y2_coords <= (code_y2_coord + 2):
                        common_words_coord_dict[common_word] = highlight
                        found_coord = True
                      # Fallback, only while no preferred hit has been seen: accept hits within 20 units.
                      # A later fallback hit overwrites an earlier one.
                      if not found_coord:
                        if (code_y2_coord - 20) <= common_word_y2_coords <= (code_y2_coord + 20):
                          common_words_coord_dict[common_word] = highlight
                          # self.already_found_word_list.append(common_word)
              match_keyword_dict["common_words_coords"] = common_words_coord_dict
              all_match_keyword_list.append(match_keyword_dict)
          else:
            # Step 5: prepare non code_paragraph for common words
            match_imp_keyword_dict = {}
            imp_common_words_coord_dict = {}
            keyword_impairment_list = get_keyword_impairment_list(all_paragraph)
            for keyword_impairment in keyword_impairment_list:
              #print(f"keyword_impairment11: {keyword_impairment}")
              highlight_list = page_obj.search_for(keyword_impairment)
              # Report each impairment keyword once per instance lifetime of already_found_word_list,
              # storing every hit returned by search_for.
              if keyword_impairment not in self.already_found_word_list and len(highlight_list) > 0:
                self.already_found_word_list.append(keyword_impairment)
                imp_common_words_coord_dict[f"{keyword_impairment}"] = highlight_list
                #print(f"keyword_impairment22: {keyword_impairment}, coords: {highlight_list}")
                #continue
            match_imp_keyword_dict["imp_common_keyword_coords"] = imp_common_words_coord_dict
            all_match_imp_keyword_list.append(match_imp_keyword_dict)

        return all_match_keyword_list, all_match_imp_keyword_list

    def search_icd_code(self, txt_list):
      """Find ICD codes in per-page text files.

      Each file is read and passed through ``self.nlp_code10``; the text of every
      entity in ``doc.ents`` is treated as a code. On "index pages" (pages that
      contain a ``P <n>, L<n>`` page/line reference) entities containing an
      ``L<digits>`` token are dropped, since they are line numbers rather than codes.

      Args:
          txt_list: paths of text files named ``.../page-<N>.txt``.

      Returns:
          dict mapping page number ``N`` to the list of unique codes found on that
          page, in order of first appearance. Pages without codes are omitted.
      """
      # Raw strings: "\s" in a plain literal is an invalid escape sequence.
      # Compiled once because the patterns do not change per file.
      index_page_pattern = re.compile(r"(P[ ][0-9]+)(,\s)(L[0-9]+)")
      line_ref_pattern = re.compile(r"(L[0-9]+)")

      pdf_page_vocab = {}
      for txt_file in txt_list:
        with open(txt_file, "r") as f:
          page_txt = f.read()

        # check the page that have line number instead of code
        index_page = index_page_pattern.search(page_txt) is not None

        doc = self.nlp_code10(page_txt)
        if index_page:
          # drop entities that look like line references ("L12")
          code_list = [ent.text for ent in doc.ents if not line_ref_pattern.search(ent.text)]
        else:
          code_list = [ent.text for ent in doc.ents]

        if code_list:
          # ".../page-12.txt" -> 12
          page_number = int(txt_file.split("/")[-1].split(".")[0].split("-")[1])
          # dict.fromkeys de-duplicates while keeping a reproducible order
          pdf_page_vocab[page_number] = list(dict.fromkeys(code_list))
      return pdf_page_vocab

## PDF Highlighting

In [None]:
!mkdir -p /home/ocreng
!mkdir -p /home/ocreng/ocrhigh
!mkdir -p /home/ocreng/ocrhigh/input
!mkdir -p /home/ocreng/ocrhigh/output
!mkdir -p /home/ocreng/ocrhigh/processed
!mkdir -p /home/ocreng/ocrhigh/pdf-files
!mkdir -p /home/ocreng/ocrhigh/txt-files

In [None]:
def purge(file_path):
  """Delete every file whose path matches the glob pattern ``file_path``."""
  stale_paths = glob.glob(file_path)
  for stale_path in stale_paths:
    os.remove(stale_path)

In [None]:
%%time
# Step-0: create highlighter instance
# These objects are built once and reused by the processing cells below;
# the recorded run of this cell took over a minute of wall time.
# Directory that the processing loop scans for input PDFs.
INPUT_PDF_FILES_PATH = cfg.file_path["INPUT_PATH"]

# NOTE(review): match_threshold is presumably a minimum fuzzy-match score - confirm against Highlighter.
highlighter = Highlighter(match_threshold=35)
sent_extractor = SentenceExtractor()
generic_matcher = GenericMatcher()

CPU times: user 1min 13s, sys: 5.16 s, total: 1min 18s
Wall time: 1min 24s


```log
CPU times: user 1min 7s, sys: 6.54 s, total: 1min 14s
Wall time: 1min 15s
```

### Test

In [None]:
!rm -rf /home/ocreng/ocrhigh/input
!mkdir -p /home/ocreng/ocrhigh/input

In [None]:
!cp Practice_Copy_Test_NEWOCR.pdf /home/ocreng/ocrhigh/input
#!cp page-10.pdf /home/ocreng/ocrhigh/input

In [None]:
!rm -rf /home/ocreng/ocrhigh/output
!mkdir -p /home/ocreng/ocrhigh/output
!rm -rf output*.zip

In [None]:
# Selects which highlighting routine the processing loop runs:
#   1 = ICD-10 code, its description and keyword impairment (highlight_all)
#   2 = same as 1 with highlight_all=True (all keyword impairments)
#   3 = ICD-10 code and its description (highlight_icd_code_and_common_words)
#   4 = ICD-10 code and all keyword impairments (highlight_icd_code_and_keyword_impairment)
#   5 = keyword impairment only (highlight_keyword_impairment)
output_type = 4

In [None]:
%%time
# Process every PDF in the input folder: split into pages, extract text, collect
# codes / impairment keywords / dates, highlight according to `output_type`,
# then archive the input file and clear the temporary per-page files.
# `icd10_code_dict1` and `wrong_keyword_dict` keep the results of the last
# processed file so they can be inspected in later cells.
txt_list = None
icd10_code_dict1 = None
wrong_keyword_dict = None
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"

  # Step-1: splitting pdf file
  print("Step-1: Splitting pdf file............")
  pdf_list = split_pdf(highlighter.PDF_FILES_PATH, pdf_file_name)

  # Step-2: Extracting text from pdf
  print("Step-2: Extracting text from pdf............")
  txt_list = extract_text_from_pdf(highlighter.PDF_FILES_PATH, highlighter.TXT_FILES_PATH, pdf_list)
  highlighter.set_text_list(txt_list)

  # Step-3: Searching ICD-10 code
  print("Step-3: Searching ICD-10 code into text file..........")
  icd10_code_dict = highlighter.search_icd_code(txt_list)
  icd10_code_dict1 = icd10_code_dict

  # Step-4: Get closest match of keyword impairment
  print("Step-4: Get closet match of keyword impairment..........")
  matched_keyword_dict = sent_extractor.get_matched_keyword_dict(txt_list)
  highlighter.set_impairment_keyword_dict(matched_keyword_dict)
  wrong_keyword_dict = matched_keyword_dict

  # Step-5: Get date time list from text file
  print("Step-5: Get date time list from text file..........")
  matched_date_dict = generic_matcher.get_match_date_dict(txt_list)
  highlighter.set_page_matched_date_dict(matched_date_dict)

  if output_type == 1:
    # Step-6: Highlighting ICD-10 code, its description and keyword impairment into PDF file
    print("Step-6: Highlighting ICD-10 code, its description and keyword impairment into PDF file............")
    pdf_output_file, json_code_output_file = highlighter.highlight_all(icd10_code_dict, pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
    print(f"Highlighted code and impairment coordinates are saved into [{json_code_output_file}] file.")
  elif output_type == 2:
    # Step-7: Highlighting ICD-10 code, its description and all keyword impairment into pdf
    print("Step-7: Highlighting ICD-10 code, its description and all keyword impairment into pdf............")
    pdf_output_file, json_code_output_file = highlighter.highlight_all(icd10_code_dict, pdf_file_name=pdf_file_name, highlight_all=True)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
    print(f"Highlighted code and impairment coordinates are saved into [{json_code_output_file}] file.")
  elif output_type == 3:
    # Step-8: Highlighting ICD-10 code and its description into pdf
    print("Step-8: Highlighting ICD-10 code and its description into pdf............")
    pdf_output_file, json_output_file = highlighter.highlight_icd_code_and_common_words(icd10_code_dict, pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
    print(f"Highlighted coordinates are saved into [{json_output_file}] file.")
  elif output_type == 4:
    # Step-9: Highlighting ICD-10 code and all keyword impairment into pdf
    print("Step-9: Highlighting ICD-10 code and all keyword impairment into pdf............")
    pdf_output_file, json_code_output_file = highlighter.highlight_icd_code_and_keyword_impairment(icd10_code_dict, pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code and all keyword impairment ")
    print(f"Highlighted code and impairment coordinates are saved into [{json_code_output_file}] file.")
  elif output_type == 5:
    # Step-10: Highlighting keyword impairment into pdf
    print("Step-10: Highlighting keyword impairment into pdf............")
    pdf_output_file, json_output_file = highlighter.highlight_keyword_impairment(pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting keyword impairment.")
    print(f"Highlighted coordinates are saved into [{json_output_file}] file.")
  else:
    # The branches above handle 1 through 5, so the hint must list all of them.
    print("Please pass value 1, 2, 3, 4 or 5 for output type.")

  # Step-11: Clean up: move the current file into processed folder
  if output_type in [1, 2, 3, 4, 5]:
    processed_copy = f"{cfg.file_path['PROCESSED_PATH']}/{pdf_file}"
    if Path(processed_copy).exists():
      # take backup of existing file so the move below cannot collide with it
      shutil.move(processed_copy, f"{processed_copy}_bkp")
    shutil.move(pdf_file_name, cfg.file_path["PROCESSED_PATH"])

  # remove all pdf and text files
  purge(f"{cfg.file_path['TMP_PDF_FILES_PATH']}/*.pdf")
  purge(f"{cfg.file_path['TMP_TXT_FILES_PATH']}/*.txt")
  pdf_list = []
  txt_list = []

Step-1: Splitting pdf file............
Step-2: Extracting text from pdf............
Step-3: Searching ICD-10 code into text file..........
Step-4: Get closet match of keyword impairment..........
Step-5: Get date time list from text file..........
Step-9: Highlighting ICD-10 code and all keyword impairment into pdf............
File[/home/ocreng/ocrhigh/output/Practice_Copy_Test_NEWOCR_output_4.pdf] is saved after highlighting ICD-10 code and all keyword impairment 
Highlighted code and impairment coordinates are saved into [/home/ocreng/ocrhigh/output/Practice_Copy_Test_NEWOCR_output_4.json] file.
CPU times: user 1min 7s, sys: 409 ms, total: 1min 8s
Wall time: 1min 13s


In [None]:
!zip output{output_type}.zip /home/ocreng/ocrhigh/output/*.*

  adding: home/ocreng/ocrhigh/output/Practice_Copy_Test_NEWOCR_output_4.json (deflated 81%)
  adding: home/ocreng/ocrhigh/output/Practice_Copy_Test_NEWOCR_output_4.pdf (deflated 21%)


In [None]:
#matched_date_dict[13]

In [None]:
#wrong_keyword_dict[0]

In [None]:
# Scratch check of operator precedence: `not` binds looser than `>`, so each
# operand reads as `not (s.find(ch) > -1)`. Both characters are present, both
# operands are False, and nothing is printed.
if not "T?".find("?") > -1 or not "T)".find(")") > -1:
  print("yes")

### Split TXT/PDF

In [None]:
# Runs only the first two pipeline steps (split into per-page PDFs, extract
# per-page text) for every input PDF, without any highlighting or clean-up.
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"

  # Step-1: splitting pdf file
  print("Step-1: Splitting pdf file............")
  pdf_list = split_pdf(highlighter.PDF_FILES_PATH, pdf_file_name)

  # Step-2: Extracting text from pdf
  print("Step-2: Extracting text from pdf............")
  txt_list = extract_text_from_pdf(highlighter.PDF_FILES_PATH, highlighter.TXT_FILES_PATH, pdf_list)

Step-1: Splitting pdf file............
Step-2: Extracting text from pdf............


In [None]:
!zip text_files.zip /home/ocreng/ocrhigh/txt-files/*.*
!zip pdf_files.zip /home/ocreng/ocrhigh/pdf-files/*.*

### Split List

In [None]:
def split_list(a_list, divider):
  """Cut ``a_list`` in two at index ``len(a_list) // divider``.

  Returns a ``(head, tail)`` pair. The cut is the midpoint only when
  ``divider`` is 2; for ``divider=3`` the head holds the first third.
  """
  cut = len(a_list) // divider
  head, tail = a_list[:cut], a_list[cut:]
  return head, tail

# Demo: six items with divider 3 -> first two items, then the remaining four.
A = [1,2,3,4,5,6]
B, C = split_list(A, 3)
B, C

([1, 2], [3, 4, 5, 6])

In [None]:
6 // 3

2

In [None]:
int(4 / 2)

2

In [None]:
!pip install more-itertools



In [None]:
import more_itertools

In [None]:
for batch in more_itertools.batched([1,2,3,4,5,6], 3):
  print(batch)

(1, 2, 3)
(4, 5, 6)


In [None]:
for batch in more_itertools.batched("ABCDEFGHIJ", 4):
  print(batch)

In [None]:
def split_list(a_list, divider):
  """Return ``a_list`` split into two slices at index ``len(a_list) // divider``."""
  split_at = len(a_list) // divider
  return a_list[:split_at], a_list[split_at:]

In [None]:
L =np.array([
(63.5, 166.5, 81.5, 175.50439453125),
(84.5, 166.5, 133.59002685546875, 175.50439453125),
(130.0, 264.0, 147.49998474121094, 273.00439453125),
(150.5, 264.0, 199.59002685546875, 273.00439453125),
(150.5, 264.0, 199.59002685546875, 273.00439453125),
(150.5, 264.0, 199.59002685546875, 273.00439453125)
])

In [None]:
(63.5, 166.5, 133.59002685546875, 175.50439453125)
(130.0, 264.0, 199.59002685546875, 273.00439453125)

In [None]:
np.split(L, 3)

[array([[ 63.5       , 166.5       ,  81.5       , 175.50439453],
        [ 84.5       , 166.5       , 133.59002686, 175.50439453]]),
 array([[130.        , 264.        , 147.49998474, 273.00439453],
        [150.5       , 264.        , 199.59002686, 273.00439453]]),
 array([[150.5       , 264.        , 199.59002686, 273.00439453],
        [150.5       , 264.        , 199.59002686, 273.00439453]])]

In [None]:
LL= [
(23.5, 443.0, 77.00015258789062, 454.00537109375),
(79.5, 443.0, 122.99948120117188, 454.00537109375),
(126.5, 443.0, 142.1002197265625, 454.00537109375),
(384.5, 478.5, 430.0002136230469, 485.50341796875),
(195.49978637695312, 487.0, 240.9998321533203, 494.00341796875),
(246.5, 487.0, 260.5, 494.00341796875)]

In [None]:
len(LL) // 3

2

In [None]:
# Group consecutive word boxes in pairs and merge each pair into one phrase box:
# top-left corner (x0, y0) of the first box, bottom-right corner (x1, y1) of the last.
# Pairing is purely positional, so a pair can combine boxes from different lines
# (the second batch in the recorded output mixes y=443.0 with y=478.5).
for batch_coord in more_itertools.batched(LL, 2):
  f_coord = batch_coord[0]
  l_coord = batch_coord[-1]
  print(f"f_coord: {f_coord}")
  print(f"l_coord: {l_coord}")
  phrase_coord = (f_coord[0], f_coord[1], l_coord[2], l_coord[3])
  print(f"key_phrase: CTA DISSECTION, phrase_coord: {phrase_coord}")

f_coord: (23.5, 443.0, 77.00015258789062, 454.00537109375)
l_coord: (79.5, 443.0, 122.99948120117188, 454.00537109375)
key_phrase: CTA DISSECTION, phrase_coord: (23.5, 443.0, 122.99948120117188, 454.00537109375)
f_coord: (126.5, 443.0, 142.1002197265625, 454.00537109375)
l_coord: (384.5, 478.5, 430.0002136230469, 485.50341796875)
key_phrase: CTA DISSECTION, phrase_coord: (126.5, 443.0, 430.0002136230469, 485.50341796875)
f_coord: (195.49978637695312, 487.0, 240.9998321533203, 494.00341796875)
l_coord: (246.5, 487.0, 260.5, 494.00341796875)
key_phrase: CTA DISSECTION, phrase_coord: (195.49978637695312, 487.0, 260.5, 494.00341796875)


In [None]:
# Merge per-word hit boxes into one box per occurrence of a multi-word phrase.
key_phrase_sent = "plan of care"
coordinates = [
    (303.35198974609375, 578.380859375, 350.51043701171875, 589.455078125),
    (432.93896484375, 647.3748779296875, 480.2064514160156, 658.4490966796875)]

phrase_len = len(key_phrase_sent.split())
# Number of complete occurrences the hits could form (0 here: 2 boxes for a 3-word phrase).
print((len(coordinates) // phrase_len) )
# Batch by the phrase's word count rather than a hard-coded 3, so the grouping
# follows key_phrase_sent. With fewer boxes than words, the single partial batch
# still merges unrelated boxes into one rectangle.
for batch_coord in more_itertools.batched(coordinates, phrase_len):
  f_coord = batch_coord[0]
  l_coord = batch_coord[-1]
  print(f"f_coord: {f_coord}")
  print(f"l_coord: {l_coord}")
  phrase_coord = (f_coord[0], f_coord[1], l_coord[2], l_coord[3])
  print(f"key_phrase: {key_phrase_sent}, phrase_coord: {phrase_coord}")

0
f_coord: (303.35198974609375, 578.380859375, 350.51043701171875, 589.455078125)
l_coord: (432.93896484375, 647.3748779296875, 480.2064514160156, 658.4490966796875)
key_phrase: plan of care, phrase_coord: (303.35198974609375, 578.380859375, 480.2064514160156, 658.4490966796875)


In [None]:
LL= np.array([
(23.5, 443.0, 77.00015258789062, 454.00537109375),
(79.5, 443.0, 122.99948120117188, 454.00537109375),
(126.5, 443.0, 142.1002197265625, 454.00537109375),
(384.5, 478.5, 430.0002136230469, 485.50341796875),
(195.49978637695312, 487.0, 240.9998321533203, 494.00341796875),
(246.5, 487.0, 260.5, 494.00341796875)])

# Same pairing experiment using numpy: np.split(LL, 3) cuts the rows into three
# equal chunks, and each chunk's first and last row are merged into one phrase box.
for batch_coord in np.split(LL, 3):
  f_coord = batch_coord[0]
  l_coord = batch_coord[-1]
  print(f"f_coord: {f_coord}")
  print(f"l_coord: {l_coord}")
  phrase_coord = (f_coord[0], f_coord[1], l_coord[2], l_coord[3])
  print(f"key_phrase: CTA DISSECTION, phrase_coord: {phrase_coord}")

f_coord: [ 23.5        443.          77.00015259 454.00537109]
l_coord: [ 79.5        443.         122.9994812  454.00537109]
key_phrase: CTA DISSECTION, phrase_coord: (23.5, 443.0, 122.99948120117188, 454.00537109375)
f_coord: [126.5        443.         142.10021973 454.00537109]
l_coord: [384.5        478.5        430.00021362 485.50341797]
key_phrase: CTA DISSECTION, phrase_coord: (126.5, 443.0, 430.0002136230469, 485.50341796875)
f_coord: [195.49978638 487.         240.99983215 494.00341797]
l_coord: [246.5        487.         260.5        494.00341797]
key_phrase: CTA DISSECTION, phrase_coord: (195.49978637695312, 487.0, 260.5, 494.00341796875)


```log
f_coord: (63.5, 166.5, 81.5, 175.50439453125)
l_coord: (84.5, 166.5, 133.59002685546875, 175.50439453125)
key_phrase: CTA DISSECTION, phrase_coord: (63.5, 166.5, 133.59002685546875, 175.50439453125)
f_coord: (130.0, 264.0, 147.49998474121094, 273.00439453125)
l_coord: (150.5, 264.0, 199.59002685546875, 273.00439453125)
key_phrase: CTA DISSECTION, phrase_coord: (130.0, 264.0, 199.59002685546875, 273.00439453125)
```

### PDF OCRing

In [None]:
import pikepdf

def remove_password_from_pdf(input_file, output_file, password=None):
    """Open a (possibly password-protected) PDF and save a copy to ``output_file``.

    Args:
        input_file: path of the source PDF.
        output_file: path the copy is written to.
        password: password used to open ``input_file``; ``None`` means no password.
    """
    # pikepdf.open takes a string password with "" meaning none, so map None to "".
    # The context manager closes the source file once the copy is written.
    with pikepdf.open(input_file, password=password or "") as pdf:
        pdf.save(output_file)

In [None]:
remove_password_from_pdf("Keyword_Sample_File#4_Redacted_Redacted.pdf",
                         "Keyword_Sample_File#4_Redacted_Redacted_pw_removed.pdf", "synodex")

In [None]:
!ocrmypdf --skip-text Sample_File_No3_Redacted_pw_removed.pdf Sample_File_No3_Redacted.pdf

Scanning contents:   0% 0/50 [00:00<?, ?page/s]Scanning contents:  46% 23/50 [00:00<00:00, 222.44page/s]Scanning contents: 100% 50/50 [00:00<00:00, 326.07page/s]
Start processing 2 pages concurrently
    1 skipping all processing on this page
    2 [33m[tesseract] lots of diacritics - possibly poor OCR[0m
   12 [33m[tesseract] lots of diacritics - possibly poor OCR[0m
   20 [33m[tesseract] lots of diacritics - possibly poor OCR[0m
   19 [33m[tesseract] lots of diacritics - possibly poor OCR[0m
   21 [33m[tesseract] lots of diacritics - possibly poor OCR[0m
   36 [33m[tesseract] lots of diacritics - possibly poor OCR[0m
   46 [33m[tesseract] lots of diacritics - possibly poor OCR[0m
OCR: 100% 50.0/50.0 [01:42<00:00,  2.05s/page]
Postprocessing...
PDF/A conversion: 100% 50/50 [00:17<00:00,  2.80page/s]
Recompressing JPEGs: 0image [00:00, ?image/s]
Deflating JPEGs: 0image [00:00, ?image/s]
JBIG2: 0item [00:00, ?item/s]
Optimize ratio: 1.00 savings: -0.2%
Image optimizatio

In [None]:
!rm -rf Keyword_Sample_File4_Redacted_Redacted_pw_removed.pdf

## Exact Match

In [None]:
mytext = """
  Encounter #16
History & Physical Report
1/25/2021: Lab Orders - Hypertriglyceridemia,                 sporadic (E78.3) (Marina Dobricic)
   ppointment:    1/25/2021   9:00 AM
                                  / Race: White
Male
 The patient is a 41 year old male.
 Assessment & Plan (Angela Sabbagh; 1/23/2021            11:36 AM)
 Hypertriglyceridemia, sporadic (E78.3)
 Impression: stopped will restart and check labs
Current Plans
       @   Comprehensive Metabolic Panel (CMP) (80053)
       @   Complete Blood Count with Differential (CBC) (85025)
       @   Lipid Panel (LP) (80061)
Signed by Marina Dobricic (1/27/2021        9:54 AM)
 Comprehensive Metabolic Panel (CMP) (80053) Final, Reviewed (Collected: 01/25/2021)
 Diagnosis: Hypertriglyceridemia, sporadic (E78.3)
 Note: Testing done at Silver Pine Medical         Group unless otherwise specified. 43455 Schoenherr Road, Suite 19, Sterling Heights, MI 48313
      Sodium                                  141 mmol/L              (Normal Range: 135-145 mmol/L)         Result Note:
                                                                    Resutt Annotation:
      Potassium                               4.1  mmol/L             (Normal Range: 3.5-5.2 mmol/L)        Result    Note:
                                                                    Result Annotation:
      Chloride                                103   mmol/L            (Normal Range: 98-107 mmol/L)         Result    Note:
                                                                    Result Annotation:
      Carbon    Dioxide (CO2)                 30 mmol/L               (Normal Range: 21-31    mmol/L)     Result     Note:
                                                                    Resutt Annotation:
      Anion Gap                               8                       (Normal Range: 5-17)      Result     Note:
                                                                    Resutt Annotation:
      Glucose                                 89 mg/dL                (Normal Range: 60-99 mg/dL)        Result     Note:
                                                                    Resutt Annotation:
      Blood Urea Nitrogen (BUN)               17 mg/dLl               (Normal Range: 8-23 mg/dL)        Result    Note:
                                                                    Resutt Annotation:
      Creatinine                              1.03   mg/dL            (Normal Range: 0.80-1.40 mg/dL)        Result     Note:
                                                                    Resutt Annotation:
      GFR                                     79 mL/min/1.73m2        (Normal Range: >59 mL/min/1.73m2)          Reeult      Note:
                                                                    Resutt Annotation:
      GFR   African American                  96 mL/min/1.73m2        (Normal Range: >59 mL/min/1.73m2)          Result      Note:
                                                                    Result Annotation:
      Calcium                                 9.0 mg/dL               (Normal Range: 8.5-11.0 mg/dL)        Result     Note;:
                                                                    Result Annotation:
      Protein Total                           7.8 g/dL                (Normal Range: 6.4-8.2 g/dL)      Result     Note:
                                                                    Resutt Annotation:
      Albumin                                 4.4 g/dL                (Normal Range: 3.4-5.0 g/dL)      Result     Note:
                                                                    Resutt Annotation:
      Globulin                                3.40                    (Normal Range: 2.20-4.00)       Result    Wote:
                                                                    Resutt Annotation:
      Album   in/Globulin Ratio               1.3                         Result Note:
                                                                    Resutt Annotation:
      Alkaline Phosphatase (ALP)              64 U/L                  (Normal Range: 50-116 U/L)       Result    Note:
                                                                    Resutt Annotation:
      Aspartate Aminotransferase (AST)        25  U/L                 (Normal Range: 15-37 U/L)      Result    Note:
                                                                    Resutt Annotation:
      Alanine Aminotransferase (ALT)          33 U/L                  (Normal Range: 16-63 U/L)      Reeult    Note:
                                                                    Resutt Annotation:
      Bilirubin Total                         0.5 mg/dL               (Normal Range:   0.3-1.2 mg/dL)     Result     Note:
                                                                    Resutt Annotation:
 Complete Blood Count with Differential (CBC) (85025) Final, Reviewed (Collected: 01/25/2021)
 Diagnosis: Hypertriglyceridemia, sporadic (E78.3)
 Note: Testina done at Silver Pine Medical Group unless otherwise specified. 43455 Schoenherr Road. Suite 19. Sterlina Heiahts. Ml 48313
06/28/2022 12:27 pm                                              s                                                                          Page 11/144
"""

In [None]:
# Whole-word, case-sensitive search: lowercase "mi" does not match "MI", and the
# \b anchors keep "mister" / "miss" from matching, so the result is an empty list.
term = "MI"
#matches = len(re.findall(r'(?i)\bAlbumin\b', mytext)) > 0
# matches = len(re.findall(f'(?i)\\b{term}\\b', mytext)) > 0
#if len(re.findall(f'(?i)\\b{term}\\b', mytext)) > 0:
matches = re.findall(r'\bMI\b', "mi mister miss")
matches

[]

In [None]:
# Case-insensitive whole-word search for `term` in one extracted page.
with open("page-30.txt", "r") as f:
  mytext = f.read()
# re.escape keeps regex metacharacters in `term` (e.g. "(" or "+") literal;
# re.search stops at the first hit instead of collecting every match.
if re.search(rf"\b{re.escape(term)}\b", mytext, flags=re.IGNORECASE):
  print("Found")

FileNotFoundError: ignored

In [None]:
wrong_keyword_dict1[0]

In [None]:
icd10_code_dict1

In [None]:
%%time

text_list = split_pdf(highlighter.PDF_FILES_PATH, "APS_38600000R_final.pdf")
len(text_list)

## Highlight Test

In [None]:
# Highlight every impairment keyword of the first entry of wrong_keyword_dict1
# on each page of a single-page test PDF and save the result next to the input.
pdf_file_name = "page-30.pdf"
pdf_file = fitz.open(pdf_file_name)
for page_num, page in enumerate(pdf_file):
  for keyword_impairment in wrong_keyword_dict1[0]:
    hit_rects = page.search_for(keyword_impairment)
    print(f"keyword_impairment: {keyword_impairment}, coords: {hit_rects}")
    if not hit_rects:
      # Keyword is not on this page: there is nothing to annotate, and calling
      # .update() on the result of an empty highlight request would fail.
      continue
    annot = page.add_highlight_annot(hit_rects)
    annot.update()
output_pdf_file_name = f"{pdf_file_name.split('.')[0]}_output.pdf"
pdf_file.save(output_pdf_file_name, garbage=4, deflate=True, clean=True)
# Release the document handle once the output has been written.
pdf_file.close()

keyword_impairment: Urea, coords: [Rect(70.31328582763672, 435.0400390625, 83.75199890136719, 443.0439453125)]
keyword_impairment: Comprehensive Metabolic Panel, coords: [Rect(60.47419357299805, 207.52001953125, 118.06912994384766, 215.52392578125), Rect(121.66831970214844, 207.52001953125, 156.94491577148438, 215.52392578125), Rect(159.82464599609375, 207.52001953125, 176.2230682373047, 215.52392578125), Rect(29.997119903564453, 282.8800048828125, 95.99125671386719, 290.8839111328125), Rect(99.83040618896484, 282.8800048828125, 141.58639526367188, 290.8839111328125), Rect(144.46612548828125, 282.8800048828125, 163.6642608642578, 290.8839111328125)]
keyword_impairment: ALT, coords: [Rect(148.30589294433594, 665.4400634765625, 158.38491821289062, 673.4439697265625)]
keyword_impairment: GFR, coords: [Rect(45.355674743652344, 477.0400390625, 57.77448272705078, 485.0439453125), Rect(45.35564422607422, 498.1600341796875, 53.8148307800293, 506.1639404296875)]
keyword_impairment: CBC, coords:

## ICD 10 Code

In [None]:
!mkdir -p pdf-files
!mkdir -p txt-files

In [None]:
# define directory path after creating it
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"

# create nlp instance
nlp = English()


def split_pdf(pdf_path):
  """Split a PDF into one single-page PDF per page.

  Page ``i`` (0-based) is written to ``{pdf_files_path}/page-{i}.pdf``.

  NOTE: this one-argument definition shadows the two-argument ``split_pdf``
  that the highlighting cells earlier in this notebook call.

  Args:
      pdf_path: path of the PDF to split.

  Returns:
      list of the written file names (``"page-0.pdf"``, ...), without directory.
  """
  pdf_list = []
  # One reader for the whole document; the context manager closes the input
  # file after the last page has been written.
  with open(pdf_path, "rb") as pdf_in_file:
    reader = PdfReader(pdf_in_file)
    for page_index, page in enumerate(reader.pages):
      writer = PdfWriter()
      writer.add_page(page)
      page_file_name = f"page-{page_index}.pdf"
      with open(f"{pdf_files_path}/{page_file_name}", "wb") as output_stream:
        writer.write(output_stream)
      pdf_list.append(page_file_name)
  return pdf_list


def extract_text_from_pdf(pdf_list):
    """Extract the text of each PDF in ``pdf_list`` into its own text file.

    The i-th entry of ``pdf_list`` (read from ``pdf_files_path``) is written to
    ``{txt_files_path}/page-{i}.txt``; the index is the position in the list,
    not a number parsed from the PDF file name.

    Args:
        pdf_list: PDF file names relative to ``pdf_files_path``.

    Returns:
        list of the written text file paths, in input order.
    """
    txt_file_list = []
    for page_index, pdf_file in enumerate(pdf_list):
        with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
            pdf = pdftotext.PDF(f)

        # Read all the text into one string
        pdf_text = "\n\n".join(pdf)

        # Write mode (not append) so that re-running the extraction replaces the
        # page text instead of duplicating it in the existing file.
        txt_file_path = f"{txt_files_path}/page-{page_index}.txt"
        with open(txt_file_path, "w") as f:
            f.write(pdf_text)
        txt_file_list.append(txt_file_path)
    return txt_file_list


def get_opt_pattern(icd_10_code):
  """Return alternate spellings of a code with spaces around its dot.

  For ``"E78.3"`` the result is ``["E78. 3", "E78 .3", "E78 . 3"]``
  (presumably to tolerate stray spaces in extracted text). Only the first two
  dot-separated parts are used. A code without a dot has no alternates and is
  returned as a one-element list, so callers can always iterate over patterns
  rather than over the characters of a string.
  """
  code_arr = icd_10_code.split(".")
  if len(code_arr) > 1:
    prefix, suffix = code_arr[0], code_arr[1]
    return [f"{prefix}. {suffix}", f"{prefix} .{suffix}", f"{prefix} . {suffix}"]
  return [icd_10_code]


def highlight_icd10_code(pdf_page_dict: dict, pdf_file_name: str):
    """Highlight ICD-10 codes in a PDF and save the result as ``<name>_output.pdf``.

    For each page listed in ``pdf_page_dict`` every code is searched verbatim.
    Only when that search finds nothing are the alternate spellings from
    ``get_opt_pattern`` tried. Each hit is annotated exactly once.

    Args:
        pdf_page_dict: maps 0-based page number to the codes found on that page.
        pdf_file_name: path of the PDF to annotate.

    Returns:
        Path of the written output PDF.
    """
    pdf_file = fitz.open(pdf_file_name)

    def highlight_pdf(page, rects):
        # Add one highlight annotation per hit rectangle.
        for rect in rects:
            annot = page.add_highlight_annot(rect)
            annot.update()

    for page_num, page in enumerate(pdf_file):
        for code in pdf_page_dict.get(page_num, []):
            rects = page.search_for(code)
            print(f"Page-{page_num}: ", code, rects, end='\n')
            if rects:
                # highlight pdf for main pattern
                highlight_pdf(page, rects)
                continue

            alternate_code_list = get_opt_pattern(code)
            if isinstance(alternate_code_list, str):
                # A bare string must be treated as one pattern, not as characters.
                alternate_code_list = [alternate_code_list]
            for alt_code in alternate_code_list:
                alt_rects = page.search_for(alt_code)
                if alt_rects:
                    print(f"Page-{page_num}: ", alt_code, alt_rects, end='\n')
                    # highlight pdf for option pattern
                    highlight_pdf(page, alt_rects)

    output_pdf_file_name = f"{pdf_file_name.split('.')[0]}_output.pdf"
    pdf_file.save(output_pdf_file_name, garbage=4, deflate=True, clean=True)
    # Release the document handle once the output has been written.
    pdf_file.close()
    return output_pdf_file_name


def search_icd_10_code(txt_list):
  """Collect ICD-10 code entities per page from extracted text files.

  Pages containing a ``P <n>, L<n>`` page/line reference are skipped entirely.
  Every other page is run through the module-level ``nlp`` pipeline and the
  text of each entity in ``doc.ents`` is recorded.

  Args:
      txt_list: paths of text files named ``.../page-<N>.txt``.

  Returns:
      dict mapping page number ``N`` to the list of entity texts found on that
      page (duplicates kept). Pages without entities are omitted.
  """
  # Raw string: "\s" in a plain literal is an invalid escape sequence.
  index_page_pattern = re.compile(r"(P[ ][0-9]+)(,\s)(L[0-9]+)")

  pdf_page_vocab = {}
  for txt_file in txt_list:
    with open(txt_file, "r") as f:
      page_txt = f.read()

    # filter the page that have line number instead of code
    if index_page_pattern.search(page_txt):
      continue

    doc = nlp(page_txt)
    code_list = [ent.text for ent in doc.ents]
    if code_list:
      # Take the last path component so nested directories work as well:
      # ".../page-12.txt" -> 12
      page_number = int(txt_file.split("/")[-1].split(".")[0].split("-")[1])
      pdf_page_vocab[page_number] = code_list
  return pdf_page_vocab

In [None]:
# End-to-end run of the standalone ICD-10 helpers defined in the previous cell.
# Step-1: splitting pdf file
pdf_file_name = "28page.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf
txt_list = extract_text_from_pdf(pdf_list)

# Step-3: loading and updating patterns to Spacy
# This has to happen before Step-4: search_icd_10_code reads doc.ents from the
# module-level nlp, so the entity ruler must already be in the pipeline.
# NOTE(review): re-running this cell adds the pipe to the same nlp again - confirm spaCy's behavior.
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v1.jsonl")

# Step-4: Searching ICD-10 code
#print (txt_list)
pdf_page_vocab = search_icd_10_code(txt_list)

# Step-5: Highlighting ICD-10 code into pdf
output_file_name = highlight_icd10_code(pdf_page_vocab, pdf_file_name)
print(f"File[{output_file_name}] is saved after highlighting ICD-10 code")

## Phrase matching

In [None]:
import spacy
from spacy.matcher import PhraseMatcher

In [None]:
synid_df = pd.read_csv(cfg.csv_files["IMP_CSV"])
synid_df.head()

Unnamed: 0,SynId,Short_Description
0,KW20262,US thyroid
1,KW20261,no graphic evidence of malignancy
2,KW20260,no evidence of malignancy
3,KW20259,scattered areas of fibroglandular density
4,KW20258,scattered areas of fibroglandular density


In [None]:
def make_icd_10_keyword_pattern(synid_df):
  """Return the keyword phrases stored in the ``Short_Description`` column.

  Args:
    synid_df: DataFrame that has a ``Short_Description`` column.

  Returns:
    list with one entry per row, in row order; duplicate phrases are kept.
  """
  # Direct column access replaces the row-by-row iterrows() append loop.
  return synid_df["Short_Description"].tolist()

In [None]:
# Collect every Short_Description value of synid_df as a phrase-matcher keyword.
keywords = make_icd_10_keyword_pattern(synid_df)
# Show the first ten keywords as a sanity check.
keywords[:10]

['US thyroid',
 'no graphic evidence of malignancy',
 'no evidence of malignancy',
 'scattered areas of fibroglandular density',
 'scattered areas of fibroglandular density',
 'BI-RADS 6',
 'BI-RADS 5',
 'BI-RADS 4C',
 'BI-RADS 4B',
 'BI-RADS 4A']

In [None]:
# Load the small English pipeline. This rebinds the global `nlp`, so a component
# added to the previous `nlp` object (e.g. the entity ruler from the ICD-10 step)
# is not part of this new pipeline.
nlp = spacy.load('en_core_web_sm')
# Build a PhraseMatcher on the pipeline's vocab with its default settings.
# NOTE(review): presumably the default matches on exact token text (case-sensitive) — confirm.
phrase_matcher = PhraseMatcher(nlp.vocab)
# Turn each keyword into a Doc using only the tokenizer.
patterns = list(nlp.tokenizer.pipe(keywords))
# Register all keyword Docs under the single match key 'keywords'.
phrase_matcher.add('keywords', patterns)

In [None]:
# Read the extracted text of page 2; the handle is closed before any NLP work starts.
with open("page-2.txt", "r") as page_file:
  page_txt = page_file.read()

# No line-number page filter is applied here: the page is matched unconditionally.
doc = nlp(page_txt)
matches = phrase_matcher(doc)

# Keep the text of every matched span, in the order the matcher reports them.
keyword_list = [str(doc[start:end]) for _, start, end in matches]

In [None]:
# Display the keywords collected by the preceding cell (page-2.txt).
keyword_list

['T', 'HHS', 'US']

In [None]:
# Read the extracted text of page 25; the handle is closed before any NLP work starts.
with open("page-25.txt", "r") as page_file:
  page_txt = page_file.read()

# No line-number page filter is applied here: the page is matched unconditionally.
doc = nlp(page_txt)
matches = phrase_matcher(doc)

# Keep the text of every matched span, in the order the matcher reports them.
keyword_list = [str(doc[start:end]) for _, start, end in matches]

In [None]:
# Display the keywords collected by the preceding cell (page-25.txt).
keyword_list

['Hypertriglyceridemia',
 'Hypertriglyceridemia',
 'EKG',
 'Hypertension',
 'Atrial Fibrillation',
 'Headache']

In [None]:
# Read the extracted text of page 36; the handle is closed before any NLP work starts.
with open("page-36.txt", "r") as page_file:
  page_txt = page_file.read()

# No line-number page filter is applied here: the page is matched unconditionally.
doc = nlp(page_txt)
matches = phrase_matcher(doc)

# Keep the text of every matched span, in the order the matcher reports them.
keyword_list = [str(doc[start:end]) for _, start, end in matches]

In [None]:
# Display the keywords collected by the preceding cell (page-36.txt).
keyword_list

['Hypertriglyceridemia',
 'Hypertriglyceridemia',
 'EKG',
 'Hypertension',
 'Atrial Fibrillation',
 'Diplopia',
 'Headache',
 'Visual Loss',
 'Gynecomastia',
 'Dysphagia',
 'Hematuria']

In [None]:
# Reload the SynId/keyword table, this time from a CSV in the working directory
# rather than a path taken from cfg, and rebuild the keyword list from it.
synid_df = pd.read_csv("synid_and_keywords_impairment.csv")
keywords = make_icd_10_keyword_pattern(synid_df)

In [None]:
# Rebuild the matcher from scratch so it only holds the current `keywords`.
phrase_matcher = PhraseMatcher(nlp.vocab)
# Turn each keyword into a Doc using only the tokenizer.
patterns = list(nlp.tokenizer.pipe(keywords))
# Register all keyword Docs under the single match key 'keywords'.
phrase_matcher.add('keywords', patterns)

In [None]:
# Read the extracted text of page 36; the handle is closed before any NLP work starts.
with open("page-36.txt", "r") as page_file:
  page_txt = page_file.read()

# No line-number page filter is applied here: the page is matched unconditionally.
doc = nlp(page_txt)
matches = phrase_matcher(doc)

# Keep the text of every matched span, in the order the matcher reports them.
keyword_list = [str(doc[start:end]) for _, start, end in matches]

In [None]:
# Display the keywords collected by the preceding cell (page-36.txt, rebuilt matcher).
keyword_list

['Hypertriglyceridemia',
 'Hypertriglyceridemia',
 'EKG',
 'Hypertension',
 'Atrial Fibrillation',
 'Diplopia',
 'Headache',
 'Visual Loss',
 'Gynecomastia',
 'Dysphagia',
 'Hematuria',
 'Headaches']

In [None]:
# Inspect the entry stored under key 30 of wrong_keyword_dict1.
# NOTE(review): wrong_keyword_dict1 is not defined in this part of the notebook —
# confirm that an earlier cell creates it, otherwise this fails on a fresh kernel.
wrong_keyword_dict1[30]

{'ALP',
 'ALT',
 'AST',
 'Albumin',
 'Alkaline Phosphatase',
 'Anion Gap',
 'Aspartate',
 'BUN',
 'Bilirubin Total',
 'Blood Urea Nitrogen',
 'CBC',
 'CMP',
 'CO2',
 'Calcium',
 'Chloride',
 'Complete Blood Count',
 'Comprehensive Metabolic Panel',
 'Creatinine',
 'Encounter',
 'GFR',
 'Glucose',
 'Hypertriglyceridemia',
 'Lipid Panel',
 'MI',
 'Potassium',
 'Sodium',
 'Urea',
 'Urea Nitrogen',
 'ppointment'}

In [None]:
# Materialise the key-30 keyword collection as a list and report how many entries it has.
keywords = list(wrong_keyword_dict1[30])
len(keywords)

29

In [None]:
# Number of distinct keywords.
len(set(keywords))

29

In [None]:
# Load a fresh small English pipeline; this rebinds the global `nlp` used by the cells below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Rebuild the matcher from scratch so it only holds the current `keywords`.
phrase_matcher = PhraseMatcher(nlp.vocab)
# Turn each keyword into a Doc using only the tokenizer.
patterns = list(nlp.tokenizer.pipe(keywords))
# Register all keyword Docs under the single match key 'keywords'.
phrase_matcher.add('keywords', patterns)

In [None]:
# Read the extracted text of page 30; the handle is closed before any NLP work starts.
with open("page-30.txt", "r") as page_file:
  page_txt = page_file.read()

# No line-number page filter is applied here: the page is matched unconditionally.
doc = nlp(page_txt)
matches = phrase_matcher(doc)

# Keep the text of every matched span, in the order the matcher reports them.
keyword_list = [str(doc[start:end]) for _, start, end in matches]

# Total number of matches, followed by the number of distinct matched texts.
print(len(keyword_list))
len(set(keyword_list))

37


29

In [None]:

# Register the key-30 keywords as whole-phrase patterns on a fresh matcher.
phrase_matcher = PhraseMatcher(nlp.vocab)

keywords = [keyword for keyword in wrong_keyword_dict1[30]]
# PhraseMatcher patterns are Doc objects (one per phrase), not Matcher-style
# token dicts, and add() takes them as a single list right after the key.
# The former add('keywords', None, pattern1) call raised a TypeError.
pattern1 = list(nlp.tokenizer.pipe(keywords))
phrase_matcher.add('keywords', pattern1)

TypeError: ignored