<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/19_icd_10_code_common_keyword_and_keyword_impairment_highlighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

In [None]:
# fuzzywuzzy supplies the token-set ratio used to score a code description against page text
!pip install fuzzywuzzy

In [None]:
import re
import os
import glob
import json

import numpy as np
import pandas as pd

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfReader, PdfFileWriter, PdfWriter

from spacy.lang.en import English

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz

from concurrent import futures
from keyword_extraction import call

In [None]:
import nltk

# Tokenizer data for word_tokenize and the English stop-word list.
# Newer NLTK releases load the tokenizer from "punkt_tab" instead of "punkt", so both
# are requested; a release that does not know a resource only reports it and carries on.
for nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nltk_resource)

In [None]:
# folder that holds the PDFs to process (the batch loop lists this folder)
!mkdir -p input_files

## Core Classes

In [None]:
class Highlighter:
    def __init__(self):
        """Load the ICD-10 resources and make sure the working folders exist.

        Reads ``icd10_code_patterns-v5.jsonl`` (entity-ruler patterns) and
        ``icd_10_code_and_keywords_v2.csv`` from the current working directory
        and creates the ``pdf-files``, ``txt-files`` and ``output`` folders.
        """
        from collections import defaultdict

        # blank English pipeline whose entity ruler tags ICD-10 codes
        self.nlp_code10 = English()
        self.nlp_code10.add_pipe("entity_ruler").from_disk("icd10_code_patterns-v5.jsonl")

        # loading stop words
        self.stop_words = set(stopwords.words('english'))

        # ICD-10 code dataset; the "Code" and "Keyword" columns are read later on
        self.code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")

        # Per-page impairment keywords, normally replaced via set_impairment_keyword_dict().
        # A defaultdict lets page lookups succeed (with no keywords) before that call is made.
        self.impairment_keyword_dict = defaultdict(list)
        # Text files written by the most recent extract_text_from_pdf() call.
        self.text_list = []

        # define required directory path
        self.PDF_FILES_PATH = "pdf-files"
        self.TXT_FILES_PATH = "txt-files"
        self.OUTPUT_FILES_PATH = "output"
        for working_dir in (self.PDF_FILES_PATH, self.TXT_FILES_PATH, self.OUTPUT_FILES_PATH):
            create_directory(working_dir)

    def set_impairment_keyword_dict(self, imp_keyword_dict):
      self.impairment_keyword_dict = imp_keyword_dict

    def split_pdf(self, pdf_path):
        """Split a PDF into one single-page PDF per page.

        Every page is written to ``<PDF_FILES_PATH>/page-<index>.pdf`` where
        ``index`` starts at 0.

        :param pdf_path: path of the PDF to split.
        :return: list of the created file names (without the folder), in page order.
        """
        pdf_list = []
        # the context manager closes the source file even when writing a page fails
        with open(pdf_path, "rb") as pdf_in_file:
            # parse the document once and reuse the reader for every page
            reader = PdfReader(pdf_in_file)
            for page_index, page in enumerate(reader.pages):
                page_file_name = f"page-{page_index}.pdf"
                writer = PdfWriter()
                writer.add_page(page)
                with open(f"{self.PDF_FILES_PATH}/{page_file_name}", "wb") as output_stream:
                    writer.write(output_stream)
                pdf_list.append(page_file_name)
        return pdf_list

    def extract_text_from_pdf(self, pdf_list):
        """Extract the text of each single-page PDF into its own text file.

        The n-th entry of ``pdf_list`` (read from ``PDF_FILES_PATH``) is written
        to ``<TXT_FILES_PATH>/page-<n>.txt``. The resulting list is also stored
        in ``self.text_list``.

        :param pdf_list: PDF file names relative to ``PDF_FILES_PATH``.
        :return: list of the text file paths, in the order of ``pdf_list``.
        """
        txt_file_list = []
        for page_index, pdf_file in enumerate(pdf_list):
            with open(os.path.join(self.PDF_FILES_PATH, pdf_file), "rb") as pdf_stream:
                pdf = pdftotext.PDF(pdf_stream)

            # Read all the text into one string
            pdf_text = "\n\n".join(pdf)

            # the path is deliberately joined with "/", the form callers have always received
            txt_file_name = f"{self.TXT_FILES_PATH}/page-{page_index}.txt"
            # "w" rather than "a": a file left behind by an earlier or aborted run must be
            # replaced, otherwise the page text would be appended a second time
            with open(txt_file_name, "w") as txt_stream:
                txt_stream.write(pdf_text)
            txt_file_list.append(txt_file_name)
        self.text_list = txt_file_list
        return txt_file_list

    def highlight_icd_code(self, icd10_code_dict, match_threshold=30, coordinate=False, pdf_file_name=None,
                           cords_file_name=None):
        """Highlight ICD-10 codes and their supporting words in a PDF and log the result.

        For every occurrence of a code on its page, ``get_best_token_match`` is
        asked for keyword matches. The code is highlighted light green when a
        match score reaches ``match_threshold`` and dark salmon otherwise, the
        matching words are highlighted too, and every result is recorded in a
        pipe-separated log file and in ``<output>/<pdf name>.json``.

        :param icd10_code_dict: zero-based page index -> codes found on that page.
        :param match_threshold: minimum score for a code to be shown in green.
        :param coordinate: when True, rectangles are written to the log file as well.
        :param pdf_file_name: path of the PDF to annotate.
        :param cords_file_name: name of the log file inside the output folder
            (opened in append mode).
        :return: tuple ``(path of the highlighted PDF, cords_file_name)``.
        """
        # input name without folder and extension; works for any folder depth
        base_name = os.path.basename(pdf_file_name).split(".")[0]
        json_data = []

        def paint_code(page_obj, code_rect, validated):
            # light green for a validated code, dark salmon for an unvalidated one
            stroke = [0.66, 1, 0.07] if validated else [0.92, 0.59, 0.48]
            page_highlight = page_obj.add_highlight_annot(code_rect)
            page_highlight.set_colors(stroke=stroke)
            page_highlight.update()

        def highlight_common_words(page_obj, common_words_coord_dict):
            # highlight every word rectangle and return the word -> rectangle mapping
            highlight_coords_dict = {}
            for common_word, common_word_coord in common_words_coord_dict.items():
                word_highlight = page_obj.add_highlight_annot(common_word_coord)
                word_highlight.update()
                highlight_coords_dict[common_word] = common_word_coord
            return highlight_coords_dict

        pdf_file = fitz.open(pdf_file_name)
        try:
            # the context manager closes the log file even if a page fails
            with open(f"{self.OUTPUT_FILES_PATH}/{cords_file_name}", "a") as txt_output_file:
                # add file header
                txt_output_file.write(
                    "| Page | Found Code | Actual ICD10-Code | ICD 10 description | Common Words | Matched paragraph | confidence |\n ")

                for page_num, page in enumerate(pdf_file):
                    if page_num not in icd10_code_dict:
                        continue
                    num_page = page_num + 1
                    page_key = f"page-{num_page}"

                    for code in icd10_code_dict[page_num]:
                        actual_code = reverse_code_pattern(code)
                        keyword = self.get_keyword(code)
                        keyword_label = keyword if keyword else 'No keyword found'

                        for highlight in page.search_for(code):
                            print(f"Page-{num_page}-{code}, Coordinate: {highlight}")
                            # keyword matches and impairment-keyword matches around this occurrence
                            all_match_keyword_list, all_match_imp_keyword_list = self.get_best_token_match(
                                page_num, code, page, highlight[3], match_threshold)

                            # Step-1: no common words at all -> mark the code as unvalidated
                            if not all_match_keyword_list:
                                paint_code(page, highlight, False)
                                # a fresh dict per record: records must not share one object
                                json_data.append({page_key: {
                                    "icd10_code": actual_code,
                                    "page_icd10_code": code,
                                    "page_icd10_code_coord": f"{highlight}",
                                    "icd10_code_desc": keyword if keyword else "UNVALIDATED",
                                    "page_icd_common_words": "UNVALIDATED",
                                    "page_icd_keywords_coord": {},
                                    "page_icd_keywords_found": "N",
                                    "match_confidence": 0
                                }})
                                # log data to file
                                code_cors_output = (f"|Page-{num_page} | {code} | {actual_code}"
                                                    f"| {keyword_label} | 'UNVALIDATED' | 0 |")
                                txt_output_file.write("%s\n" % code_cors_output)
                                if coordinate:
                                    txt_output_file.write("%s\n" % f"|{actual_code}:{highlight}|")

                            # Step-2: one record per paragraph that shares words with the description
                            for match_keyword_dict in all_match_keyword_list:
                                score = match_keyword_dict["score"]
                                common_words = match_keyword_dict["common_words"]
                                has_common_words = len(common_words) > 0
                                common_words_label = common_words if has_common_words else 'UNVALIDATED'

                                # highlight ICD-10 code, then the words that support it
                                paint_code(page, highlight, score >= match_threshold)
                                highlight_coords_dict = highlight_common_words(
                                    page, match_keyword_dict["common_words_coords"])

                                # | Page | Found Code | Actual ICD10-Code | ICD 10 description | Common Words | Matched paragraph | confidence |
                                code_cors_output = (f"|Page-{num_page} | {code} | {actual_code}"
                                                    f" | {keyword_label}"
                                                    f" | {common_words_label}"
                                                    f" | {match_keyword_dict['paragraph']} | {score} |")
                                txt_output_file.write("%s\n" % code_cors_output)
                                if coordinate:
                                    txt_output_file.write("%s\n" % f"|{actual_code}:{highlight}|")
                                    txt_output_file.write("%s\n" % f"|{highlight_coords_dict}|")

                                json_data.append({page_key: {
                                    "icd10_code": actual_code,
                                    "page_icd10_code": code,
                                    "page_icd10_code_coord": f"{highlight}",
                                    "icd10_code_desc": keyword if keyword else "UNVALIDATED",
                                    "page_icd_common_words": common_words_label,
                                    "page_icd_keywords_coord": f"{highlight_coords_dict}",
                                    "page_icd_keywords_found": "Y" if has_common_words else "N",
                                    "match_confidence": score
                                }})

                            # Step-3: highlight the words shared with impairment keywords
                            for match_keyword_dict in all_match_imp_keyword_list:
                                # impairment matches are stored under "imp_common_keyword_coords";
                                # "common_words_coords" is accepted as a fallback key
                                imp_coords = match_keyword_dict.get(
                                    "imp_common_keyword_coords",
                                    match_keyword_dict.get("common_words_coords", {}))
                                highlight_coords_dict = highlight_common_words(page, imp_coords)
                                if coordinate:
                                    txt_output_file.write("%s\n" % f"|{highlight_coords_dict}|")
                                json_data.append(
                                    {page_key: {"impairment_keyword_coords": f"{highlight_coords_dict}"}})

                            txt_output_file.write("\n")  # add extra line on every match code

            # build and write json file
            json_data_dump_file = f"{self.OUTPUT_FILES_PATH}/{base_name}.json"
            with open(json_data_dump_file, "w") as json_out_file:
                json.dump(json_data, json_out_file)

            # create highlighted pdf file
            pdf_output_file_name = f"{self.OUTPUT_FILES_PATH}/{base_name}_output.pdf"
            pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)
        finally:
            # release the document whether or not annotation succeeded
            pdf_file.close()

        return pdf_output_file_name, cords_file_name

    def search_icd_code(self, txt_list):
        pdf_page_vocab = {}
        for txt_file in txt_list:
            with open(txt_file, "r") as f:
                page_txt = f.read()

                # check the page that have line number instead of code
                index_page = False
                if re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
                    index_page = True

                doc = self.nlp_code10(page_txt)
                code_list = []
                for ent in doc.ents:
                    if index_page:
                        # check the code contain letter "L"
                        if re.search("(L[0-9]+)", ent.text):
                            continue
                        else:
                            code_list.append(ent.text)
                    else:
                        code_list.append(ent.text)

                # code_list = [ent.text for ent in doc.ents if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", ent.text)]
                if len(code_list) != 0:
                    page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
                    pdf_page_vocab[page_number] = list(set(code_list))
                    # print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
        return pdf_page_vocab

    def get_best_token_match(self, page_num, p_code, page_obj, code_y2_coord, match_threshold):
        all_match_keyword_list = []
        all_match_imp_keyword_list = []
        # Step 1: reverse code pattern
        reversed_icd_code = reverse_code_pattern(p_code)
        # Step 2: fetch keyword based on code
        keyword = self.get_keyword(reversed_icd_code)
        # Step 3: get code paragraph
        code_paragraph_list = self.get_paragraph(page_obj, code_y2_coord)
        # print(f"code_paragraph-2: {code_paragraph}")
        # Step 4: get common words
        for code_paragraph in code_paragraph_list:
            code_para_y2_coords_list = []
            match_keyword_dict = {}
            common_words = self.get_common_words(keyword, code_paragraph)
            if len(common_words) > 0:
              match_keyword_dict["common_words"] = common_words
              # Step 4: get best token match ratio
              clean_paragraph = " ".join(self.clean_text(code_paragraph))
              match_keyword_dict["paragraph"] = clean_paragraph
              match_score = fuzz.token_set_ratio(keyword, clean_paragraph) if fuzz.token_set_ratio(keyword,
                                                                                                    clean_paragraph) > match_threshold else 0
              match_keyword_dict["score"] = match_score
              # Step 5: build common words coordinate dict
              common_words_coord_dict = {}
              for common_word in common_words:
                  highlight_list = page_obj.search_for(common_word)
                  for highlight in highlight_list:
                      # get common word y2 coord value
                      common_word_y2_coords = highlight[3]
                      if (code_y2_coord - 20) <= common_word_y2_coords <= (code_y2_coord + 20):
                          common_words_coord_dict[common_word] = highlight
                          code_para_y2_coords_list.append(common_word_y2_coords)
              match_keyword_dict["common_words_coords"] = common_words_coord_dict
              all_match_keyword_list.append(match_keyword_dict)
            else:
              for keyword_impairment_list in self.impairment_keyword_dict[page_num]:
                for keyword_impairment in keyword_impairment_list:
                  common_words = self.get_common_words(keyword_impairment, code_paragraph)
                  imp_common_words_coord_dict = {}
                  for common_word in common_words:
                    highlight_list = page_obj.search_for(common_word)
                    for highlight in highlight_list:
                      # get common word y2 coord value
                      common_word_y2_coords = highlight[3]
                      if common_word_y2_coords not in code_para_y2_coords_list:
                        imp_common_words_coord_dict[common_word] = highlight
                  match_keyword_dict["imp_common_keyword_coords"] = imp_common_words_coord_dict
                all_match_imp_keyword_list.append(match_keyword_dict)

        return all_match_keyword_list, all_match_imp_keyword_list

    def get_keyword(self, p_code):
        keyword = ""
        # reverse code if required
        code = reverse_code_pattern(p_code)
        # get keyword from dataset
        keyword_list = list(self.code_df.loc[self.code_df["Code"] == code]["Keyword"])
        if len(keyword_list) > 0:
            keyword = keyword_list[0]
        return keyword

    def get_paragraph(self, page_obj, code_y2_coord):
        code_paragraph_list = []
        page_content = page_obj.get_text("blocks", sort=False)
        for content in page_content:
            if content[1] <= code_y2_coord <= content[3]:
                if len(self.clean_text(content[4])) > 0:
                    code_paragraph = content[4]
                    code_paragraph_list.append(code_paragraph)
        return code_paragraph_list

    def get_common_words(self, sent1, sent2):
        clean_token1 = self.clean_text(sent1)
        clean_token2 = self.clean_text(sent2)
        token_set1 = set(clean_token1)
        token_set2 = set(clean_token2)

        common_word_set = set()

        def get_common(token_set1, token_set2):
            for w1 in token_set1:
                for w2 in token_set2:
                    if w1.lower() == w2.lower():
                        common_word_set.add(w1)

        get_common(token_set1, token_set2)
        get_common(token_set2, token_set1)
        return list(common_word_set)

    def clean_text(self, sent):
        # tokenize sentence
        sent1 = word_tokenize(sent)
        # filter stop words
        filtered_sent = [w for w in sent1 if not w.lower() in self.stop_words]
        filtered_sent = [w for w in filtered_sent if re.sub(re.compile('\W'), '', w)]
        clean_tokens = []
        for token in filtered_sent:
            if token.find("-"):
                tokens = token.split("-")
                clean_tokens.extend(tokens)
            else:
                clean_tokens.append(token)
        return clean_tokens


def find_nearest(array, value):
    """Return the element of ``array`` with the smallest absolute distance to ``value``.

    When several elements are equally close, the first one is returned.
    """
    candidates = np.asarray(array)
    distances = np.abs(candidates - value)
    return candidates[distances.argmin()]


def reverse_code_pattern(p_code):
    """Normalise an ICD-10 code as it was read from a page.

    Two kinds of repair are applied:

    * a space or comma used as separator is turned into a dot
      (``"Z87 5"`` and ``"R03,0"`` become ``"Z87.5"`` and ``"R03.0"``); with
      two commas the first and the last part are joined;
    * look-alike characters are swapped: a digit in first position becomes
      the corresponding letter, a letter in second (and then third) position
      becomes the corresponding digit.

    :param p_code: code text found on the page.
    :return: the normalised code.
    """
    orig_code = p_code

    # check for code contains space(" ")
    tmp_code = orig_code.split(" ")
    if len(tmp_code) > 1:
        orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

    # check for code contains dot(".") -- rebuilt from the raw input, which also
    # strips blanks around the dot
    tmp_code = p_code.split(".")
    if len(tmp_code) > 1:
        orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

    # check for code contains comma(",")
    tmp_code = p_code.split(",")
    if len(tmp_code) == 2:
        orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"
    elif len(tmp_code) == 3:
        # this branch reads tmp_code[2], so it can only apply to three parts;
        # it used to repeat the `== 2` test and was therefore never reached
        orig_code = f"{tmp_code[0].strip()}.{tmp_code[2].strip()}"

    # letter -> look-alike digit (presumably typical misreads; verify against the data)
    look_alikes = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6", "o": "9", "i": "1"}
    for letter, digit in look_alikes.items():
        # first position must be a letter: replace a leading look-alike digit
        if orig_code.find(digit) == 0:
            orig_code = replacer(orig_code, letter, 0)
        # second position must be a digit: replace a look-alike letter there
        if orig_code.find(letter) == 1:
            orig_code = replacer(orig_code, digit, 1)
            # same letter again in third position
            if orig_code.find(letter) == 2:
                orig_code = replacer(orig_code, digit, 2)
            # only the first letter found in second position is repaired
            break

    return orig_code


def replacer(s, newstring, index, nofail=False):
    """Replace the character at ``index`` of ``s`` with ``newstring``.

    With ``nofail=False`` an index outside the string raises ``ValueError``.
    With ``nofail=True`` a negative index prepends ``newstring`` and an index
    at or past the end appends it.
    """
    index_is_valid = index in range(len(s))
    if not (nofail or index_is_valid):
        raise ValueError("index outside given string")

    # lenient mode: out-of-range positions turn into prepend / append
    if index < 0:
        return newstring + s
    if index > len(s):
        return s + newstring

    # regular case: keep everything before and after the replaced character
    head, tail = s[:index], s[index + 1:]
    return head + newstring + tail


def create_directory(dir_name):
    """Create ``dir_name`` (including missing parent folders) unless it already exists.

    :raises FileExistsError: if ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate existence check, which could race with
    # another process creating the folder between the check and the call
    os.makedirs(dir_name, exist_ok=True)

In [None]:
class SentenceExtractor:
    """Collects key phrases per text file and the lines in which they occur."""

    def __init__(self):
        # upper bound for the thread pool used by get_wrong_keyword_dict
        self.MAX_WORKERS = 20

    def get_json_array_list(self, text_path):
        """Run the keyword extraction on one text file.

        :param text_path: path handed to ``keyword_extraction.call``.
        :return: whatever ``call`` returns, or ``None`` when it raised; the
            error is printed instead of propagated so one bad file does not
            stop a batch.
        """
        json_arr = None
        try:
            json_arr = call(text_path)
        except Exception as err:
            print(f"Error for file[{text_path}] is:\n{err}")
        return json_arr

    @staticmethod
    def _collect_keywords(json_arr_list):
        """Map each file index to the set of first values of its result entries."""
        keyword_dict = {}
        for idx, json_arr in enumerate(json_arr_list):
            # a failed extraction is None: treat it as "no keywords" instead of
            # iterating None; empty entries have no first value and are skipped
            keyword_dict[idx] = {
                list(element.values())[0]
                for element in (json_arr or [])
                if element
            }
        return keyword_dict

    def get_wrong_keyword_dict(self, text_files_list, with_thread=False, with_process=False):
        """Extract the key phrases of every text file.

        :param text_files_list: paths of the text files, one per page.
        :param with_thread: run the extraction in a thread pool.
        :param with_process: run the extraction in a process pool (ignored
            when ``with_thread`` is set).
        :return: dict mapping the position of each file in ``text_files_list``
            to the set of phrases found in it; files whose extraction failed
            map to an empty set.
        """
        text_files_list = list(text_files_list)
        if not text_files_list:
            # nothing to do; also avoids creating a pool with zero workers
            return {}

        if with_thread:
            # take care so that unnecessary thread should not be created
            workers = min(self.MAX_WORKERS, len(text_files_list))
            with futures.ThreadPoolExecutor(max_workers=workers) as executor:
                json_arr_list = list(executor.map(self.get_json_array_list, text_files_list))
        elif with_process:
            with futures.ProcessPoolExecutor(max_workers=4) as executor:
                json_arr_list = list(executor.map(self.get_json_array_list, text_files_list))
        else:
            json_arr_list = [self.get_json_array_list(text_path) for text_path in text_files_list]

        return self._collect_keywords(json_arr_list)

    def extract_sentence(self, wrong_keyword_list, sample_text_list):
        """Find the lines of each text file that contain its key phrases.

        :param wrong_keyword_list: dict mapping a file index to its key phrases.
        :param sample_text_list: text file paths, indexed by the same keys.
        :return: dict mapping each file index to ``{phrase: [matching lines]}``;
            phrases without a matching line are left out. Matching is
            case-sensitive.
        """
        match_keyword_dict = {}
        for key, keyword_set in wrong_keyword_list.items():
            match_dicts = {}
            if keyword_set:
                # read the file once per page rather than once per phrase
                with open(sample_text_list[key], "r") as f:
                    lines = f.read().split("\n")
                for key_phrase in keyword_set:
                    phrase_text = str(key_phrase)
                    if not phrase_text:
                        continue
                    # plain substring test: characters such as "(" or "+" in a phrase are
                    # matched literally, and a last line without "\n" is found as well
                    match_list = [line for line in lines if phrase_text in line]
                    if match_list:
                        match_dicts[key_phrase] = match_list
            match_keyword_dict[key] = match_dicts
        return match_keyword_dict

## Keyword Matching & Highlighting

- Step 1 - Z87.5
- Step 2 - Personal history of complications of pregnancy, childbirth and the puerperium
- Step 3 - Page keyword
- Step 4 - calculate cosine similarity
- Step 5 - "Green" > 60% otherwise "Yellow"

In [None]:
# Start from empty folders. NOTE: this also deletes every PDF already placed in
# input_files, so the PDFs to process have to be added after this cell has run.
!rm -rf input_files
!mkdir -p input_files
!rm -rf output
!mkdir -p output

In [None]:
def purge(file_path):
  """Delete every file that matches the glob pattern ``file_path``."""
  matching_paths = glob.glob(file_path)
  for matching_path in matching_paths:
    os.remove(matching_path)

In [None]:
# Step-0: folder scanned for input PDFs, plus the two helpers used by the batch loop
INPUT_PDF_FILES_PATH = "input_files"

# Highlighter() reads its pattern and keyword files and creates its working folders when constructed
highlighter = Highlighter()
sent_extractor = SentenceExtractor()

In [None]:
%%time

for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"
  cords_file_name = f"{pdf_file_name.split('/')[1].split('.')[0]}_cords.txt"

  # Step-1: splitting pdf file
  pdf_list = highlighter.split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = highlighter.extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  icd10_code_dict = highlighter.search_icd_code(txt_list)

  # Step-4: Get closet match of ICD-10 keyword
  wrong_keyword_dict = sent_extractor.get_wrong_keyword_dict(txt_list)
  highlighter.set_impairment_keyword_dict(wrong_keyword_dict)

  # Step-4: Highlighting ICD-10 code into pdf
  pdf_output_file, txt_output_file = highlighter.highlight_icd_code(icd10_code_dict,
                                                                    match_threshold=35,
                                                                    coordinate=True,
                                                                    pdf_file_name=pdf_file_name,
                                                                    cords_file_name=cords_file_name)
  print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
  print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

  # remove all pdf and text files
  purge("pdf-files/*.pdf")
  purge("txt-files/*.txt")
  pdf_list = []
  txt_list = []

Page-1-268.25, Coordinate: Rect(261.45489501953125, 64.00006103515625, 285.21258544921875, 72.00396728515625)
Page-1-R53.83, Coordinate: Rect(447.8690185546875, 54.60003662109375, 471.3388671875, 63.60443115234375)
Page-1-R03.0, Coordinate: Rect(287.4623718261719, 223.8399658203125, 305.3106994628906, 231.8438720703125)
Page-1-278.9, Coordinate: Rect(360.775390625, 64.00006103515625, 380.4234619140625, 72.00396728515625)
File[output/28page_output.pdf] is saved after highlighting ICD-10 code
Highlighted coordinates are saved into [28page_cords.txt] file.
CPU times: user 360 ms, sys: 13.9 ms, total: 374 ms
Wall time: 374 ms


In [None]:
# remove any per-page files that are still lying around
for leftover_pattern in ("pdf-files/*.pdf", "txt-files/*.txt"):
  purge(leftover_pattern)

In [None]:
# bundle the generated files for download (`*.*` only picks up names that contain a dot)
!zip output.zip output/*.*