<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/13_icd_10_code_highlight_with_keyword_match_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

In [None]:
!pip install fuzzywuzzy

In [None]:
import numpy as np
import pandas as pd
import re
import os
import sys
import glob
import difflib
import pickle
from pathlib import Path
from difflib import SequenceMatcher

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfReader, PdfFileWriter, PdfWriter

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [2]:
stop_words = set(stopwords.words('english'))

In [5]:
!mkdir -p input_files

##Core Classes

In [3]:
class Highlighter:
  def __init__(self, code_df):
    # loading and updating patterns for ICD-10 code
    self.nlp_code10 = English()
    self.nlp_code10.add_pipe("entity_ruler").from_disk("icd10_code_patterns-v5.jsonl")

    # define icd-10 code dataset
    self.code_df = code_df
    self.text_list = None

    # define required directory path
    self.PDF_FILES_PATH = "pdf-files"
    self.TXT_FILES_PATH = "txt-files"
    self.OUTPUT_FILES_PATH = "output"
    create_directory(self.PDF_FILES_PATH)
    create_directory(self.TXT_FILES_PATH)
    create_directory(self.OUTPUT_FILES_PATH)

  def split_pdf(self, pdf_path):
      pdf_in_file = open(pdf_path, "rb")
      pdf = PdfReader(pdf_in_file)
      pdf_list = []
      for page in range(len(pdf.pages)):
          input_pdf = PdfReader(pdf_in_file)
          output = PdfWriter()
          #output.addPage(input_pdf.getPage(page))
          output.add_page(input_pdf.pages[page])
          with open(f"{self.PDF_FILES_PATH}/page-{page}.pdf", "wb") as outputStream:
              output.write(outputStream)
              pdf_list.append(f"page-{page}.pdf")
      return pdf_list

  def extract_text_from_pdf(self, pdf_list):
    txt_file_list = []
    i = 0
    for pdf_file in pdf_list:
      with open(os.path.join(self.PDF_FILES_PATH, pdf_file), "rb") as f:
        pdf = pdftotext.PDF(f)

      # Read all the text into one string
      pdf_text = "\n\n".join(pdf)

      # write text into file
      with open(f"{self.TXT_FILES_PATH}/page-{str(i)}.txt", "a") as f:
        f.write(pdf_text)
      txt_file_list.append(f"{self.TXT_FILES_PATH}/page-{str(i)}.txt")
      i += 1
    self.text_list = txt_file_list
    return txt_file_list

  def highlight_icd_code(self, icd10_code_dict, w_ratio=False, match_threshold=30, coordinate=False, pdf_file_name=None, cords_file_name=None):
      pdf_file = fitz.open(pdf_file_name)
      # create file to write coordinate
      txt_output_file_name = open(f"{self.OUTPUT_FILES_PATH}/{cords_file_name}", "a")
      # add file header 
      txt_output_file_name.write("| Page | Found Code | Actual ICD10-Code | Code Line # | ICD 10 description | Matched Line | Common Words | Matched Line # | confidence |\n ")

      def highlight_code(p_highlight, icd10_code, num_page):
        match_list = []
        sentence_list = []
        line_list = []
        keyword = ""
        # do the color coding
        keyword = self.get_keyword(icd10_code)
        if len(keyword) > 0:
          match_list, sentence_list, line_list = self.get_best_token_match(icd10_code, num_page, w_ratio, match_threshold)
          # highlight code if threshold is more than 30
          if match_list and match_list[0][1] >= match_threshold:
            page_highlight = page.add_highlight_annot(p_highlight)
            page_highlight.set_colors(stroke=[0.66, 1, 0.07])  # light green
            page_highlight.update()
        return match_list, sentence_list, line_list, keyword

      def highlight_common_words(page_obj, y_coords_array, curr_y_coord, common_word_list):
        highlight_coords_dict = {}
        #get rect for all words
        page_content = page_obj.get_text("words", sort=False)  
        # find closet y coordinate value
        closet_y_coords = find_nearest(y_coords_array, value=curr_y_coord)
        for content in page_content:
          for common_word in common_word_list:
            curr_word = clean_text(content[4])[0] if len(clean_text(content[4])) > 0 else ""
            if curr_word == common_word and (closet_y_coords-10) <= content[1] <= (closet_y_coords + 10):
              #if content[4].replace("(", "").replace(")", "") == common_word and (closet_y_coords-10) <= content[1] <= (closet_y_coords + 10):
              rect_comp = fitz.Rect(content[0], content[1], content[2], content[3])
              #print(f"Line #{content[6]}-{content[4]} > {rect_comp}")
              highlight = page_obj.add_highlight_annot(rect_comp)
              highlight.set_colors(stroke=[1, 1, 0])
              highlight.update()
              highlight_coords_dict[common_word] = rect_comp
        return highlight_coords_dict
              
      for page_num, page in enumerate(pdf_file):
        # highlight ICD-10 code
        if page_num in icd10_code_dict:
          for code in icd10_code_dict[page_num]:
            highlight_list = page.search_for(code)
            #print(f"Code: {code}, Coordinate: {highlight_list}")
            # prepare list for y0 coord value
            y_coords_list = [highlight[1] for highlight in highlight_list]
            for highlight in highlight_list:
              # highlight ICD-10 code  
              match_list, sentence_list, line_list, keyword = highlight_code(highlight, code, page_num)
              # write all info into text file
              curr_match_score = 0.0
              num_page = page_num + 1
              for idx, match_found in enumerate(match_list):
                # match_found: ('Diagnosis: Encntr for general adult medical exam w/o abnormal findings (Z00.00)', 100)
                max_match_score = match_found[1]
                max_match_sent = match_found[0]
                if max_match_score >= match_threshold:
                  curr_match_score = max_match_score
                  common_words = get_common_words(keyword, max_match_sent)
                  highlight_coords_dict = highlight_common_words(page, y_coords_list, highlight[1], common_words)
                  # | Page | Found Code | Actual ICD10-Code | Code Line # | ICD 10 description | Matched Line| Common Words | Matched Line # | confidence | 
                  code_cors_output = f"|Page-{num_page} | {code} | {reverse_code_pattern(code)} | {line_list} | {keyword if keyword else 'Not available'} | {max_match_sent} | {common_words} | {sentence_list.index(max_match_sent) + 1} | {max_match_score} |"
                  txt_output_file_name.write("%s\n" % code_cors_output)
                  txt_output_file_name.write("%s\n" % f"{highlight_coords_dict}")
              if coordinate and curr_match_score >= match_threshold:
                txt_output_file_name.write("%s\n" % f"|{highlight}|")
              txt_output_file_name.write("\n") # add extra line on every match code

      txt_output_file_name.close()
      pdf_output_file_name = f"{self.OUTPUT_FILES_PATH}/{pdf_file_name.split('/')[1].split('.')[0]}_output.pdf"
      pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)

      return pdf_output_file_name, cords_file_name

  def get_opt_pattern(self, icd_10_code):
    # create alternate pattern
    code_arr = icd_10_code.split(".")
    if len(code_arr) > 1:
      code1 = f"{code_arr[0]}. {code_arr[1]}"
      code2 = f"{code_arr[0]} .{code_arr[1]}"
      code3 = f"{code_arr[0]} . {code_arr[1]}"
      return [code1, code2, code3]
    else:
      return icd_10_code

  def search_icd_code(self, txt_list):
    pdf_page_vocab = {}
    for txt_file in txt_list:
      with open(txt_file, "r") as f:
        page_txt = f.read()

        # check the page that have line number instead of code
        index_page = False
        if re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
          index_page = True

        doc = self.nlp_code10(page_txt)
        code_list = []
        for ent in doc.ents:
          if index_page:
            # check the code contain letter "L"
            if re.search("(L[0-9]+)", ent.text):
              continue
            else:
              code_list.append(ent.text)
          else:
            code_list.append(ent.text)

        #code_list = [ent.text for ent in doc.ents if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", ent.text)]
        if len(code_list) != 0:
          page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
          pdf_page_vocab[page_number] = list(set(code_list)) 
          # print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
    return pdf_page_vocab

  def get_keyword(self, p_code):
    keyword = ""
    # reverse code if required
    code = reverse_code_pattern(p_code)
    # get keyword from dataset
    keyword_list = list(self.code_df.loc[self.code_df["Code"] == code]["Keyword"])
    if len(keyword_list) > 0:
      keyword = keyword_list[0]
    return keyword

  def get_best_token_match(self, p_code, num_page, w_ratio, match_threshold):
    # Step 1: reverse code pattern
    reversed_icd_code = reverse_code_pattern(p_code)
    # Step 2: fetch keyword based on code 
    keyword = self.get_keyword(reversed_icd_code)
    # Step 3: prepare sentence list 
    sentence_list = get_sentence_list(self.text_list, num_page)
    # Step 4: get best match token ratio or wratio
    if w_ratio:
      match_list = [
        (sentence, fuzz.WRatio(keyword, sentence)) 
        for sentence in sentence_list if fuzz.WRatio(keyword, sentence) > match_threshold
      ]
    else:
      match_list = [
        (sentence, fuzz.token_set_ratio(keyword, sentence)) 
        for sentence in sentence_list if fuzz.token_set_ratio(keyword, sentence) > match_threshold
      ]
    # Step 5: get sentence line
    line_list = get_sentence_line(p_code, sentence_list)
    return sort_tuple(match_list), sentence_list, line_list

  def get_match_sentence_and_line_data(self, p_code, num_page):
    # Step 1: reverse code pattern
    reversed_icd_code = reverse_code_pattern(p_code)
    # Step 2: fetch keyword based on code 
    keyword = self.get_keyword(reversed_icd_code)
    # Step 3: prepare sentence list 
    sentence_list = get_sentence_list(self.text_list, num_page)
    # Step 4: get best 3 match ratio 
    match_list = process.extract(keyword, sentence_list, scorer = fuzz.ratio, limit = 3)
    # Step 5: get sentence line
    line_list = get_sentence_line(p_code, sentence_list)
    return match_list, sentence_list, line_list

def find_nearest(array, value):
  array = np.asarray(array)
  idx = (np.abs(array - value)).argmin()
  return array[idx]

def get_common_words(sent1, sent2):
  clean_token1 = clean_text(sent1)
  clean_token2 = clean_text(sent2)
  token_set1 = set(clean_token1)
  token_set2 = set(clean_token2)
  return list(token_set1 & token_set2)
  
def clean_text(sent):
  # tokenize sentence
  sent1 = word_tokenize(sent)
  # filter stop words
  filtered_sent = [w for w in sent1 if not w.lower() in stop_words]
  filtered_sent = [w for w in filtered_sent if re.sub(re.compile('\W'), '', w)]
  return filtered_sent

def get_sentence_line(p_code, sentence_list):
  line_list = [(line + 1) for line, sent in enumerate(sentence_list) if p_code in sent]
  return line_list

def get_sentence_list(text_list, num_page):
  with open(f"{text_list[num_page]}", "r") as f:
    lines = [line.rstrip('\n') for line in f]
    sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
  return sentence_list

def reverse_code_pattern(p_code):
  orig_code = p_code

  # check for code contains space(" ")
  tmp_code = orig_code.split(" ")
  if len(tmp_code) > 1:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

  # check for code contains dot(".")
  tmp_code = p_code.split(".")
  if len(tmp_code) > 1:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"
  
  # check for code contains comma(",")
  tmp_code = p_code.split(",")
  if len(tmp_code) == 2:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"
  elif len(tmp_code) == 2:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[2].strip()}"

  # handle if the first char of code is missing
  alphabats = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6", "o": "9", "i": "1"}
  for key, val in alphabats.items():
    # replcae char on 0 index
    if orig_code.find(val) == 0:
      #orig_code = orig_code.replace(val, key)
      orig_code = replacer(orig_code, key, 0)
    # replcae char on 1 index
    if orig_code.find(key) == 1:
      orig_code = replacer(orig_code, val, 1)
      # replcae char on 2 index
      if orig_code.find(key) == 2:
        orig_code = replacer(orig_code, val, 2)
      break

  return orig_code

def replacer(s, newstring, index, nofail=False):
  # raise an error if index is outside of the string
  if not nofail and index not in range(len(s)):
      raise ValueError("index outside given string")

  # if not erroring, but the index is still not in the correct range..
  if index < 0:  # add it to the beginning
      return newstring + s
  if index > len(s):  # add it to the end
      return s + newstring

  # insert the new string between "slices" of the original
  return s[:index] + newstring + s[index + 1:]

def sort_tuple(p_tup):
  return (sorted(p_tup, key = lambda x: x[1], reverse=True)) 

def create_directory(dir_name):
  if not os.path.exists(dir_name):
    os.makedirs(dir_name)

##Keyword Matching & Highlighting 

- Step 1 - Z87.5
- Step 2 - Personal history of complications of pregnancy, childbirth and the puerperium
- Step 3 - Page keyword
- Step 4 - calculate cosine similirity
- Step 5 - "Green" > 60% otherwise "Yellow"

In [4]:
!rm -rf input_files
!mkdir -p input_files
!rm -rf output
!mkdir -p output

In [5]:
def purge(file_path):
  for f in glob.glob(file_path):
    os.remove(f)

In [6]:
# Step-0: create highlighter instance
INPUT_PDF_FILES_PATH = "input_files"
code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")

highlighter = Highlighter(code_df)

In [8]:
%%time

for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"
  cords_file_name = f"{pdf_file_name.split('/')[1].split('.')[0]}_cords.txt"

  # Step-1: splitting pdf file
  pdf_list = highlighter.split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = highlighter.extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  icd10_code_dict = highlighter.search_icd_code(txt_list)

  # Step-4: Highlighting ICD-10 code into pdf
  pdf_output_file, txt_output_file = highlighter.highlight_icd_code(icd10_code_dict,
                                                                    w_ratio=False,
                                                                    match_threshold=45,
                                                                    coordinate=True,
                                                                    pdf_file_name=pdf_file_name,
                                                                    cords_file_name=cords_file_name)
  print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
  print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

  # remove all pdf and text files
  purge("pdf-files/*.pdf")
  purge("txt-files/*.txt")
  pdf_list = []
  txt_list = []

Code: E78.3, Coordinate: [Rect(226.35824584960938, 520.47998046875, 243.45660400390625, 528.48388671875)]
Code: Z20.822, Coordinate: [Rect(311.13006591796875, 114.6400146484375, 337.1675720214844, 122.6439208984375)]
Code: R05.9, Coordinate: [Rect(135.0170440673828, 307.5999450683594, 152.56533813476562, 315.6038513183594)]
Code: E78.3, Coordinate: [Rect(226.3582305908203, 610.0399780273438, 243.45651245117188, 617.0433959960938)]
Code: E78.3, Coordinate: [Rect(226.35824584960938, 492.6400146484375, 243.45660400390625, 500.6439208984375)]
Code: Z12.5, Coordinate: [Rect(127.9977035522461, 609.760009765625, 145.24603271484375, 617.763916015625)]
Code: 200.00, Coordinate: [Rect(344.1510009765625, 599.4400024414062, 364.45306396484375, 607.4439086914062)]
Code: R05.9, Coordinate: [Rect(135.01699829101562, 426.8800048828125, 152.56529235839844, 434.8839111328125)]
Code: Z00.00, Coordinate: [Rect(344.9269104003906, 74.320068359375, 365.5649108886719, 82.323974609375), Rect(344.9268798828125,

In [None]:
purge("pdf-files/*.pdf")
purge("txt-files/*.txt")

In [9]:
!zip output.zip output/*.*

  adding: output/01_final_cords.txt (deflated 72%)
  adding: output/01_final_output.pdf (deflated 9%)
  adding: output/APS_38600000R_final_cords.txt (deflated 92%)
  adding: output/APS_38600000R_final_output.pdf (deflated 2%)


In [None]:
!zip pdf_text_output.zip pdf-files/*.* txt-files/*.*

In [7]:
# Step-1: spliting pdf file
pdf_file_name = "APS_38600000R_final.pdf"
pdf_list = highlighter.split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf
txt_list = highlighter.extract_text_from_pdf(pdf_list)

In [8]:
# Step-3: Searching ICD-10 code
page_code10_dict = highlighter.search_icd_code(txt_list)

In [9]:
code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")

##Text clean up

In [None]:
with open(f"{txt_list[7]}", "r") as f:
  my_text = f.read()

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(my_text)
sentences = [sentence.text for sentence in doc.sents]

In [None]:
sentences

In [None]:
def text_preprocess(text_file):
  sentence_list = []
  stopwords = ["is", "a"]
  doc = nlp(my_text)
  sentences = [sentence.text for sentence in doc.sents]
  for sent in sentences:
    clean_text = " ".join(sent.split())  # Remove extra spaces, tabs, and line breaks
    clean_text = re.sub(f"[{re.escape(punctuation)}]", "", clean_text) # Remove punctuation
    clean_text = re.sub(r"\b[0-9]+\b\s*", "", clean_text)     # Remove numbers
    clean_text = " ".join([w for w in clean_text.split() if not w.isdigit()]) # Remove digits= Side effect: removes extra spaces
    clean_text = " ".join([w for w in clean_text.split() if w.isalpha()]) # Remove non-alphabetic characters= Side effect: removes extra spaces
    clean_text = re.sub(r"[^A-Za-z0-9\s]+", "", clean_text) # Remove all special characters and punctuation
    # Remove stopwords from a list
    tokens = clean_text.split()
    clean_tokens = [t for t in tokens if not t in stopwords]
    clean_text = " ".join(clean_tokens)
    # Remove short tokens
    tokens = clean_text.split()
    clean_tokens = [t for t in tokens if len(t) > 3]
    clean_text = " ".join(clean_tokens)
    # Remove repeated characters
    clean_text = re.sub(r'(.)\1{3,}',r'\1', clean_text)
    if len(clean_text) > 0:
      sentence_list.append(clean_text)
  return sentence_list

In [None]:
sentence_list = text_preprocess(my_text)
sentence_list

In [None]:
with open(f"{txt_list[7]}", "r") as f:
  lines = [line.rstrip('\n') for line in f]
  sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
sentence_list

In [None]:
def get_best_match(sentence_list, keyword):
  print("#"*10)
  print(f"Matching for : {keyword}")
  print()
  ratios = [fuzz.ratio(keyword, sentence) for sentence in sentence_list]
  best_match = sentence_list[ratios.index(max(ratios))]
  print(f"Best match: {fuzz.ratio(keyword, best_match)} | {best_match}")
  print(f"Before Match: {fuzz.ratio(keyword, sentence_list[sentence_list.index(best_match) - 1])} | {sentence_list[sentence_list.index(best_match) - 1]}")
  print(f"After Match: {fuzz.ratio(keyword, sentence_list[sentence_list.index(best_match) + 1])} | {sentence_list[sentence_list.index(best_match) + 1]}")
  print()

  p_ratios = [fuzz.partial_ratio(keyword, sentence) for sentence in sentence_list]
  p_best_match = sentence_list[p_ratios.index(max(p_ratios))]
  print(f"Partial Best match: {fuzz.partial_ratio(keyword, p_best_match)} | {p_best_match}")
  print(f"Partial Before Match: {fuzz.partial_ratio(keyword, sentence_list[sentence_list.index(p_best_match) - 1])} | {sentence_list[sentence_list.index(p_best_match) - 1]}")
  print(f"Partial After Match: {fuzz.partial_ratio(keyword, sentence_list[sentence_list.index(p_best_match) + 1])} | {sentence_list[sentence_list.index(p_best_match) + 1]}")
  print()

In [None]:
keyword_list = [get_keyword(code) for code in clean_icd10_code]
keyword_list

In [None]:
[get_best_match(sentence_list, keyword) for keyword in keyword_list]

In [None]:
def get_best_match(sentence_list, keyword_list):
  for keyword in keyword_list:
    #for sentence in sentence_list:
    print("#"*10)
    print(f"Matching for : {keyword}")
    match_list = process.extract(keyword, sentence_list, scorer = fuzz.ratio, limit = 3)
    for match_found in match_list:
      print(f"{match_found[0]} | {match_found[1]}")
    print()

In [None]:
get_best_match(sentence_list, keyword_list)
#process.extract(query, choices, scorer = fuzz.partial_ratio, limit = 2)

In [None]:
sentence_list.index("Moderate Obstructive Sleep Apnea .")

25

In [None]:
for idx, sent in enumerate(sentence_list):
  print(f"{idx}>{sent}")

##All Together

In [10]:
def get_keyword(p_code):
  keyword = ""
   # reverse code if required
  code = reverse_code_pattern(p_code)
  # get keyword from dataset
  keyword_list = list(code_df.loc[code_df["Code"] == code]["Keyword"])
  if len(keyword_list) > 0:
    keyword = keyword_list[0]
  return keyword

def get_best_match(code10_dict, txt_list, num_page):
  # Step 1: reverse code pattern
  clean_icd10_code = [reverse_code_pattern(code) for code in page_code10_dict[num_page]]
  # Step 2: fetch keyword based on code 
  keyword_list = [get_keyword(code) for code in clean_icd10_code]
  # Step 3: prepare sentence list 
  with open(f"{txt_list[num_page]}", "r") as f:
    lines = [line.rstrip('\n') for line in f]
    sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks

  # Step 4: get best match 
  for keyword in keyword_list:
    #for sentence in sentence_list:
    print("#"*10)
    print(f"Matching for : {keyword}")
    match_list = process.extract(keyword, sentence_list, scorer = fuzz.ratio, limit = 3)
    print(match_list[0][1])
    for idx, match_found in enumerate(match_list):
      if match_found[1] > 40:
        print(f"{idx}> {match_found[0]} | {match_found[1]}")
    print()

In [None]:
def get_best_match(p_code, num_page):
    # Step 1: reverse code pattern
    reversed_icd_code = reverse_code_pattern(p_code)
    # Step 2: fetch keyword based on code 
    keyword = get_keyword(reversed_icd_code)
    # Step 3: prepare sentence list 
    with open(f"{txt_list[num_page]}", "r") as f:
      lines = [line.rstrip('\n') for line in f]
      sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
    # Step 4: get best 3 match ratio 
    match_list = process.extract(keyword, sentence_list, scorer = fuzz.ratio, limit = 3)
    return match_list, sentence_list

In [None]:
clean_icd10_code = [reverse_code_pattern(code) for code in page_code10_dict[7]]
clean_icd10_code

['G47.10', 'G47.31', 'R06.83', 'G47.39', 'G47.33', 'G47.8', 'G47.34', 'R06.3']

In [None]:
match_list, sentence_list = get_best_match("G47.10", 7)
match_list

[('(central sleep apnea (G47.31) CoHypersomnia, unspecified (G47.10)', 55),
 ('Diplomate in ABPN - Sleep Medicine', 45),
 ('Hypoxemia', 42)]

Finding for `Hypersomnia, unspecified`:
44 > Hypoxemia
65 > (central sleep apnea (G47.31) CoHypersomnia, unspecified (G47.10)

In [11]:
with open(f"{txt_list[21]}", "r") as f:
  lines = [line.rstrip('\n') for line in f]
  sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks

In [12]:
def sort_tuple(tup):
  return(sorted(tup, key = lambda x: x[1], reverse=True)) 

def get_best_token_match(p_code, num_page):
  # Step 1: reverse code pattern
  reversed_icd_code = reverse_code_pattern(p_code)
  # Step 2: fetch keyword based on code 
  keyword = get_keyword(reversed_icd_code)
  # Step 3: prepare sentence list 
  with open(f"{txt_list[num_page]}", "r") as f:
    lines = [line.rstrip('\n') for line in f]
    sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
  # Step 4: get best token match ratio 
  #match_list = [fuzz.token_set_ratio(keyword, sentence) for sentence in sentence_list]
  print(f"Finding for `{keyword}`:")
  match_list = []
  for sentence in sentence_list:
    token_ratio = fuzz.token_set_ratio(keyword, sentence)
    if token_ratio >= 45:
      print(f"{p_code}: {token_ratio} > {sentence}")
      match_list.extend((sentence, token_ratio))
  print()
  #return match_list

In [13]:
[code for code in page_code10_dict[21]]

['L73.9',
 'Z00.00',
 'D69.6',
 'M54.5',
 'Z68.25',
 '212.5',
 'R10.32',
 'S93.411A',
 'K46.9',
 'R05',
 'R03',
 '278.9',
 'J01.00',
 'R50.9',
 'Z78.9',
 'R53.83',
 'E78.3',
 '268.25',
 '287.5']

In [14]:
[(code, get_keyword(code)) for code in page_code10_dict[21]]

[('L73.9', 'Follicular disorder, unspecified'),
 ('Z00.00', 'Encntr for general adult medical exam w/o abnormal findings'),
 ('D69.6', 'Thrombocytopenia, unspecified'),
 ('M54.5', 'Low back pain'),
 ('Z68.25', 'Body mass index [BMI] 25.0-25.9, adult'),
 ('212.5', 'Encounter for screening for malignant neoplasm of prostate'),
 ('R10.32', 'Left lower quadrant pain'),
 ('S93.411A', 'Sprain of calcaneofibular ligament of right ankle, init'),
 ('K46.9', 'Unspecified abdominal hernia without obstruction or gangrene'),
 ('R05', 'Cough'),
 ('R03', 'Abnormal blood-pressure reading, without diagnosis'),
 ('278.9', 'Other specified health status'),
 ('J01.00', 'Acute maxillary sinusitis, unspecified'),
 ('R50.9', 'Fever, unspecified'),
 ('Z78.9', 'Other specified health status'),
 ('R53.83', 'Other fatigue'),
 ('E78.3', 'Hyperchylomicronemia'),
 ('268.25', 'Body mass index [BMI] 25.0-25.9, adult'),
 ('287.5', 'Personal history of comp of preg, chldbrth and the puerp')]

In [None]:
[get_best_token_match(code, 21) for code in page_code10_dict[21]]

In [16]:
def get_best_token_match(p_code, num_page):
  # Step 1: reverse code pattern
  reversed_icd_code = reverse_code_pattern(p_code)
  # Step 2: fetch keyword based on code 
  keyword = get_keyword(reversed_icd_code)
  # Step 3: prepare sentence list 
  with open(f"{txt_list[num_page]}", "r") as f:
    lines = [line.rstrip('\n') for line in f]
    sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
  # Step 4: get best token match ratio 
  match_list = [(sentence, fuzz.WRatio(keyword, sentence)) for sentence in sentence_list if fuzz.token_set_ratio(keyword, sentence) > 40]
  return sort_tuple(match_list)

In [17]:
match_list = [get_best_token_match(code, 21) for code in page_code10_dict[21]]

In [None]:
match_list

In [19]:
match_list = get_best_token_match("R50.9", 21)
match_list

[('Diagnosis Fever (R50.9), Sinusitis, acute, maxillary', 86),
 ('Diagnosis Fever (R50.9)', 45)]

In [20]:
match_list[0]

('Diagnosis Fever (R50.9), Sinusitis, acute, maxillary', 86)

In [23]:
match_list[0][1]

86

In [None]:
fuzz.token_set_ratio(get_keyword("E78.3"), "Diagnosis Hypertriglyceridemia, sporadic (E78.3)")

34

In [None]:
fuzz.token_set_ratio(get_keyword("R50.9"), "Diagnosis Fever (R50.9), Sinusitis, acute, maxillary")

45

In [None]:
fuzz.token_set_ratio("Fever, unspecified", "Diagnosis Fever (R50.9), Sinusitis, acute, maxillary")

45

In [None]:
fuzz.token_set_ratio(get_keyword("R50.9"), "Diagnosis Fever (R50.9)")

47

In [None]:
fuzz.WRatio(get_keyword("E78.3"), "Diagnosis Hypertriglyceridemia, sporadic (E78.3)")

50

In [None]:
def get_common_words(sent1, sent2):
  sent1 = set(sent1.replace(",", "").lower().split())
  sent2 = set(sent2.replace(",", "").lower().split())

  return list(sent1 & sent2)

In [14]:
def get_common_words(sent1, sent2):
  clean_token1 = clean_text(sent1)
  clean_token2 = clean_text(sent2)
  token_set1 = set(clean_token1)
  token_set2 = set(clean_token2)
  return list(token_set1 & token_set2)

In [15]:
get_common_words(
  "Collected Date: \nReported Date: \nLab [Canceled, order Expired]: Comprehensive Metabolic Panel (CMP) \nDiagnosis: Enentr for general adult medical exam w/o abnormal findings (Z00.00) \n",
  "Collected Date: \nReported Date: \nLab [Canceled, order Expired]: Comprehensive Metabolic Panel (CMP) \nDiagnosis: Enentr for general adult medical exam w/o abnormal findings (Z00.00) \n"
)

['general',
 'Comprehensive',
 'order',
 'Reported',
 'CMP',
 'exam',
 'Canceled',
 'medical',
 'findings',
 'Collected',
 'adult',
 'Metabolic',
 'Z00.00',
 'abnormal',
 'Diagnosis',
 'w/o',
 'Enentr',
 'Date',
 'Panel',
 'Lab',
 'Expired']

In [8]:
def clean_text(sent):
  # tokenize sentence
  sent1 = word_tokenize(sent)
  # filter stop words
  filtered_sent = [w for w in sent1 if not w.lower() in stop_words]
  filtered_sent = [w for w in filtered_sent if re.sub(re.compile('\W'), '', w)]
  return filtered_sent

In [17]:
# Diagnosis Sprain of calcaneofibular (ligament) of right ankle, initial encounter (S93.411A)
clean_text(
  "Diagnosis Left lower quadrant pain (R10.32), Non-smoker (Z78.9), Hernia (K46.9), Overweight (BMI 25.0 - 29.9), Body mass index (BMI) of 25.0-25.9 in adult (Z68.25), Dietary counseling and surveillance"
)

['Diagnosis',
 'Left',
 'lower',
 'quadrant',
 'pain',
 'R10.32',
 'Non-smoker',
 'Z78.9',
 'Hernia',
 'K46.9',
 'Overweight',
 'BMI',
 '25.0',
 '29.9',
 'Body',
 'mass',
 'index',
 'BMI',
 '25.0-25.9',
 'adult',
 'Z68.25',
 'Dietary',
 'counseling',
 'surveillance']

In [None]:
def find_nearest(array, value):
  array = np.asarray(array)
  idx = (np.abs(array - value)).argmin()
  return array[idx]

In [None]:
array = np.random.random(10)
print(array)

[0.93361323 0.10674056 0.81360362 0.26873547 0.7892294  0.36625687
 0.48473053 0.43495106 0.68754387 0.28034082]


In [None]:
print(find_nearest(array, value=0.35))

0.3662568679192558


##Fitz

In [7]:
def find_nearest(array, value):
  array = np.asarray(array)
  idx = (np.abs(array - value)).argmin()
  return array[idx]

In [10]:
pdf_file = fitz.open("page-11.pdf")  #Create pdf file object
pdf_page_count = pdf_file.page_count   #var to hold page count
for page in range(pdf_page_count):  #notice that page starts with index 0
  page_obj = pdf_file[page] #Create page object
  #content_of_page = pdf_file.get_page_text(page) #Get page content
  match_word = "Z00.00" 
  content_of_page = page_obj.get_text("blocks", sort=False)  #get rect for all words

  coords_array = []
  y_coords_array = []
  for content in content_of_page:
    print(content)
    if content[4].replace("(", "").replace(")", "") == match_word:
      rect_comp = fitz.Rect(content[0],content[1],content[2],content[3])
      print(f"Line #{content[6]}-{content[4]} > {rect_comp}")
      coords_array.append(rect_comp)
      y_coords_array.append(content[1])
      
      #coord = fitz.Rect(content[1]).rect
      print(rect_comp[1])

      highlight = page_obj.add_highlight_annot(rect_comp)
      highlight.set_colors(stroke=[0.2, 1, 0.8])
      highlight.update()

  # find closet y coordinate value
  closet_y_coords = find_nearest(y_coords_array, value=74.320068359375)
  match_word = "Comprehensive" 
  for content in content_of_page:
    # print(word)
    if content[4].replace("(", "").replace(")", "") == match_word and (closet_y_coords-20) <= content[1] <= (closet_y_coords + 20):
      rect_comp = fitz.Rect(content[0],content[1],content[2],content[3])
      print(f"Line #{content[6]}-{content[4]} > {rect_comp}")
      highlight = page_obj.add_highlight_annot(rect_comp)
      highlight.set_colors(stroke=[1, 1, 0])
      highlight.update()

pdf_output_file_name = f"page_11_output.pdf"
pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)

(62.63398361206055, 34.760009765625, 191.74160766601562, 41.763427734375, '= = . vap \n. sun , \n', 0, 0)
(62.63398361206055, 44.08001708984375, 372.44427490234375, 82.323974609375, 'Collected Date: \nReported Date: \nLab [Canceled, order Expired]: Comprehensive Metabolic Panel (CMP) \nDiagnosis: Enentr for general adult medical exam w/o abnormal findings (Z00.00) \n', 1, 0)
(65.51370239257812, 113.96002197265625, 402.4411315917969, 120.96343994140625, 'Test Name \nResult \nUnits \n', 2, 0)
(484.9934387207031, 20.32000732421875, 537.5484619140625, 34.32684326171875, '   \n', 3, 0)
(430.5186462402344, 24.44000244140625, 523.3900146484375, 41.763916015625, 'Patient DOB: \nOrdered by: Nicole Khalil \n', 5, 0)
(430.5186462402344, 44.08001708984375, 505.8713684082031, 52.08392333984375, 'Site: Silver Pine Lab \n', 6, 0)
(64.07384490966797, 112.96002197265625, 586.503662109375, 134.8839111328125, 'Normal Range \nStatus \n  \nNo Results Available \n', 8, 0)
(62.63398361206055, 196.47998046875

ValueError: ignored

In [None]:
list1 = [74.320068359375, 237.0400390625, 409.8399658203125, 572.5599975585938]
curr_value = 74.320068359375 + 20
find_nearest(list1, value=curr_value)

74.320068359375

In [None]:
curr_value = 74.320068359375 - 20
find_nearest(list1, value=curr_value)

74.320068359375

In [None]:
def get_word_coordinate(page_obj, common_words):
  word_coords = {}
  content_of_page = page_obj.get_text("words", sort=False)  #get rect for all words
  for content in content_of_page:
    # print(word)
    for idx, common_word in enumerate(common_words):
      word_coord = {}
      curr_word = content[4].replace("(", "").replace(")", "")
      if curr_word == match_word:
        rect_comp = fitz.Rect(content[0],content[1],content[2],content[3])
        word_coord[common_word] = rect_comp
        word_coord["Line#"] = content[6]
        word_coords[idx] = word_coord
  return word_coords

In [None]:
pdf_file = fitz.open("pdf-files/page-11.pdf")  #Create pdf file object
pdf_page_count = pdf_file.page_count   #var to hold page count
for page in range(pdf_page_count):  #notice that page starts with index 0
  page_obj = pdf_file[page] #Create page object
  print(get_word_coordinate(page_obj, ["general", "adult", "medical", "exam"]))

{0: {'general': Rect(242.856689453125, 65.00006103515625, 274.8216857910156, 72.00347900390625), 'Line#': 2}, 1: {'adult': Rect(242.856689453125, 65.00006103515625, 274.8216857910156, 72.00347900390625), 'Line#': 2}, 2: {'medical': Rect(242.856689453125, 65.00006103515625, 274.8216857910156, 72.00347900390625), 'Line#': 2}, 3: {'exam': Rect(242.856689453125, 65.00006103515625, 274.8216857910156, 72.00347900390625), 'Line#': 2}}


In [None]:
def highlight_common_words(page_obj, y_coords_array, curr_y_coord, common_word_list):
  page_content = page_obj.get_text("words", sort=False)  #get rect for all words

  # find closet y coordinate value
  closet_y_coords = find_nearest(y_coords_array, value=curr_y_coord)
  match_word = "Comprehensive" 
  for content in page_content:
    for common_word in common_word_list:
      if content[4].replace("(", "").replace(")", "") == common_word and (closet_y_coords-20) <= content[1] <= (closet_y_coords + 20):
        rect_comp = fitz.Rect(content[0],content[1],content[2],content[3])
        print(f"Line #{content[6]}-{content[4]} > {rect_comp}")
        highlight = page_obj.add_highlight_annot(rect_comp)
        highlight.set_colors(stroke=[1, 1, 0])
        highlight.update()