<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/icd10-code-highlighting/13_4_icd_10_9_code_and_keyword_impairment_highlighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
%%capture

!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!pip install more-itertools
!pip install PyPDF2
!pip install pymupdf==1.22.0
!pip install fuzzywuzzy

In [None]:
import fitz
print(fitz.__doc__)

In [None]:
import more_itertools
print(more_itertools.__version__)

10.1.0


In [None]:
!pip install pikepdf
!apt install ocrmypdf

In [None]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')

In [None]:
!python --version

Python 3.10.12


In [None]:
!rm -rf data
!unzip data.zip

## Import

In [1]:
import re
import os
import shutil
import glob
import json
import logging
import more_itertools
from pathlib import Path

import numpy as np
import pandas as pd

import fitz

from spacy.lang.en import English

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz

from dataclasses import dataclass
from array import *

from concurrent import futures
from highlighting import Highlighter
from keyword_matcher import KeywordMatcher
from sentence_extractor import SentenceExtractor
from generic_matcher import GenericMatcher
import config as cfg
from utils import *



## PDF Highlighting

In [2]:
!mkdir -p /home/ocreng
!mkdir -p /home/ocreng/ocrhigh
!mkdir -p /home/ocreng/ocrhigh/input
!mkdir -p /home/ocreng/ocrhigh/output
!mkdir -p /home/ocreng/ocrhigh/processed
!mkdir -p /home/ocreng/ocrhigh/pdf-files
!mkdir -p /home/ocreng/ocrhigh/txt-files

In [3]:
def purge(file_path):
  """Delete every filesystem entry that matches the glob pattern `file_path`.

  The pattern is expanded once with `glob.glob`; each match is then removed
  with `os.unlink`. A pattern that matches nothing is a no-op.
  """
  matched_paths = glob.glob(file_path)
  for matched_path in matched_paths:
    os.unlink(matched_path)

In [4]:
%%time
# Step-0: build the shared pipeline objects that the processing cells below reuse.
# The recorded run of this cell took roughly a minute and a half of wall time,
# so it is kept separate from the per-file loop.

# Folder scanned for input PDFs; taken from the project config module.
INPUT_PDF_FILES_PATH = cfg.file_path["INPUT_PATH"]

# NOTE(review): the meaning/scale of match_threshold=35 is defined inside
# Highlighter and is not visible from this notebook — confirm before tuning.
highlighter = Highlighter(match_threshold=35)
sent_extractor = SentenceExtractor()
generic_matcher = GenericMatcher()

CPU times: user 1min 22s, sys: 11.3 s, total: 1min 33s
Wall time: 1min 35s


```log
CPU times: user 1min 7s, sys: 6.54 s, total: 1min 14s
Wall time: 1min 15s
```

## Test

In [21]:
!rm -rf /home/ocreng/ocrhigh/input
!mkdir -p /home/ocreng/ocrhigh/input

In [22]:
!cp Practice_Copy_Test_OCR.pdf /home/ocreng/ocrhigh/input
#!cp quinn_ian_michael_5353_1368260963_awp_30a71784-41b8-423d-9d29-8f338f050398.pdf /home/ocreng/ocrhigh/input

In [23]:
!rm -rf /home/ocreng/ocrhigh/output
!mkdir -p /home/ocreng/ocrhigh/output
!rm -rf output*.zip

In [None]:
%%time
# End-to-end run: every entry found in INPUT_PDF_FILES_PATH is split, converted
# to text, searched (ICD codes, impairment keywords, dates, numbers), highlighted,
# and finally moved into the processed folder.

# Placeholders that the debugging cells further down read; this loop never
# assigns them.
# NOTE(review): the loop fills `txt_list`, while a later cell reads `text_list`
# (still None here) — confirm which name is intended.
text_list = None
icd10_code_dict1 = None
icd9_code_dict1 = None
wrong_keyword_dict = None
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
    print(f"#####################Processing File[{pdf_file}] for highlighting###########################")
    # Full path of the current input entry.
    pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"

    # Step-1: splitting pdf file
    print("Step-1: Splitting pdf file............")
    pdf_list = split_pdf(highlighter.PDF_FILES_PATH, pdf_file_name)

    # Step-2: Extracting text from pdf
    print("Step-2: Extracting text from pdf............")
    txt_list = extract_text_from_pdf(highlighter.PDF_FILES_PATH, highlighter.TXT_FILES_PATH, pdf_list)
    highlighter.set_text_list(txt_list)

    # Step-3: Searching ICD-10 code (only when enabled in cfg.load_pattern)
    icd10_code_dict = None
    if cfg.load_pattern["icd_code10"]:
      print("Step-3: Searching ICD-10 code into text file..........")
      icd10_code_dict = highlighter.search_icd_code(txt_list)

    # Step-4: Searching ICD-9 code (only when enabled in cfg.load_pattern)
    # NOTE(review): icd9_code_dict is computed here but is not passed to the
    # Step-7 call below — confirm whether the highlighter picks it up some other
    # way or whether ICD-9 results are being dropped.
    icd9_code_dict = None
    if cfg.load_pattern["icd_code9"]:
        print("Step-4: Searching ICD-9 code into text file..........")
        icd9_code_dict = highlighter.search_icd_code(txt_list, code_type="ICD-9")

    # Step-5: Get closest match of keyword impairment and hand it to the highlighter
    print("Step-5: Get closet match of keyword impairment..........")
    matched_keyword_dict = sent_extractor.get_matched_keyword_dict(txt_list)
    highlighter.set_impairment_keyword_dict(matched_keyword_dict)

    # Step-6: Get date and number matches from the text files and hand both to the highlighter
    print("Step-6: Get date time list from text file..........")
    matched_date_dict = generic_matcher.get_match_date_dict(txt_list)
    matched_number_dict = generic_matcher.get_match_number_dict(txt_list)
    highlighter.set_page_matched_date_dict(matched_date_dict)
    highlighter.set_page_matched_number_dict(matched_number_dict)

    # Step-7: Highlighting ICD-10 code and all keyword impairment into pdf
    # The call returns the path of the highlighted PDF and of the JSON file
    # holding the highlight coordinates (both are only printed here).
    print("Step-7: Highlighting ICD-10 code and all keyword impairment into pdf............")
    pdf_output_file, json_code_output_file = highlighter.highlight_icd_code_and_keyword_impairment(icd10_code_dict, pdf_file_name=pdf_file_name)
    print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code and all keyword impairment ")
    print(f"Highlighted code and impairment coordinates are saved into [{json_code_output_file}] file.")

    # Step-8: Clean up: move the current file into processed folder
    # Both branches end with the same move; the only difference is that an
    # already-processed file of the same name is first renamed to "<name>_bkp".
    if Path(f"{cfg.file_path['PROCESSED_PATH']}/{pdf_file}").exists():
        # take backup of existing file
        shutil.move(f"{cfg.file_path['PROCESSED_PATH']}/{pdf_file}",
                    f"{cfg.file_path['PROCESSED_PATH']}/{pdf_file}_bkp")
        # then move it
        shutil.move(pdf_file_name, cfg.file_path["PROCESSED_PATH"])
    else:
        shutil.move(pdf_file_name, cfg.file_path["PROCESSED_PATH"])

    # remove all pdf and text files matching *.pdf / *.txt in the temp folders
    # so they do not linger into the next iteration
    purge(f"{cfg.file_path['TMP_PDF_FILES_PATH']}/*.pdf")
    purge(f"{cfg.file_path['TMP_TXT_FILES_PATH']}/*.txt")
    # Reset the per-file lists; note that this leaves `txt_list` empty once the
    # loop has finished.
    pdf_list = []
    txt_list = []

In [25]:
!zip output.zip /home/ocreng/ocrhigh/output/*.*

  adding: home/ocreng/ocrhigh/output/Practice_Copy_Test_OCR_output.json (deflated 80%)
  adding: home/ocreng/ocrhigh/output/Practice_Copy_Test_OCR_output.pdf (deflated 4%)


In [26]:
#matched_date_dict

In [27]:
#matched_number_dict[0]

In [28]:
#matched_keyword_dict[0]

## Regex Issue

In [30]:
found_num = "8%"
term = "%"

# A bare "%" term only counts as a match when the found token is exactly "%";
# a number that merely ends with "%" (e.g. "8%") is not a match.
is_percent_term = term == "%"
matches = is_percent_term and found_num.strip() == "%"

# The result is reported only for the "%" term, as in the original scratch cell.
if is_percent_term:
  print(f"matches: {matches}")

matches: False


In [28]:
term = "98%"
found_num = "98%"

# The escaped term may carry one leading sign/comparison character (> < - .)
# and an optional unit suffix, and must not be glued to a slash, digit, letter
# or colon on either side.
number_pattern = re.compile(
    rf"(?<![/0-9a-zA-Z:])([><\-.]?{re.escape(term.strip())})(?:mg|lb|lbs|kg|ml|g)?(?![/0-9a-zA-Z:])[><\-.]?"
)
matches = number_pattern.search(found_num.strip()) is not None
matches

True

## SynId Issue

In [None]:
synid_df = highlighter.synid_df

keyword_impairment = "E-cigarette"

# Exact, case-sensitive comparison of the stripped description with the stripped keyword.
exact_match_mask = synid_df["Short_Description"].str.strip() == keyword_impairment.strip()
data_df = synid_df.loc[exact_match_mask]

# The SynIds are taken from the exact matches only when the first matching row
# belongs to the "Abbreviations" category (compared case-insensitively).
if not data_df.empty:
  first_category = data_df[:1]["Category_1"].str.lower().item()
  if first_category == "Abbreviations".lower():
    synid_list = list(data_df["SynId"])
    print(synid_list)

In [None]:
data_df

Unnamed: 0,SynId,Short_Description,Category_1


In [None]:
data_df[:1]["Category_1"].str.lower().item()

'impairments'

In [None]:
# Fallback: when there was no exact match, or the first exact match is not an
# abbreviation, look the keyword up case-insensitively instead.
# (The description is lower-cased and stripped; the keyword is only lower-cased.)
needs_fallback = data_df.empty or data_df[:1]["Category_1"].str.lower().item() != "Abbreviations".lower()
if needs_fallback:
  normalized_descriptions = synid_df["Short_Description"].str.lower().str.strip()
  synid_list = list(synid_df.loc[normalized_descriptions == keyword_impairment.lower(), "SynId"])
synid_list

['KW20880']

## Impairment Comprehension

In [None]:
# One extractor result per page text.
# NOTE(review): `text_list` is only ever set to None in the processing cell
# above (which fills `txt_list` instead) — confirm where this cell's input is
# supposed to come from.
json_arr_list = list(map(sent_extractor.get_json_array_list, text_list))

# Page index -> set holding the first key of every keyword dict on that page.
# The None check has to filter pages in the OUTER comprehension: placed inside
# the inner one it only runs after `json_arr` is already being iterated, so a
# None page raised TypeError instead of being skipped.
tmp_wrong_keyword_dict = {
    idx: {list(element.keys())[0] for element in json_arr}
    for idx, json_arr in enumerate(json_arr_list)
    if json_arr is not None
}

In [None]:
json_arr_list

In [None]:
# Page index -> list of shallow copies of that page's keyword dicts.
# The inner container must be a list: dicts are unhashable, so collecting them
# in a set comprehension raises TypeError. A list also matches the output
# recorded for this cell's result.
doc_keyword_dict = {
    idx: [dict(page_keyword_dict) for page_keyword_dict in json_arr]
    for idx, json_arr in enumerate(json_arr_list) if json_arr is not None
}

In [None]:
doc_keyword_dict[5]

[{'O2Sat': 'O2Sat'},
 {'BP:': 'BP:'},
 {'T:': 'T:'},
 {'BMI': 'BMI'},
 {'Ht:': 'Ht:'},
 {'tdap': 'tdap'},
 {'Ht': 'Ht'},
 {'%': '%'},
 {'O2Sat:': 'O2Sat:'},
 {'DOB': 'DOB'},
 {'NKDA': 'NKDA'},
 {'BP': 'BP'},
 {'covid': 'covid'},
 {'No medications reported by patient': 'no medications reported by patient'},
 {'diopathic osteoarthritis': 'idiopathic osteoarthritis'},
 {'Sleep pattern disturbance': 'sleep pattern disturbance'},
 {'OBSTETRICS & GYNECOLOGY': 'Obstetrics & Gynecology'},
 {'No medications reported': 'no medications reported'},
 {'Primary Care Provider': 'primary care provider'},
 {'Mixed hyperlipidemia': 'mixed hyperlipidemia'},
 {'Multinodular goiter': 'multinodular goiter'},
 {'Chronic neck pain': 'chronic neck pain'},
 {'Body mass index': 'body mass index'},
 {'Physical Exam': 'physical exam'},
 {'overweight': 'overweight'},
 {'Encounter': 'encounter'},
 {'Allergies': 'allergies'},
 {'Vaccines': 'vaccines'},
 {'shingles': 'shingles'},
 {'Headache': 'headache'},
 {'Patient'

In [None]:
# Page index -> one merged dict built from all keyword dicts of that page.
# Pages whose entry is None are skipped; when the same key occurs more than
# once on a page, the last value wins.
doc_keyword_dict = {
    idx: {k: v for element in json_arr for k, v in element.items()}
    for idx, json_arr in enumerate(json_arr_list)
    if json_arr is not None
}
doc_keyword_dict[0]

{'DOB': 'DOB',
 'Policy Number': 'Policy Number',
 'Patient': 'patient',
 'Name': 'Name'}

In [None]:
doc_keyword_dict[5]

{'BMI': 'BMI',
 'O2Sat:': 'O2Sat:',
 'NKDA': 'NKDA',
 'tdap': 'tdap',
 'Ht:': 'Ht:',
 '%': '%',
 'BP:': 'BP:',
 'BP': 'BP',
 'O2Sat': 'O2Sat',
 'T:': 'T:',
 'Ht': 'Ht',
 'DOB': 'DOB',
 'covid': 'covid',
 'No medications reported by patient': 'no medications reported by patient',
 'diopathic osteoarthritis': 'idiopathic osteoarthritis',
 'Sleep pattern disturbance': 'sleep pattern disturbance',
 'OBSTETRICS & GYNECOLOGY': 'Obstetrics & Gynecology',
 'No medications reported': 'no medications reported',
 'Primary Care Provider': 'primary care provider',
 'Mixed hyperlipidemia': 'mixed hyperlipidemia',
 'Multinodular goiter': 'multinodular goiter',
 'Chronic neck pain': 'chronic neck pain',
 'Body mass index': 'body mass index',
 'Physical Exam': 'physical exam',
 'overweight': 'overweight',
 'Encounter': 'encounter',
 'Allergies': 'allergies',
 'Vaccines': 'vaccines',
 'shingles': 'shingles',
 'Headache': 'headache',
 'Patient': 'patient',
 'patient': 'patient',
 'Female': 'female',
 'Pu

In [None]:
# Print the mapped value of every keyword found on page index 5, in insertion order.
keyword_dict = doc_keyword_dict[5]
for mapped_keyword in keyword_dict.values():
  print(mapped_keyword)

BMI
O2Sat:
NKDA
tdap
Ht:
%
BP:
BP
O2Sat
T:
Ht
DOB
covid
no medications reported by patient
idiopathic osteoarthritis
sleep pattern disturbance
Obstetrics & Gynecology
no medications reported
primary care provider
mixed hyperlipidemia
multinodular goiter
chronic neck pain
body mass index
physical exam
overweight
encounter
allergies
vaccines
shingles
headache
patient
patient
female
pulse
Name
Appt
flu
Wt


In [None]:
# doc_keyword_dict is keyed by page index, so a keyword has to be looked up
# inside a page's dict; page 5 is the page shown above that contains "BMI".
doc_keyword_dict[5]["BMI"]

'BMI'

In [None]:
json_arr_list[:3]

[[{'DOB': 'DOB'},
  {'Policy Number': 'Policy Number'},
  {'Patient': 'patient'},
  {'Name': 'Name'}],
 [],
 [{'US': 'US'},
  {'EKG': 'EKG'},
  {'progress notes': 'progress notes'},
  {'discharge': 'discharge'},
  {'Patient': 'patient'},
  {'patient': 'patient'}]]

In [None]:
wrong_keyword_dict[5]

{'O2Sat': 'O2Sat',
 'BP:': 'BP:',
 'covid': 'covid',
 'Ht:': 'Ht:',
 'NKDA': 'NKDA',
 'tdap': 'tdap',
 'T:': 'T:',
 'BP': 'BP',
 'O2Sat:': 'O2Sat:',
 'Ht': 'Ht',
 'DOB': 'DOB',
 '%': '%',
 'BMI': 'BMI',
 'No medications reported by patient': 'no medications reported by patient',
 'diopathic osteoarthritis': 'idiopathic osteoarthritis',
 'Sleep pattern disturbance': 'sleep pattern disturbance',
 'OBSTETRICS & GYNECOLOGY': 'Obstetrics & Gynecology',
 'No medications reported': 'no medications reported',
 'Primary Care Provider': 'primary care provider',
 'Mixed hyperlipidemia': 'mixed hyperlipidemia',
 'Multinodular goiter': 'multinodular goiter',
 'Chronic neck pain': 'chronic neck pain',
 'Body mass index': 'body mass index',
 'hyperlipidemia': 'hyperlipidemia',
 'osteoarthritis': 'osteoarthritis',
 'Physical Exam': 'physical exam',
 'overweight': 'overweight',
 'Encounter': 'encounter',
 'Allergies': 'allergies',
 'neck pain': 'neck pain',
 'Vaccines': 'vaccines',
 'shingles': 'shingl

In [None]:
#icd9_code_dict1[19]

## Split TXT/PDF

In [None]:
!cp 5372500_input.pdf /home/ocreng/ocrhigh/input

In [None]:
# Runs only the first two pipeline steps (split + text extraction) for every
# entry in the input folder, so the per-page files can be inspected afterwards.
# Note: this rebinds INPUT_PDF_FILES_PATH to a literal path, replacing the
# value that was read from cfg.file_path earlier in the notebook.
INPUT_PDF_FILES_PATH = "/home/ocreng/ocrhigh/input"
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  # Full path of the current input entry.
  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"

  # Step-1: splitting pdf file
  print("Step-1: Splitting pdf file............")
  pdf_list = split_pdf(highlighter.PDF_FILES_PATH, pdf_file_name)

  # Step-2: Extracting text from pdf
  print("Step-2: Extracting text from pdf............")
  txt_list = extract_text_from_pdf(highlighter.PDF_FILES_PATH, highlighter.TXT_FILES_PATH, pdf_list)

Step-1: Splitting pdf file............
Step-2: Extracting text from pdf............


In [None]:
!zip text_files.zip /home/ocreng/ocrhigh/txt-files/*.*
!zip pdf_files.zip /home/ocreng/ocrhigh/pdf-files/*.*

  adding: home/ocreng/ocrhigh/txt-files/page-0.txt (deflated 39%)
  adding: home/ocreng/ocrhigh/pdf-files/page-0.pdf (deflated 7%)


In [None]:
#!unzip pdf_files.zip

In [None]:
#!cp /home/ocreng/ocrhigh/pdf-files/page-38.pdf .

## PDF OCRing

In [5]:
import pikepdf

def remove_password_from_pdf(input_file, output_file, password=None):
    """Open a (possibly password-protected) PDF and save a copy of it.

    Args:
        input_file: Path of the PDF to open.
        output_file: Path the copy is written to.
        password: Password used to open ``input_file``. ``None`` (the default)
            is treated as "no password".

    Note:
        The copy is written with pikepdf's default save settings, which are
        expected to drop the source encryption — confirm against the pikepdf
        version in use.
    """
    # pikepdf expects a string password (its own default is ""), so the None
    # default of this function is mapped to the empty string.
    # The context manager closes the document even if save() fails.
    with pikepdf.open(input_file, password=password or "") as pdf:
        pdf.save(output_file)

In [None]:
# Writes a copy of "sodapdf-converted1.pdf" to "bin1_image0.pdf", opening the
# source with the password given as the third argument.
# NOTE(review): the password is hard-coded in the notebook; consider reading it
# from an environment variable or getpass before sharing or committing this file.
remove_password_from_pdf("sodapdf-converted1.pdf",
                         "bin1_image0.pdf", "synodex")

In [21]:
!ocrmypdf --skip-text TESTING_SG_1873824_RP11247368_20230909141157_4testpages_Redacted.pdf TESTING_SG_1873824_RP11247368_20230909141157_4testpages_Redacted1.pdf

Scanning contents:   0% 0/4 [00:00<?, ?page/s]Scanning contents: 100% 4/4 [00:00<00:00, 135.20page/s]
Start processing 2 pages concurrently
OCR: 100% 4.0/4.0 [00:09<00:00,  2.33s/page]
Postprocessing...
PDF/A conversion: 100% 4/4 [00:00<00:00,  4.37page/s]
Recompressing JPEGs: 0image [00:00, ?image/s]
Deflating JPEGs: 100% 4/4 [00:00<00:00, 106.01image/s]
JBIG2: 0item [00:00, ?item/s]
Optimize ratio: 1.25 savings: 19.8%
Output file is a PDF/A-2B (as expected)


In [None]:
!rm -rf Keyword_Sample_File4_Redacted_Redacted_pw_removed.pdf

## Highlight Test

In [None]:
# Highlights every keyword of the first page entry of `wrong_keyword_dict1` on
# each page of a single-page test PDF and saves the result next to it.
# NOTE(review): `wrong_keyword_dict1` is not defined in any cell shown above —
# it has to exist in the kernel before this cell is run.
pdf_file_name = "page-30.pdf"
pdf_file = fitz.open(pdf_file_name)
for page_num, page in enumerate(pdf_file):
  for keyword_impairment in wrong_keyword_dict1[0]:
    # search_for returns the rectangles of every hit (an empty list if none).
    coords = page.search_for(keyword_impairment)
    print(f"keyword_impairment: {keyword_impairment}, coords: {coords}")
    if not coords:
      # No hit on this page: add_highlight_annot creates no annotation for an
      # empty sequence and returns None, so calling update() on it would fail.
      continue
    highlight = page.add_highlight_annot(coords)
    highlight.update()
# "page-30.pdf" -> "page-30_output.pdf"
output_pdf_file_name = f"{pdf_file_name.split('.')[0]}_output.pdf"
pdf_file.save(output_pdf_file_name, garbage=4, deflate=True, clean=True)

keyword_impairment: Urea, coords: [Rect(70.31328582763672, 435.0400390625, 83.75199890136719, 443.0439453125)]
keyword_impairment: Comprehensive Metabolic Panel, coords: [Rect(60.47419357299805, 207.52001953125, 118.06912994384766, 215.52392578125), Rect(121.66831970214844, 207.52001953125, 156.94491577148438, 215.52392578125), Rect(159.82464599609375, 207.52001953125, 176.2230682373047, 215.52392578125), Rect(29.997119903564453, 282.8800048828125, 95.99125671386719, 290.8839111328125), Rect(99.83040618896484, 282.8800048828125, 141.58639526367188, 290.8839111328125), Rect(144.46612548828125, 282.8800048828125, 163.6642608642578, 290.8839111328125)]
keyword_impairment: ALT, coords: [Rect(148.30589294433594, 665.4400634765625, 158.38491821289062, 673.4439697265625)]
keyword_impairment: GFR, coords: [Rect(45.355674743652344, 477.0400390625, 57.77448272705078, 485.0439453125), Rect(45.35564422607422, 498.1600341796875, 53.8148307800293, 506.1639404296875)]
keyword_impairment: CBC, coords: