<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/10_icd_code_highliting_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

[PDF-to-TEXT](https://pypi.org/project/pdftotext/)

In [None]:
!pip -q install spacy

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

After the installation finishes, restart the Colab runtime so that the newly installed packages are picked up.

In [1]:
import re
import os
import pandas as pd

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter

from spacy.lang.en import English

In [None]:
!mkdir pdf-files
!mkdir txt-files

In [2]:
# Working directories, relative to the notebook's current directory:
# split_pdf writes its single-page PDFs into pdf_files_path and
# extract_text_from_pdf writes the per-page text files into txt_files_path.
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"

# Shared spaCy English pipeline object; the "entity_ruler" component is added
# to (and removed from) this instance by later cells.
nlp = English()

## Building code patterns

In [4]:
def make_icd_10_code_pattern(icd_10_code_df):
  """Build entity-ruler phrase patterns for every ICD-10 code in a DataFrame.

  For each value of the "Code" column the exact spelling is emitted first.
  When the code contains a dot, four alternate spellings follow, covering the
  ways the dot may be surrounded or replaced by spaces in extracted text:
  "A00. 1", "A00 .1", "A00 . 1" and "A00 1".

  Codes without a dot contribute only their exact spelling; no empty-string
  patterns are produced for them.

  Args:
    icd_10_code_df: DataFrame with a "Code" column of code strings.

  Returns:
    List of {"label": "ICD-10", "pattern": <str>} dicts, in input order.
  """
  patterns = []
  for code in icd_10_code_df["Code"]:
    # default pattern: the code exactly as listed
    patterns.append({"label": "ICD-10", "pattern": code})

    code_arr = code.split(".")
    if len(code_arr) < 2:
      # no dot, hence no spacing variants to generate
      continue

    head, tail = code_arr[0], code_arr[1]
    alternates = (
      f"{head}. {tail}",
      f"{head} .{tail}",
      f"{head} . {tail}",
      f"{head} {tail}",
    )
    for code_pattern in alternates:
      patterns.append({"label": "ICD-10", "pattern": code_pattern})
  return patterns

In [5]:
icd_code_nodot_df = pd.read_csv("icd10codes_noDots.csv")

In [6]:
icd_code_withdot_df = pd.read_csv("icd10codes_withDots.csv")

In [7]:
len(icd_code_nodot_df["Code"]), len(icd_code_withdot_df["Code"])

(96745, 96745)

In [9]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# in the same order (dotted codes first, then the no-dot codes).
icd_code_df = pd.concat([icd_code_withdot_df, icd_code_nodot_df])
# Persist the combined list without the index column.
icd_code_df.to_csv("icd_10_codes-v2.csv", index=False)

In [10]:
len(icd_code_df["Code"]), len(icd_code_nodot_df["Code"]) + len(icd_code_withdot_df["Code"])

(193490, 193490)

In [11]:
icd_code_df = icd_code_df.drop_duplicates()
len(icd_code_df)

191576

In [None]:
del icd_code_df
del icd_code_nodot_df
del icd_code_withdot_df

In [18]:
# Reload the combined code list written earlier and drop repeated rows
icd_code_v2_df = pd.read_csv("icd_10_codes-v2.csv")
icd_code_v2_df = icd_code_v2_df.drop_duplicates()
# Turn every code into the pattern dicts produced by make_icd_10_code_pattern
patterns = make_icd_10_code_pattern(icd_code_v2_df)

# Attach a new entity ruler to the shared pipeline and register the patterns
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# Save the ruler to disk so the highlighting step can reload it with from_disk
ruler.to_disk("./icd10_code_patterns-v2.jsonl")

In [None]:
nlp.remove_pipe("entity_ruler")
del icd_code_v2_df

## Highlighting code

In [3]:
def split_pdf(pdf_path):
  """Split a PDF into one single-page PDF per page under ``pdf_files_path``.

  Args:
    pdf_path: path of the PDF file to split.

  Returns:
    List of the written file names ("page-0.pdf", "page-1.pdf", ...) in page
    order. The names are relative to ``pdf_files_path``.
  """
  pdf_list = []
  # Keep the source open for the whole loop (the reader pulls page data from
  # it while each page is written) and let the context manager close it.
  with open(pdf_path, "rb") as pdf_in_file:
    # One reader serves every page; re-parsing the file per page is unnecessary.
    reader = PdfFileReader(pdf_in_file)
    for page in range(reader.numPages):
      output = PdfFileWriter()
      output.addPage(reader.getPage(page))
      page_file_name = f"page-{page}.pdf"
      with open(f"{pdf_files_path}/{page_file_name}", "wb") as outputStream:
        output.write(outputStream)
      pdf_list.append(page_file_name)
  return pdf_list


def extract_text_from_pdf(pdf_list):
  """Extract the text of each PDF in ``pdf_list`` into its own text file.

  The n-th entry of ``pdf_list`` (looked up inside ``pdf_files_path``) is
  written to ``<txt_files_path>/page-<n>.txt``.

  Args:
    pdf_list: PDF file names relative to ``pdf_files_path``.

  Returns:
    List of the written text file paths, in the same order as ``pdf_list``.
  """
  txt_file_list = []
  for i, pdf_file in enumerate(pdf_list):
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)

    # Read all the text into one string
    pdf_text = "\n\n".join(pdf)

    txt_file = f"{txt_files_path}/page-{i}.txt"
    # "w" rather than "a": a re-run must replace the page text, not append a
    # second copy to whatever an earlier run left behind.
    with open(txt_file, "w") as f:
      f.write(pdf_text)
    txt_file_list.append(txt_file)
  return txt_file_list


def get_opt_pattern(icd_10_code):
  """Return the alternate spellings of a dotted ICD-10 code.

  The variants mirror the ones generated by make_icd_10_code_pattern:
  "A00. 1", "A00 .1", "A00 . 1" and "A00 1" for the code "A00.1".

  Args:
    icd_10_code: code string, e.g. "A00.1".

  Returns:
    List of the four alternate spellings, or an empty list when the code has
    no dot. A list is always returned so that callers can iterate over it
    without walking the characters of a plain string.
  """
  code_arr = icd_10_code.split(".")
  if len(code_arr) < 2:
    return []

  head, tail = code_arr[0], code_arr[1]
  return [
    f"{head}. {tail}",
    f"{head} .{tail}",
    f"{head} . {tail}",
    # plain-space variant (previously a duplicate of the " . " variant)
    f"{head} {tail}",
  ]


def _add_highlights(page, rects):
  """Add one highlight annotation to ``page`` for every hit in ``rects``."""
  for rect in rects:
    annot = page.add_highlight_annot(rect)
    annot.update()


def highlight_icd10_code(pdf_page_dict: dict, pdf_file_name: str):
  """Highlight ICD-10 codes in a PDF and save the result as a new file.

  For every page listed in ``pdf_page_dict`` each code is searched on that
  page. If the exact spelling has no hit, the alternate spellings from
  get_opt_pattern are searched instead. Every hit is highlighted exactly once.

  Args:
    pdf_page_dict: maps a 0-based page number to the list of code strings
      found on that page.
    pdf_file_name: path of the PDF to annotate.

  Returns:
    Path of the saved copy: the input path with its extension replaced by
    "_output.pdf".
  """
  pdf_file = fitz.open(pdf_file_name)
  try:
    for page_num, page in enumerate(pdf_file):
      for code in pdf_page_dict.get(page_num, ()):
        hits = page.search_for(code)
        if hits:
          # highlight pdf for main pattern
          _add_highlights(page, hits)
          continue

        alternate_code_list = get_opt_pattern(code)
        if isinstance(alternate_code_list, str):
          # A bare string means "no alternates"; iterating it would search for
          # every single character of the code.
          alternate_code_list = []
        for alt_code in alternate_code_list:
          # highlight pdf for option pattern
          _add_highlights(page, page.search_for(alt_code))

    # splitext strips only the final extension, so dots elsewhere in the path
    # do not truncate the output name.
    output_pdf_file_name = f"{os.path.splitext(pdf_file_name)[0]}_output.pdf"
    pdf_file.save(output_pdf_file_name, garbage=4, deflate=True, clean=True)
  finally:
    pdf_file.close()
  return output_pdf_file_name


def search_icd_10_code(txt_list):
  """Run the entity ruler over page text files and collect the codes per page.

  Pages whose text matches the "P <n>, L<n>" reference pattern are skipped,
  as are pages on which the pipeline finds no entity.

  Args:
    txt_list: paths of text files named ``page-<n>.txt``.

  Returns:
    Dict mapping the page number ``<n>`` (int, parsed from the file name) to
    the list of entity texts found in that file.
  """
  # Raw string: "\s" must reach the regex engine rather than being treated as
  # an (invalid) string escape.
  line_ref_pattern = re.compile(r"(P[ ][0-9]+)(,\s)(L[0-9]+)")

  pdf_page_vocab = {}
  for txt_file in txt_list:
    with open(txt_file, "r") as f:
      page_txt = f.read()

    # filter the page that have line number instead of code
    if line_ref_pattern.search(page_txt):
      continue

    doc = nlp(page_txt)
    code_list = [ent.text for ent in doc.ents]
    if code_list:
      # "dir/.../page-12.txt" -> 12, independent of how deep the directory is
      stem = os.path.splitext(os.path.basename(txt_file))[0]
      page_number = int(stem.rsplit("-", 1)[1])
      pdf_page_vocab[page_number] = code_list
  return pdf_page_vocab

In [4]:
# Step-1: splitting the PDF file into single-page PDFs
# (pdf_list holds the page file names inside pdf_files_path)
pdf_file_name = "9929_final_output.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: extracting text from each single-page PDF
# (txt_list holds the paths of the written text files)
txt_list = extract_text_from_pdf(pdf_list)

# Step-3: loading the saved patterns into spaCy
# (this adds a new "entity_ruler" component to the shared nlp object; the
# following cell removes it again)
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v2.jsonl")

# Step-4: searching ICD-10 codes page by page
# (pdf_page_vocab maps page number -> list of matched code strings)
pdf_page_vocab = search_icd_10_code(txt_list)

# Step-5: highlighting the ICD-10 codes in the original PDF
output_file_name = highlight_icd10_code(pdf_page_vocab, pdf_file_name)
print(f"File[{output_file_name}] is saved after highlighting ICD-10 code")

File[9929_final_output_output.pdf] is saved after highlighting ICD-10 code


In [11]:
nlp.remove_pipe("entity_ruler")

('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x7f8f72cf03c0>)