<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/08_icd_code_highliting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

[PDF-to-TEXT](https://pypi.org/project/pdftotext/)

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

#!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

Just restart the colab environment.

In [1]:
import pandas as pd
import numpy as np
import re
import time
import os

import pdb

import fitz
import cv2 
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk

In [2]:
!mkdir pdf-files
!mkdir txt-files

In [8]:
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"

In [2]:
nlp = spacy.load("en_core_web_sm")
nlp = English()

In [3]:
def split_pdf(pdf_path):
  pdf_in_file = open(pdf_path, "rb")
  pdf = PdfFileReader(pdf_in_file)
  pdf_list = []
  for page in range(pdf.numPages):
      inputpdf = PdfFileReader(pdf_in_file)
      output = PdfFileWriter()
      output.addPage(inputpdf.getPage(page))
      with open(f"{pdf_files_path}/page-{page}.pdf", "wb") as outputStream:
          output.write(outputStream)
          pdf_list.append(f"page-{page}.pdf")
  return pdf_list

In [4]:
def extract_text_from_pdf(pdf_list):
  txt_file_list = []
  i = 0
  for pdf_file in pdf_list:
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)
    
    # Read all the text into one string
    pdf_text = "\n\n".join(pdf)

    # write text into file
    with open(f"{txt_files_path}/page-{str(i)}.txt", "a") as f:
      f.write(pdf_text)
    txt_file_list.append(f"{txt_files_path}/page-{str(i)}.txt")
    i += 1
  return txt_file_list

In [5]:
def highlight_icd10_code(pdf_page_dict, pdf_file_name):
  pdf_file = fitz.open(pdf_file_name)
  for page_num, page in enumerate(pdf_file):
    if page_num in pdf_page_dict:
      for code in pdf_page_dict[page_num]:
        text_to_be_highlighted = code
        highlight = page.search_for(text_to_be_highlighted)

        for inst in highlight:
          highlight = page.add_highlight_annot(inst)
          highlight.update()
          highlight = page.search_for(text_to_be_highlighted)
          #print(code, highlight,end='\n')
  pdf_file.save(f"{pdf_file_name.split('.')[0]}_output.pdf", garbage=4, deflate=True, clean=True)

In [6]:
def make_icd_10_code_pattern(icd_10_code_df):
  patterns = []
  for _, row in icd_10_code_df.iterrows():

    # add default pattern
    patterns.append({"label": "ICD-10", "pattern": row["ICD-10"]})

    # create alternate pattern
    code_arr = row["ICD-10"].split(".")
    if len(code_arr) > 1:
      code1 = f"{code_arr[0]}. {code_arr[1]}"
      code2 = f"{code_arr[0]} .{code_arr[1]}"
      code3 = f"{code_arr[0]} . {code_arr[1]}"

    for code_pattern in [code1, code2, code3]:
      patterns.append({"label": "ICD-10", "pattern": code_pattern})
  return patterns

In [None]:
def make_icd_10_code_pattern(icd_10_code_df):
  patterns = []
  for _, row in icd_10_code_df.iterrows():
    patterns.append({"label": "ICD-10", "pattern": row["ICD-10"]})
  return patterns

##Spacy entity rule-matcher

In [22]:
code = "A00.0"
code_arr = code.split(".")
code_arr

['A00', '0']

In [23]:
len(code_arr)

2

In [9]:
code1 = f"{code_arr[0]}. {code_arr[1]}"
code1

'A00. 0'

In [10]:
code2 = f"{code_arr[0]} .{code_arr[1]}"
code2

'A00 .0'

In [11]:
code3 = f"{code_arr[0]} . {code_arr[1]}"
code3

'A00 . 0'

In [15]:
for code in [code1, code2, code3]:
  print(code)

A00. 0
A00 .0
A00 . 0


In [21]:
icd_code_df = pd.read_csv("icd_10_codes.csv")
patterns = make_icd_10_code_pattern(icd_code_df)

ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# save to json file
ruler.to_disk("./icd10_code_patterns.jsonl")

##Text extraction

In [9]:
# Step-1: spliting pdf file
pdf_file_name = "Redacted_Sample_2.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf
txt_list = extract_text_from_pdf(pdf_list)

# Step-3: loading and updating patterns to Spacy
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x7fe02ff2d320>

##ICD-10 code searching

In [None]:
with open(f"{txt_files_path}/page-2.txt", "r") as f:
  one_txt = f.read()
  print(one_txt)
  doc = nlp(one_txt)
  print([(ent.text, ent.label_) for ent in doc.ents])

In [20]:
True if re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", one_txt) else False

True

In [23]:
%%time

pdf_page_dict = {}
# Step-4: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    doc = nlp(f.read())
    code_list = [ent.text for ent in doc.ents]
    if len(code_list) != 0:
      page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
      pdf_page_dict[page_number] = code_list
      print(f"Page[{txt_file.split('/')[1]}]: {code_list}")

Page[page-2.txt]: ['L14']
Page[page-3.txt]: ['L14', 'L14', 'L22', 'L22']
Page[page-4.txt]: ['L26', 'L26', 'L26', 'L26', 'L26', 'L22', 'L14', 'L22', 'L14']
Page[page-6.txt]: ['L14', 'L14']
Page[page-7.txt]: ['L14', 'L14', 'L14', 'L14']
Page[page-8.txt]: ['L26', 'L26', 'L26']
Page[page-9.txt]: ['L26', 'L26', 'L22', 'L26', 'L22', 'L26', 'L22', 'L22', 'L22']
Page[page-10.txt]: ['L22', 'L26', 'L14']
Page[page-11.txt]: ['L22', 'L22', 'L22']
Page[page-12.txt]: ['L14']
Page[page-18.txt]: ['M54.40']
Page[page-19.txt]: ['G43.109']
Page[page-26.txt]: ['D17.1', 'F43.9']
Page[page-37.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30', 'E88.2']
Page[page-39.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30']
Page[page-42.txt]: ['G43.109', 'R00.2', 'G43.109', 'D72.819', 'R00.2']
Page[page-74.txt]: ['M54.2', 'R42']
Page[page-82.txt]: ['G43.109', 'R00.2', 'L25.9', 'D72.819']
Page[page-84.txt]: ['G43.109']
Page[page-85.txt]: ['L25.9', 'D72.819']
Page[page-102.txt]: ['G43.109', 'R00.2', 'L25.9', 'Z00.00']
Page[p

In [25]:
%%time

pdf_page_dict = {}
# Step-4: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    page_txt = f.read()
    # filter the page that have line number instead of code
    if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
      doc = nlp(page_txt)
      code_list = [ent.text for ent in doc.ents]
      if len(code_list) != 0:
        page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
        pdf_page_dict[page_number] = code_list
        print(f"Page[{txt_file.split('/')[1]}]: {code_list}")

Page[page-18.txt]: ['M54.40']
Page[page-19.txt]: ['G43.109']
Page[page-26.txt]: ['D17.1', 'F43.9']
Page[page-37.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30', 'E88.2']
Page[page-39.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30']
Page[page-42.txt]: ['G43.109', 'R00.2', 'G43.109', 'D72.819', 'R00.2']
Page[page-74.txt]: ['M54.2', 'R42']
Page[page-82.txt]: ['G43.109', 'R00.2', 'L25.9', 'D72.819']
Page[page-84.txt]: ['G43.109']
Page[page-85.txt]: ['L25.9', 'D72.819']
Page[page-102.txt]: ['G43.109', 'R00.2', 'L25.9', 'Z00.00']
Page[page-105.txt]: ['G43.109', 'R00.2', 'L25.9']
CPU times: user 400 ms, sys: 7.42 ms, total: 408 ms
Wall time: 407 ms


##ICD-10 code highlighting

In [None]:
0 in pdf_page_dict

False

In [None]:
pdf_page_dict[2]

['L14', 'L14']

In [26]:
%%time

# Step-4: Highlighting ICD-10 code into pdf
highlight_icd10_code(pdf_page_dict, pdf_file_name)

CPU times: user 1.27 s, sys: 29.9 ms, total: 1.3 s
Wall time: 1.29 s


In [None]:
True if re.search("[0-9],L", "3, L14") else False

False

In [None]:
True if re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", "P 13, L17") else False

True

In [None]:
pattern = re.compile("(P[ ][0-9]+)(,\s)(L[0-9]+)")

for i, line in enumerate(open("txt-files/pdf-page-3.txt")):
  for match in re.finditer(pattern, line):
    print('Found on line %s: %s' % (i+1, match.group()))

Found on line 2: P 60, L23
Found on line 3: P 61, L14
Found on line 4: P 56, L19
Found on line 7: P 54, L19
Found on line 8: P 49, L5
Found on line 9: P 39, L6
Found on line 10: P 37, L9
Found on line 11: P 35, L14
Found on line 12: P 27, L24
Found on line 13: P 21, L23
Found on line 15: P 11, L20
Found on line 16: P 5, L39
Found on line 18: P 3, L24
Found on line 22: P 77, L38
Found on line 27: P 86, L19
Found on line 29: P 66, L19
Found on line 30: P 29, L19
Found on line 31: P 21, L16
Found on line 32: P 12, L18
Found on line 33: P 3, L22
Found on line 34: P 3, L22
Found on line 38: P 3, L17
Found on line 41: P 79, L31


In [None]:
# Step-5: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())
    code_list = [ent.text for ent in doc.ents]
    if len(code_list) != 0:
      print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
    #print([f"Page[{txt_file.split('/')[1]}]: {(ent.text, ent.label_)}" for ent in doc.ents if ent is not None])

Page[pdf-page-2.txt]: ['L14']
Page[pdf-page-3.txt]: ['L14', 'L14', 'L22', 'L22']
Page[pdf-page-4.txt]: ['L26', 'L26', 'L26', 'L26', 'L26', 'L22', 'L14', 'L22', 'L14']
Page[pdf-page-6.txt]: ['L14', 'L14']
Page[pdf-page-7.txt]: ['L14', 'L14', 'L14', 'L14']
Page[pdf-page-8.txt]: ['L26', 'L26', 'L26']
Page[pdf-page-9.txt]: ['L26', 'L26', 'L22', 'L26', 'L22', 'L26', 'L22', 'L22', 'L22']
Page[pdf-page-10.txt]: ['L22', 'L26', 'L14']
Page[pdf-page-11.txt]: ['L22', 'L22', 'L22']
Page[pdf-page-12.txt]: ['L14']
Page[pdf-page-18.txt]: ['M54.40']
Page[pdf-page-19.txt]: ['G43.109']
Page[pdf-page-26.txt]: ['D17.1', 'F43.9']
Page[pdf-page-37.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30', 'E88.2']
Page[pdf-page-39.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30']
Page[pdf-page-42.txt]: ['G43.109', 'R00.2', 'G43.109', 'D72.819', 'R00.2']
Page[pdf-page-74.txt]: ['M54.2', 'R42']
Page[pdf-page-82.txt]: ['G43.109', 'R00.2', 'L25.9', 'D72.819']
Page[pdf-page-84.txt]: ['G43.109']
Page[pdf-page-85.txt]: ['L25.9