<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/16_icd_code_correction_and_highliting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

[PDF-to-TEXT](https://pypi.org/project/pdftotext/)

In [None]:
!pip -q install spacy

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF
!pip install textdistance

Just restart the colab environment.

In [14]:
import re
import os
import pandas as pd
import numpy as np
from collections import Counter

import textdistance

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter

from spacy.lang.en import English

In [2]:
!mkdir pdf-files
!mkdir txt-files

In [3]:
# create directory path
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"

# create nlp instance
nlp = English()

In [4]:
def split_pdf(pdf_path):
  pdf_in_file = open(pdf_path, "rb")
  pdf = PdfFileReader(pdf_in_file)
  pdf_list = []
  for page in range(pdf.numPages):
    inputpdf = PdfFileReader(pdf_in_file)
    output = PdfFileWriter()
    output.addPage(inputpdf.getPage(page))
    with open(f"{pdf_files_path}/page-{page}.pdf", "wb") as outputStream:
        output.write(outputStream)
        pdf_list.append(f"page-{page}.pdf")
  return pdf_list


def extract_text_from_pdf(pdf_list):
  txt_file_list = []
  i = 0
  for pdf_file in pdf_list:
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)

    # Read all the text into one string
    pdf_text = "\n\n".join(pdf)

    # write text into file
    with open(f"{txt_files_path}/page-{str(i)}.txt", "a") as f:
      f.write(pdf_text)
    txt_file_list.append(f"{txt_files_path}/page-{str(i)}.txt")
    i += 1
  return txt_file_list


def get_opt_pattern(icd_10_code):
  # create alternate pattern
  code_arr = icd_10_code.split(".")
  if len(code_arr) > 1:
    code1 = f"{code_arr[0]}. {code_arr[1]}"
    code2 = f"{code_arr[0]} .{code_arr[1]}"
    code3 = f"{code_arr[0]} . {code_arr[1]}"
    return [code1, code2, code3]
  else:
    return icd_10_code


def highlight_icd10_code(pdf_page_dict: dict, pdf_file_name: str):
  pdf_file = fitz.open(pdf_file_name)

  def highlight_pdf(highlight):
    for inst in highlight:
      highlight = page.add_highlight_annot(inst)
      highlight.update()
      highlight = page.search_for(text_to_be_highlighted)
      # print(f"Page-{page_num}: ", code, highlight, end='\n')

  for page_num, page in enumerate(pdf_file):
    if page_num in pdf_page_dict:
      for code in pdf_page_dict[page_num]:
        text_to_be_highlighted = code
        highlight = page.search_for(text_to_be_highlighted)
        if len(highlight) == 0:
          alternate_code_list = get_opt_pattern(code)
          for alt_code in alternate_code_list:
            text_to_be_highlighted = alt_code
            highlight = page.search_for(text_to_be_highlighted)
            # highlight pdf for option pattern
            highlight_pdf(highlight)
        # highlight pdf for main pattern
        highlight_pdf(highlight)

  output_pdf_file_name = f"{pdf_file_name.split('.')[0]}_output.pdf"
  pdf_file.save(output_pdf_file_name, garbage=4, deflate=True, clean=True)
  return output_pdf_file_name


def search_icd_10_code(txt_list):
  pdf_page_vocab = {}
  for txt_file in txt_list:
    with open(txt_file, "r") as f:
      page_txt = f.read()
      # filter the page that have line number instead of code
      if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
        doc = nlp(page_txt)
        code_list = [ent.text for ent in doc.ents]
        if len(code_list) != 0:
          page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
          pdf_page_vocab[page_number] = code_list
          print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
  return pdf_page_vocab

In [None]:
nlp.remove_pipe("entity_ruler")

('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x7f180606c2d0>)

In [None]:
# Step-1: splitting pdf file
pdf_file_name = "9928_final.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf
txt_list = extract_text_from_pdf(pdf_list)

# Step-3: loading and updating patterns to Spacy
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns.jsonl")

# Step-4: Searching ICD-10 code
pdf_page_vocab = search_icd_10_code(txt_list)

# Step-5: Highlighting ICD-10 code into pdf
output_file_name = highlight_icd10_code(pdf_page_vocab, pdf_file_name)
print(f"File[{output_file_name}] is saved after highlighting ICD-10 code")

Page[page-1.txt]: ['F41.9', 'F32.9', 'K21.9', 'E78.5', 'M19.90', 'G47.00', 'G62.9', 'R73.03', 'M54.9', 'M48.061', 'F41.9', 'F32.9', 'K21.9', 'E78.5', 'M19.90', 'G47.00', 'G62.9', 'R73.03', 'M54.9', 'M48.061', 'F41.9', 'F32.9', 'K21.9', 'E78.5', 'M19.90', 'G47.00', 'G62.9', 'R73.03', 'M54.9', 'M48.061', 'F41.9', 'F32.9', 'K21.9', 'E78.5', 'M19.90', 'G47.00', 'G62.9', 'R73.03', 'M54.9', 'M48.061']
File[9928_final_output.pdf] is saved after highlighting ICD-10 code


##Code Autocorrection

In [9]:
icd_code_df = pd.read_csv("icd_10_codes.csv")

icd_code = icd_code_df["ICD-10"]
print(len(icd_code_df["ICD-10"]))
icd_code_df["ICD-10"].head()

72750


0     A00.0
1     A00.1
2     A00.9
3    A01.00
4    A01.01
Name: ICD-10, dtype: object

In [10]:
# vocabulary
vocab = set(icd_code)
print(f"The dictionary has {len(vocab)} words.")

The dictionary has 72750 words.


In [15]:
# Let's calculate the frequency of those words
word_freq_dict = {}
word_freq_dict = Counter(icd_code)
print(word_freq_dict.most_common()[0:10])

[('A00.0', 1), ('A00.1', 1), ('A00.9', 1), ('A01.00', 1), ('A01.01', 1), ('A01.02', 1), ('A01.03', 1), ('A01.04', 1), ('A01.05', 1), ('A01.09', 1)]


In [16]:
# get the probability of occurrence of each word 
probs = {}
total = sum(word_freq_dict.values())
for k in word_freq_dict.keys():
  probs[k] = word_freq_dict[k] / total

In [23]:
def get_autocorrect(word):
  input_word = word.lower()
  if input_word in vocab:
    print("Your word seems to be correct")
  else:
    similarities = [1 - (textdistance.Levenshtein(qval=1).distance(v, input_word)) for v in word_freq_dict.keys()]
    df = pd.DataFrame.from_dict(probs, orient="index").reset_index()
    df = df.rename(columns={"index": "word", 0: "Prob"})
    df["Similarity"] = similarities
    output = df.sort_values(["Similarity", "Prob"], ascending=False).head()
    print(max(df["Similarity"]))
    return output

In [24]:
get_autocorrect("276.89")

0


Unnamed: 0,word,Prob,Similarity
11025,K76.89,1.4e-05,0
19092,N76.89,1.4e-05,0
72255,Z76.89,1.4e-05,0
60,A06.89,1.4e-05,-1
170,A27.89,1.4e-05,-1


In [25]:
get_autocorrect("285.42")

0


Unnamed: 0,word,Prob,Similarity
72379,Z85.42,1.4e-05,0
526,A85.2,1.4e-05,-1
985,B85.2,1.4e-05,-1
987,B85.4,1.4e-05,-1
1796,C81.42,1.4e-05,-1


In [26]:
get_autocorrect("ROY.81")

-2


Unnamed: 0,word,Prob,Similarity
58,A06.81,1.4e-05,-2
88,A17.81,1.4e-05,-2
119,A18.81,1.4e-05,-2
169,A27.81,1.4e-05,-2
194,A32.81,1.4e-05,-2


In [29]:
get_autocorrect("ES5.9")

-1


Unnamed: 0,word,Prob,Similarity
49,A05.9,1.4e-05,-1
85,A15.9,1.4e-05,-1
163,A25.9,1.4e-05,-1
473,A75.9,1.4e-05,-1
558,A95.9,1.4e-05,-1


In [31]:
get_autocorrect("286.59")

0


Unnamed: 0,word,Prob,Similarity
17688,M86.59,1.4e-05,0
72439,Z86.59,1.4e-05,0
1883,C82.59,1.4e-05,-1
1953,C83.59,1.4e-05,-1
2110,C86.5,1.4e-05,-1
