<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/07_icd_code_matching_using_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

[PDF-to-TEXT](https://pypi.org/project/pdftotext/)

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

#!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2

Restart the Colab runtime once the installs above have finished, then continue with the cells below.

In [4]:
import pandas as pd
import numpy as np
import re
import time
import os

import pdb

import cv2 
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk

## PDF text extraction

In [27]:
!mkdir pdf-files
!mkdir txt-files

In [28]:
# Output folders: single-page PDFs and the text extracted from them.
pdf_files_path, txt_files_path = "pdf-files", "txt-files"

def split_pdf(pdf_path):
  """Split a PDF into one single-page PDF per page.

  Every page is written to ``{pdf_files_path}/pdf-page-{n}.pdf`` where ``n``
  is the zero-based page index.

  Args:
    pdf_path: path of the PDF file to split.

  Returns:
    The generated file names (without the directory part), in page order.
  """
  pdf_list = []
  # The context manager closes the source PDF even when a page fails to write.
  with open(pdf_path, "rb") as pdf_in_file:
    # A single reader serves every page; the PDF is parsed only once.
    pdf = PdfFileReader(pdf_in_file)
    for page in range(pdf.numPages):
      output = PdfFileWriter()
      output.addPage(pdf.getPage(page))
      page_name = f"pdf-page-{page}.pdf"
      with open(f"{pdf_files_path}/{page_name}", "wb") as outputStream:
        output.write(outputStream)
      pdf_list.append(page_name)
  return pdf_list

In [29]:
def extract_text_from_pdf(pdf_list):
  """Extract the text of each PDF in ``pdf_list`` into its own text file.

  PDFs are read from ``pdf_files_path``. The text of the i-th entry is written
  to ``{txt_files_path}/pdf-page-{i}.txt``.

  Args:
    pdf_list: PDF file names relative to ``pdf_files_path``.

  Returns:
    The paths of the text files written, in the same order as ``pdf_list``.
  """
  txt_file_list = []
  for i, pdf_file in enumerate(pdf_list):
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)
      # Read all the text into one string
      pdf_text = "\n\n".join(pdf)

    txt_path = f"{txt_files_path}/pdf-page-{i}.txt"
    # "w" rather than "a": re-running the extraction must replace the file,
    # otherwise each run appends another copy of the page text.
    with open(txt_path, "w") as f:
      f.write(pdf_text)
    txt_file_list.append(txt_path)
  return txt_file_list

In [None]:
# Split the sample PDF into one single-page PDF per page.
pdf_list = split_pdf("Redacted_Sample_2.pdf")

In [None]:
# Extract the text of every single-page PDF into its own text file.
txt_list = extract_text_from_pdf(pdf_list)

In [None]:
txt_list[:10]

['txt-files/pdf-page-0.txt',
 'txt-files/pdf-page-1.txt',
 'txt-files/pdf-page-2.txt',
 'txt-files/pdf-page-3.txt',
 'txt-files/pdf-page-4.txt',
 'txt-files/pdf-page-5.txt',
 'txt-files/pdf-page-6.txt',
 'txt-files/pdf-page-7.txt',
 'txt-files/pdf-page-8.txt',
 'txt-files/pdf-page-9.txt']

In [None]:
txt_list[:10]

['txt-files/pdf-page-0.txt',
 'txt-files/pdf-page-1.txt',
 'txt-files/pdf-page-2.txt',
 'txt-files/pdf-page-3.txt',
 'txt-files/pdf-page-4.txt',
 'txt-files/pdf-page-5.txt',
 'txt-files/pdf-page-6.txt',
 'txt-files/pdf-page-7.txt',
 'txt-files/pdf-page-8.txt',
 'txt-files/pdf-page-9.txt']

## spaCy entity rule-matcher

In [None]:
# A blank English pipeline is all the entity ruler needs. Loading
# en_core_web_sm first was dead work: the result was overwritten immediately.
nlp = English()
ruler = nlp.add_pipe("entity_ruler")

In [None]:
# Load the ICD-10 code list; downstream code reads the codes from the "ICD-10" column.
icd_code_df = pd.read_csv("icd_10_codes.csv")
icd_code_df.head()

Unnamed: 0,ICD-10
0,A00.0
1,A00.1
2,A00.9
3,A01.00
4,A01.01


In [6]:
def make_icd_10_code_pattern(icd_10_code_df):
  """Build entity-ruler patterns from a frame of ICD-10 codes.

  Args:
    icd_10_code_df: DataFrame with an "ICD-10" column holding the codes.

  Returns:
    One ``{"label": "ICD-10", "pattern": <code>}`` dict per row, in row order.
  """
  # Iterate the column directly: iterrows() builds a full Series for every
  # row, which is needlessly slow for a long code list.
  return [
      {"label": "ICD-10", "pattern": code}
      for code in icd_10_code_df["ICD-10"].tolist()
  ]

In [None]:
# Turn the code list into entity-ruler patterns and preview the first ten.
patterns = make_icd_10_code_pattern(icd_code_df)
patterns[:10]

[{'label': 'ICD-10', 'pattern': 'A00.0'},
 {'label': 'ICD-10', 'pattern': 'A00.1'},
 {'label': 'ICD-10', 'pattern': 'A00.9'},
 {'label': 'ICD-10', 'pattern': 'A01.00'},
 {'label': 'ICD-10', 'pattern': 'A01.01'},
 {'label': 'ICD-10', 'pattern': 'A01.02'},
 {'label': 'ICD-10', 'pattern': 'A01.03'},
 {'label': 'ICD-10', 'pattern': 'A01.04'},
 {'label': 'ICD-10', 'pattern': 'A01.05'},
 {'label': 'ICD-10', 'pattern': 'A01.09'}]

In [None]:
# Register every ICD-10 pattern with the entity ruler.
ruler.add_patterns(patterns)

In [None]:
# Compare the column to the code. The bracket previously enclosed the
# comparison itself, which reduced to icd_code_df[False] and raised KeyError.
icd_code_df.loc[icd_code_df["ICD-10"] == "Z00.0"]

In [None]:
# Run the pipeline on one extracted page and list the entities it finds.
with open(f"{txt_files_path}/pdf-page-102.txt", "r") as f:
  one_txt = f.read()

# The file is no longer needed once its text is in memory.
print(one_txt)
doc = nlp(one_txt)
print(list(map(lambda ent: (ent.text, ent.label_), doc.ents)))

## Performance Testing

In [48]:
# Reduce a page path to its bare stem: drop the folder, then the extension.
my_txt = "txt-files/page-0.txt".split("/")[1]
my_txt = my_txt.split(".")[0]
my_txt

'page-0'

In [50]:
int(my_txt.split("-")[1])

0

In [51]:
# Everything before the first dot of the file name.
pdf_file_name = "Redacted_Sample_2.pdf"
pdf_file_name.partition(".")[0]

'Redacted_Sample_2'

In [30]:
# Start from a fresh blank English pipeline. Loading en_core_web_sm first
# was dead work: the result was overwritten immediately.
nlp = English()

In [9]:
# Build the ICD-10 patterns from the CSV code list.
icd_code_df = pd.read_csv("icd_10_codes.csv")
patterns = make_icd_10_code_pattern(icd_code_df)

# Attach an entity ruler to the pipeline and register the patterns with it.
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# Persist the patterns as JSONL so a later cell can reload them with from_disk.
ruler.to_disk("./icd10_code_patterns.jsonl")

In [None]:
%%time

# Step-1: spliting pdf file
pdf_list = split_pdf("Redacted_Sample_2.pdf")

# Step-2: Extracting text from pdf
txt_list = extract_text_from_pdf(pdf_list)

# Step-3: loading patterns to Spacy
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns.jsonl")

# Step-4: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())
    print(f"Page[{txt_file.split('/')[1]}]: ", [(ent.text, ent.label_) for ent in doc.ents])

In [36]:
# The pattern expects "L" directly after the comma, so "3, L14" does not match.
bool(re.search("[0-9],L", "3, L14"))

False

In [38]:
# Raw string: "\s" in a normal string literal is an invalid escape sequence
# (a warning today, slated to become an error).
True if re.search(r"(P[ ][0-9]+)(,\s)(L[0-9]+)", "P 13, L17") else False

True

In [41]:
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
pattern = re.compile(r"(P[ ][0-9]+)(,\s)(L[0-9]+)")

# Report each match of the form "P 60, L23" with its 1-based line number.
# The context manager closes the file, which the bare open() never did.
with open("txt-files/pdf-page-3.txt") as page_file:
  for i, line in enumerate(page_file):
    for match in pattern.finditer(line):
      print('Found on line %s: %s' % (i+1, match.group()))

Found on line 2: P 60, L23
Found on line 3: P 61, L14
Found on line 4: P 56, L19
Found on line 7: P 54, L19
Found on line 8: P 49, L5
Found on line 9: P 39, L6
Found on line 10: P 37, L9
Found on line 11: P 35, L14
Found on line 12: P 27, L24
Found on line 13: P 21, L23
Found on line 15: P 11, L20
Found on line 16: P 5, L39
Found on line 18: P 3, L24
Found on line 22: P 77, L38
Found on line 27: P 86, L19
Found on line 29: P 66, L19
Found on line 30: P 29, L19
Found on line 31: P 21, L16
Found on line 32: P 12, L18
Found on line 33: P 3, L22
Found on line 34: P 3, L22
Found on line 38: P 3, L17
Found on line 41: P 79, L31


In [42]:
# Pair each number with the letter at the same position and print one pair per line.
my_list = [1, 2, 3, 4]
char_list = ["A", "B", "C", "D"]

print("\n".join(f"{n} {ch}" for n, ch in zip(my_list, char_list)))

1 A
2 B
3 C
4 D


In [32]:
# Step-5: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())
    code_list = [ent.text for ent in doc.ents]
    if len(code_list) != 0:
      print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
    #print([f"Page[{txt_file.split('/')[1]}]: {(ent.text, ent.label_)}" for ent in doc.ents if ent is not None])

Page[pdf-page-2.txt]: ['L14']
Page[pdf-page-3.txt]: ['L14', 'L14', 'L22', 'L22']
Page[pdf-page-4.txt]: ['L26', 'L26', 'L26', 'L26', 'L26', 'L22', 'L14', 'L22', 'L14']
Page[pdf-page-6.txt]: ['L14', 'L14']
Page[pdf-page-7.txt]: ['L14', 'L14', 'L14', 'L14']
Page[pdf-page-8.txt]: ['L26', 'L26', 'L26']
Page[pdf-page-9.txt]: ['L26', 'L26', 'L22', 'L26', 'L22', 'L26', 'L22', 'L22', 'L22']
Page[pdf-page-10.txt]: ['L22', 'L26', 'L14']
Page[pdf-page-11.txt]: ['L22', 'L22', 'L22']
Page[pdf-page-12.txt]: ['L14']
Page[pdf-page-18.txt]: ['M54.40']
Page[pdf-page-19.txt]: ['G43.109']
Page[pdf-page-26.txt]: ['D17.1', 'F43.9']
Page[pdf-page-37.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30', 'E88.2']
Page[pdf-page-39.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30']
Page[pdf-page-42.txt]: ['G43.109', 'R00.2', 'G43.109', 'D72.819', 'R00.2']
Page[pdf-page-74.txt]: ['M54.2', 'R42']
Page[pdf-page-82.txt]: ['G43.109', 'R00.2', 'L25.9', 'D72.819']
Page[pdf-page-84.txt]: ['G43.109']
Page[pdf-page-85.txt]: ['L25.9

In [None]:
# Step-5: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())
    print(f"Page[{txt_file.split('/')[1]}]: ", [(ent.text, ent.label_) for ent in doc.ents if ent is not None])
    #print([f"Page[{txt_file.split('/')[1]}]: {(ent.text, ent.label_)}" for ent in doc.ents if ent is not None])
    for ent in doc.ents:
      if ent is not None:
        print(ent.text)

## Whole PDF Keyword Searching

In [7]:
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
True if re.search(r"([0-9])(,\s)\s*(L[0-9])", "P 3, L14") else False

True

In [107]:
# "L12.6778" has a dot where the pattern demands a non-dot character, so there is no match.
re.match(r"L[0-9]([^\.])[0-9]", "L12.6778") is not None

False

In [None]:
# Start from a fresh blank English pipeline. Loading en_core_web_sm first
# was dead work: the result was overwritten immediately.
nlp = English()

In [None]:
# creating patterns jsonl file
icd_code_df = pd.read_csv("icd_10_codes.csv")
patterns = make_icd_10_code_pattern(icd_code_df)

ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# save jsonl file
ruler.to_disk("./icd10_code_patterns.jsonl")

In [121]:
%%time

# Step-1: Load your PDF
with open("Redacted_Sample_2.pdf", "rb") as f:
  pdf = pdftotext.PDF(f)
pdf_text = "\n\n".join(pdf)

# Step-2: write text into file
with open("ocr-extracted.txt", "w") as f:
  f.write(pdf_text)

# Step-3: loading patterns to Spacy
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns.jsonl")

# Step-4: Searching ICD-10 code
with open(f"ocr-extracted.txt", "r") as f:
  single_txt_file = f.read()

  print(len(single_txt_file))

  doc = nlp(single_txt_file)
  code_list = [ent.text for ent in doc.ents]
  if len(code_list) != 0:
    print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
    #print([(ent.text, ent.label_) for ent in doc.ents])

726738
Page[pdf-page-136.txt]: ['L14', 'L14', 'L14', 'L22', 'L22', 'L26', 'L26', 'L26', 'L26', 'L26', 'L22', 'L14', 'L22', 'L14', 'L14', 'L14', 'L14', 'L14', 'L14', 'L14', 'L26', 'L26', 'L26', 'L26', 'L26', 'L22', 'L26', 'L22', 'L26', 'L22', 'L22', 'L22', 'L22', 'L26', 'L14', 'L22', 'L22', 'L22', 'L14', 'M54.40', 'G43.109', 'D17.1', 'F43.9', 'G43.109', 'D72.819', 'R00.2', 'R10.30', 'E88.2', 'G43.109', 'D72.819', 'R00.2', 'R10.30', 'G43.109', 'R00.2', 'G43.109', 'D72.819', 'R00.2', 'M54.2', 'R42', 'G43.109', 'R00.2', 'L25.9', 'D72.819', 'G43.109', 'L25.9', 'D72.819', 'G43.109', 'R00.2', 'L25.9', 'Z00.00', 'G43.109', 'R00.2', 'L25.9']
CPU times: user 760 ms, sys: 6.48 ms, total: 766 ms
Wall time: 765 ms
