<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/13_icd_10_code_and_keyword_spell_correction_highliting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

[PDF-to-TEXT](https://pypi.org/project/pdftotext/)

[Fitz Exact Match](https://stackoverflow.com/questions/64536027/selecting-the-exact-match-using-pymupdf-page-searchfor)

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

Just restart the colab environment.

In [1]:
import pandas as pd
import numpy as np
import re
import time
import os
import glob

import pdb

import fitz
import cv2 
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.language import Language
from spacy.tokens import Doc

from concurrent import futures

import nltk

In [2]:
!mkdir -p pdf-files
!mkdir -p txt-files
!mkdir -p ocr-pdf-files

In [3]:
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"

##Define some functions

In [4]:
def split_pdf(pdf_path):
  pdf_in_file = open(pdf_path, "rb")
  pdf = PdfFileReader(pdf_in_file)
  pdf_list = []
  for page in range(pdf.numPages):
      inputpdf = PdfFileReader(pdf_in_file)
      output = PdfFileWriter()
      output.addPage(inputpdf.getPage(page))
      with open(f"{pdf_files_path}/page-{page}.pdf", "wb") as outputStream:
          output.write(outputStream)
          pdf_list.append(f"page-{page}.pdf")
  return pdf_list

In [5]:
def extract_text_from_pdf(pdf_list):
  txt_file_list = []
  i = 0
  for pdf_file in pdf_list:
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)
    
    # Read all the text into one string
    pdf_text = "\n\n".join(pdf)

    # write text into file
    with open(f"{txt_files_path}/page-{str(i)}.txt", "a") as f:
      f.write(pdf_text)
    txt_file_list.append(f"{txt_files_path}/page-{str(i)}.txt")
    i += 1
  return txt_file_list

In [6]:
def get_opt_pattern(icd_10_code):
  # create alternate pattern
  code_arr = icd_10_code.split(".")
  if len(code_arr) > 1:
    code1 = f"{code_arr[0]}. {code_arr[1]}"
    code2 = f"{code_arr[0]} .{code_arr[1]}"
    code3 = f"{code_arr[0]} . {code_arr[1]}"
    return [code1, code2, code3]
  else:
    return icd_10_code

In [70]:
def isExactMatch(page, term, clip, fullMatch=False, caseSensitive=False):
  # clip is an item from page.search_for(term, quads=True)
  termLen = len(term)
  termBboxLen = max(clip.height, clip.width)
  termfontSize = termBboxLen/termLen
  f = termfontSize*2

  #clip = clip.rect

  validate = page.get_text("blocks", clip = clip + (-f, -f, f, f), flags=0)[0][4]
  flag = 0
  if not caseSensitive:
      flag = re.IGNORECASE

  matches = len(re.findall(f'{term}', validate, flags=flag)) > 0
  if fullMatch:
      matches = len(re.findall(f'\\b{term}\\b', validate))>0
  return matches

def highlight_icd_code_and_keyword(pdf_code_dict, page_keyword_dict=None, pdf_file_name=None, cords_file_name=None, code_type="ICD-10"):
  pdf_file = fitz.open(pdf_file_name)

  def highlight_pdf(highlight, icd10_code):
    cords_list = []
    for inst in highlight:
      highlight = page.add_highlight_annot(inst)
      if code_type == "ICD-9":
        highlight.set_colors(stroke=[1, 0.5, 0.8]) # light red color (r, g, b)
      highlight.update()
      highlight = page.search_for(icd10_code)
      cords_list.append(highlight)
    code_cors_output = f"Page-{page_num}: {icd10_code} : {cords_list}"
    txt_output_file_name.write("%s\n" % code_cors_output)

  # create file to write cordinate 
  txt_output_file_name = open(cords_file_name, "a")

  for page_num, page in enumerate(pdf_file):

    # highlight code
    if page_num in pdf_code_dict:
      for code in pdf_code_dict[page_num]:
        highlight = page.search_for(code)
        if len(highlight) == 0:
          alternate_code_list = get_opt_pattern(code)
          for alt_code in alternate_code_list:
            highlight = page.search_for(alt_code)
            # highlight pdf for option pattern
            highlight_pdf(highlight, alt_code)
        # highlight pdf for main pattern   
        highlight_pdf(highlight, code)

    # highlight keyword
    if page_keyword_dict is not None:
      if page_num in page_keyword_dict:
        for keyword in page_keyword_dict[page_num]:
          coordinates = page.search_for(keyword)
          #print(f"Keyword: {keyword}, Length: {len(coordinates)}")
          cords_list = []
          for inst in coordinates:
            #print(f"Keyword: {keyword}, inst: {inst}")
            #if isExactMatch(page, keyword, inst, fullMatch=True, caseSensitive=False):
            highlight = page.add_highlight_annot(inst)
            highlight.set_colors(stroke=[1, 0.8, 0.8])
            highlight.update()
            highlight = page.search_for(keyword)
            cords_list.append(highlight)
          keyword_cors_output = f"Page-{page_num}: {keyword} : {cords_list}"
          txt_output_file_name.write("%s\n" % keyword_cors_output)
          #print(f"Page-{page_num}: ", highlight, end='\n')

  txt_output_file_name.close()

  pdf_output_file_name = f"{pdf_file_name.split('.')[0]}_output.pdf"
  pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)

  return pdf_output_file_name, cords_file_name

In [None]:
def handle_speacial_chars(doc_str):
  regex = re.compile('[,]')
  if(regex.search(doc_str) != None):
    updated_str = doc_str.replace(",", ".")
  return updated_str

In [None]:
handle_speacial_chars("Decreased white blood cell count, unspecified")

'Decreased white blood cell count. unspecified'

In [None]:
@Language.component("custom_comma_remover")
def remove_comma_from_keyword(doc):
  token_list = []
  for index, token in enumerate(doc):
    # skip the loop if token contains "." or ","
    if token.text == '.' or token.text == ',':
      continue

    # replace comma with space otherwise not
    if "." in token.text:
      token_list.append(token.text.replace(".", ""))
    else:
      token_list.append(token.text)

  return Doc(doc.vocab, words=token_list)

In [None]:
def make_icd_keyword_pattern2(icd_10_keyword_df, nlp=None):

  keywords = [row["Keyword"] for _, row in icd_10_keyword_df.iterrows()]
  
  phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
  patterns = list(nlp.tokenizer.pipe(keywords))
  phrase_matcher.add('keywords', patterns)

  nlp.add_pipe("custom_comma_remover")

  return phrase_matcher

In [None]:
def make_icd_keyword_pattern(icd_10_keyword_df, nlp=None):
  keywords = []
  for _, row in icd_10_keyword_df.iterrows():
    keyword = row["Keyword"]
    keywords.append(keyword)
    # replace comma(,) with dot(.) and space and add extra two keyword
    regex = re.compile('[,]')
    if(regex.search(keyword) != None):
      keywords.append(keyword.replace(",", "."))
      keywords.append(keyword.replace(",", ""))
  
  phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER", )
  patterns = list(nlp.tokenizer.pipe(keywords))
  phrase_matcher.add('keywords', patterns)

  regex = re.compile('[@_!#$%^&*()<>?/\|}{~:.,]')
  reg_patterns = [{"TEXT": {"REGEX": regex}}]
  # phrase_matcher.add('reg_keywords', reg_patterns)
  # phrase_matcher.add('keywords', patterns)
  return phrase_matcher

In [8]:
def filter_unwanted_code(code_list, page_text):
  filtered_code_list = []
  #if re.search("ICD", page_text):
  #match_list = re.findall("(ICD-[0-9][a-zA-z]*\-.+)[ ]", page_text)
  match_list = re.findall("(IC[(A-z)]-[0-9][a-zA-z]*\-.+)[ ]", page_text)
  #print("Match list:\n", match_list)
  for found_code in match_list:
    for code in code_list:
      if code in found_code:
        filtered_code_list.append(code)
  return filtered_code_list

def search_icd_code(txt_list, nlp, code_type):
  pdf_page_vocab = {}
  for txt_file in txt_list:
    with open(txt_file, "r") as f:
      page_txt = f.read()
      # filter the page that have line number instead of code
      if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
        doc = nlp(page_txt)
        code_list = [ent.text for ent in doc.ents]
        page_number = 0
        if len(code_list) != 0:
          page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
          pdf_page_vocab[page_number] = code_list
          # print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
        
        # filter the page that dont have ICD string into it
        if code_type == "ICD-9":
          filtered_code_list = filter_unwanted_code(code_list, page_txt)
          pdf_page_vocab[page_number] = filtered_code_list
          # print(f"Page[{txt_file.split('/')[1]}]: {filtered_code_list}")

  return pdf_page_vocab

In [None]:
def search_icd_keyword(txt_list, phrase_matcher, nlp=None):
  page_keyword_dict = {}
  # Step-4: Searching ICD-10 code
  for txt_file in txt_list:
    with open(txt_file, "r") as f:
      page_txt = f.read()
      doc = nlp(page_txt)
      matches = phrase_matcher(doc)

      keyword_list = []
      for match_id, start, end in matches:
        span = doc[start: end]
        keyword_list.append(f"{span}")

      if len(keyword_list) != 0:
        page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
        page_keyword_dict[page_number] = set(keyword_list)
        # print(f"Page[{txt_file.split('/')[1]}]: {set(keyword_list)}")
  return page_keyword_dict

In [9]:
def purge(file_path):
  for f in glob.glob(file_path):
    os.remove(f)

##Spell Correction

In [10]:
import difflib
import re, os
import pickle

# keyword_file_path = "keyword_impairment_v1.txt"
keyword_file_path = "icd_10_keywords.txt"

def spell_check(file_path, word_dict_, keywords_set):
    """Given file_path
        Returns dictionary with
        keys as incorrect word
        and value as correct word"""

    with open( file_path, encoding="utf8") as f:
        page = f.readlines()
    # txt_words = list(map(lambda x:x.lower().strip() , page))

    txt_words = []
    for line in page:
        line = re.sub(r'[^\w\s]', ' ', line)

        # line = line.translate(str.maketrans('', '', string.punctuation))
        if len(line.strip()) > 3:
            for word in line.split():
                if len(word) > 3 and word.isalpha():
                    txt_words.append(word.lower().strip())

    unusual_words = []
    for word in set(txt_words):
        # res=dictionary.meaning(word, disable_errors=True)
        # if res is None:
        #    unusual_words.append(word)
        if word in word_dict_:
            pass
        else:
            unusual_words.append(word)
    spell_change = {}
    for word in set(unusual_words):
        match = difflib.get_close_matches(word, keywords_set, cutoff=0.85)
        if match != []:
            if match[0] == word:  # or match[0][:-1] == word or match[0] == word[:-1] or match[0][1:] == word or match[0] == word[1:]:
                pass
                # print(word , 'spelling is correct')
            else:
                spell_change[word] = match[0]
        #               print(word, '=='*8, match[0])
        else:
            pass
            # print(word, 'no match found')

    return spell_change


def index_spell(page, spell_dict):
    """Returns index for lines where
    spellings are found wrong """

    index_ = set()
    for idx_ , p in enumerate(page):

        for pp in p.split():
            pp = pp.lower()
            pp = re.sub(r'[^\w\s]', '', pp)
            if pp in spell_dict.keys():
                index_.add(idx_)
    return index_

def replace_spelling(pages, spell_dict):
    """Replaces spellings
     given pages and spell_dict"""
    new_pages = []
    for page in pages:
        page = page.lower()
        for k,v in spell_dict.items():
            page = re.sub(k, v, page, flags = re.I)
        new_pages.append(page)
    return new_pages


def load_keywords(filepath=keyword_file_path):
    """loads keywords"""
    with open(filepath) as f:
        keywords = f.readlines()
    return keywords


def load_file(file_path):
    """load files"""
    with open(file_path, encoding='utf-8') as f:
        page = f.readlines()
    return page


def page_preprocess(page):
    page_update = [" ".join(p.split()) for p in page]
    page_update = list(map(str.lower, page_update))
    return page_update


def print_matches(page_update, keywords, spell_dict):
    matches = []
    match_index = []
    true_match = []
    for j in range(len(page_update)):
        pg_update = page_update[j]
        page_update[j] = re.sub(r'[^\w\s]', ' ', page_update[j])
        x = page_update[j].replace(',', '')
        x = x.lower()
        x = x.split()

        for i, key in enumerate(keywords):
            old = key
            key = re.sub(r'[^\w\s]', ' ', key)
            key = key.replace(',', '').lower()
            key = key.split()
            s = difflib.SequenceMatcher(None, x, key)
            match = s.find_longest_match(0, len(x), 0, len(key))
            if match.b == 0:
                if match.size == len(key):
                    #                     print('*'*50)
                    #                     print("matched keyword ------", old)
                    #                     print('*'*50)
                    match_index.append(j)
                    matches.append(old.strip())
                    true_match.append(pg_update)
    return matches, match_index, true_match


def true_file_match(match_phrase, match_index, index_spell_, spell_dict_reverse):
    true_phrase = []
    for idx_, mp in enumerate(match_phrase):
        mp_old = mp

        if match_index[idx_] in index_spell_:
            for k, v in spell_dict_reverse.items():
                mp = re.sub(k, v, mp, flags=re.I)
            if mp_old == mp:
                true_phrase.append('')
            else:
                true_phrase.append(mp)
        else:
            true_phrase.append("")
    return true_phrase

def generate_json(org_match,phrases ):
    json_array = []
    for i, j in zip( org_match[0], phrases[0]):
        if i == '':
            n_dict = {j.lower():j}
        else:
            n_dict = {i.lower():j}
        json_array.append(n_dict)
    return json_array

# Need to include this function in your call
def call(txt_file_path):
    """Given txt_file_path
      returns json array
       {["org":"match", "org2":"match2"]}
       if match not found returns [] #empty list
       """
    with open('dict_words.pkl', 'rb') as f:
        word_dict_ = pickle.load(f)

    # keywords file
    with open(keyword_file_path) as f:
        keywords = f.readlines()

    # lower the keywords, replaces ',' with ''
    key_full = list(map(lambda x: x.lower().strip(), keywords))
    # key_full = list(map(lambda x: x.strip(), keywords))
    keywords = []
    for line in key_full:
        line = line.replace(',', '')
        keywords += line.split()

    # filter words with length smaller the 3
    keywords_set = list(filter(lambda x: len(x) > 3, set(keywords)))

    phrases, total_dict, m_index, s_index, org_match = [], [], [], [], []
    keywords_match = load_keywords()

    #dir_ = "input"

    names = [txt_file_path]

    for enu, name in enumerate(names):

        spell_dict = spell_check(name, word_dict_, keywords_set)
        page = load_file(os.path.join(name))
        new_pages = replace_spelling(page, spell_dict)
        page_update = page_preprocess(new_pages)
        spell_match = index_spell(page, spell_dict)

        match_phrase, match_index, true_pg = print_matches(page_update, keywords_match, spell_dict)
        spell_dict_reverse = {v: k for k, v in spell_dict.items()}
        true_match = true_file_match(match_phrase, match_index, spell_match, spell_dict_reverse)
        phrases.append(match_phrase)
        total_dict.append(spell_dict)
        m_index.append(match_index)
        s_index.append(spell_match)
        org_match.append(true_match)

    file_names = []
    phrases_df = []
    org_match_df = []
    for i in range(len(names)):
        if phrases[i] != []:

            for j in range(len(phrases[i])):
                file_names.append(names[i])
                phrases_df.append(phrases[i][j])
                org_match_df.append(org_match[i][j])

    return generate_json(org_match, phrases)

In [None]:
%%time

# txt-files/page-37.txt
json_arr = call("txt-files/page-0.txt")

CPU times: user 1min 7s, sys: 196 ms, total: 1min 7s
Wall time: 1min 8s


In [None]:
json_arr

[{'migraine with aura, not intractable, without status migrainosus': 'Migraine with aura, not intractable, without status migrainosus'},
 {'decreased white blood cell count, unspecified': 'Decreased white blood cell count, unspecified'},
 {'polpitations': 'Palpitations'},
 {'lower abdominal pain, unspecified': 'Lower abdominal pain, unspecified'},
 {'pain, unspecified': 'Pain, unspecified'},
 {'lipamatosis, not elsewhere classified': 'Lipomatosis, not elsewhere classified'},
 {'overweight': 'Overweight'},
 {'overweight': 'Overweight'}]

In [None]:
wrong_keyword_dict = {}
def get_wrong_keyword_dict(text_path_list):
  
  for idx, file_path in enumerate(text_path_list):
    print(idx, file_path)
    json_arr = call(file_path)
    print("Got json")
    wrong_keyword_list = [list(element.keys())[0] for element in json_arr]
    wrong_keyword_dict[idx] = set(wrong_keyword_list)
  return wrong_keyword_dict

In [None]:
%%time

wrong_keyword_dict = get_wrong_keyword_dict(["txt-files/page-37.txt"])
wrong_keyword_dict

##Test Code

In [14]:
def get_json_array_list(text_path):
  print(f"Running '{text_path}'")
  json_arr = call(text_path)
  print(f"Got json for '{text_path}'")
  return json_arr

In [None]:
MAX_WORKERS = 4

def get_wrong_keyword_list(text_path_list):
  with futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
    response = executor.map(get_json_array_list, sorted(text_path_list))
  return list(response)

In [50]:
MAX_WORKERS = 20

def get_wrong_keyword_dict(text_path_list):
  wrong_keyword_dict = {}

  # take care so that unnecessary thread should not be created
  workers = min(MAX_WORKERS, len(text_path_list))
  with futures.ThreadPoolExecutor(workers) as executor:
    wrong_keyword_dict_list = executor.map(get_json_array_list, sorted(text_path_list))

  for idx, json_arr in enumerate(wrong_keyword_dict_list):
    wrong_keyword_list = [list(element.values())[0] for element in json_arr]
    if wrong_keyword_list: 
      wrong_keyword_dict[idx] = set(wrong_keyword_list)
  print("Before sorting:\n", wrong_keyword_dict)
  return dict(sorted(wrong_keyword_dict.items(), key=lambda item: item[0]))

In [None]:
wrong_keyword_dict2 = get_wrong_keyword_dict(["txt-files/page-37.txt", "txt-files/page-38.txt"])

Raw output:
 [[{'migraine with aura, not intractable, without status migrainosus': 'Migraine with aura, not intractable, without status migrainosus'}, {'decreased white blood cell count, unspecified': 'Decreased white blood cell count, unspecified'}, {'palpitations': 'Palpitations'}, {'lower abdominal pain, unspecified': 'Lower abdominal pain, unspecified'}, {'pain, unspecified': 'Pain, unspecified'}, {'lipomatosis, not elsewhere classified': 'Lipomatosis, not elsewhere classified'}, {'overweight': 'Overweight'}, {'overweight': 'Overweight'}], [{'palpitationsnot': 'Palpitations'}, {'palpitations': 'Palpitations'}, {'lipomatosis, not elsewhere classifiedyou': 'Lipomatosis, not elsewhere classified'}]]
Before sorting:
 {0: {'Lipomatosis, not elsewhere classified', 'Lower abdominal pain, unspecified', 'Decreased white blood cell count, unspecified', 'Pain, unspecified', 'Migraine with aura, not intractable, without status migrainosus', 'Palpitations', 'Overweight'}, 1: {'Lipomatosis, not 

In [None]:
wrong_keyword_dict2

In [15]:
# Step-0: Load prerequisite instance
# create nlp instance
nlp_keyword = spacy.load('en_core_web_sm')

# loading and updating patterns for ICD-10 code
nlp_code10 = English()
nlp_code10.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v3.jsonl")

# loading and updating patterns for ICD-9 code
nlp_code9 = English()
nlp_code9.add_pipe("entity_ruler").from_disk("./icd9_code_patterns-v1.jsonl")

# Load icd_10_code_keywords.csv file
# icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")

# Creating ICD-10 keyword pattern
# phrase_matcher = make_icd_keyword_pattern(icd_code_kerword_df, nlp_keyword)

<spacy.pipeline.entityruler.EntityRuler at 0x7f65f72b5e60>

In [76]:
%%time

ocr_pdf_files_path = "ocr-pdf-files"
for pdf_file in os.listdir(ocr_pdf_files_path):
  pdf_file_name = f"{ocr_pdf_files_path}/{pdf_file}"
  cords_file_name = f"{pdf_file_name.split('.')[0]}_cords.txt"

  # Step-1: splitting pdf file
  pdf_list = split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  page_code_dict = search_icd_code(txt_list, nlp_code10, code_type="ICD-10")

  # Step-4: Searching ICD-10 keyword
  # page_keyword_dict = search_icd_keyword(txt_list, phrase_matcher, nlp_keyword)
  # wrong_keyword_dict = get_wrong_keyword_dict(txt_list)
  wrong_keyword_dict = get_wrong_keyword_dict(txt_list)
  print("After sorting:\n", wrong_keyword_dict)

  # Step-7: Highlighting ICD-10 code and keyword into pdf
  pdf_output_file, txt_output_file = highlight_icd_code_and_keyword(page_code_dict, wrong_keyword_dict, pdf_file_name, cords_file_name)
  print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code and keyword")
  print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

  # Step-8: Searching ICD-9 code
  page_code9_dict = search_icd_code(txt_list, nlp_code9, code_type="ICD-9")

  # Step-9: Highlighting ICD-9 code into pdf
  output_file_name = highlight_icd_code_and_keyword(page_code9_dict, page_keyword_dict=None, pdf_file_name=pdf_output_file, cords_file_name=cords_file_name, code_type="ICD-9")
  print(f"File[{output_file_name}] is saved after highlighting ICD-9 code")
  
  # remove all pdf and text files
  purge("pdf-files/*.pdf")
  purge("txt-files/*.txt")
  pdf_list = []
  txt_list = []

Running 'txt-files/page-0.txt'
Running 'txt-files/page-1.txt'
Running 'txt-files/page-10.txt'
Running 'txt-files/page-100.txt'
Running 'txt-files/page-101.txt'
Running 'txt-files/page-102.txt'
Running 'txt-files/page-103.txt'
Running 'txt-files/page-104.txt'
Running 'txt-files/page-105.txt'
Running 'txt-files/page-106.txt'Running 'txt-files/page-107.txt'Running 'txt-files/page-108.txt'


Running 'txt-files/page-109.txt'
Running 'txt-files/page-11.txt'Running 'txt-files/page-110.txt'

Running 'txt-files/page-111.txt'
Running 'txt-files/page-112.txt'Running 'txt-files/page-113.txt'
Running 'txt-files/page-114.txt'

Running 'txt-files/page-115.txt'
Got json for 'txt-files/page-1.txt'
Running 'txt-files/page-116.txt'
Got json for 'txt-files/page-108.txt'
Running 'txt-files/page-117.txt'
Got json for 'txt-files/page-105.txt'
Running 'txt-files/page-118.txt'
Got json for 'txt-files/page-113.txt'
Running 'txt-files/page-119.txt'
Got json for 'txt-files/page-106.txt'
Running 'txt-files/page-12

In [74]:
!rm -rf ocr-pdf-files
!mkdir -p ocr-pdf-files

In [75]:
purge("ocr-pdf-files/*.txt")
purge("ocr-pdf-files/*_output.pdf")
purge("pdf-files/*.pdf")
purge("txt-files/*.txt")

In [None]:
!zip output.zip ocr-pdf-files/*_output_cords.txt ocr-pdf-files/*_output_output.pdf

In [86]:
file_path = "ocr-pdf-files"
file_name = os.path.basename("ocr-pdf-files/Redacted_Sample.pdf")
file = os.path.splitext(os.path.basename("ocr-pdf-files/Redacted_Sample.pdf"))

print(file_name)
print(file[0])  # returns tuple of string
 
print(file[0] + file[1])
print(os.path.join(file_path, file_name))

Redacted_Sample.pdf
Redacted_Sample
Redacted_Sample.pdf
ocr-pdf-files/Redacted_Sample.pdf


##Data preprocessing

In [None]:
columns = []
with open("icd_10_keywords.txt", "r") as f:
  txt_lines = f.readlines()
  for line in txt_lines:
    columns.append(line.strip("\n"))
print(columns[:10])

['Cholera due to Vibrio cholerae 01, biovar cholerae', 'Cholera due to Vibrio cholerae 01, biovar eltor', 'Cholera, unspecified', 'Typhoid fever, unspecified', 'Typhoid meningitis', 'Typhoid fever with heart involvement', 'Typhoid pneumonia', 'Typhoid arthritis', 'Typhoid osteomyelitis', 'Typhoid fever with other complications']


In [None]:
data_keyword_df = pd.DataFrame(columns, columns=["Keyword"])
data_keyword_df.head()

Unnamed: 0,Keyword
0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,"Cholera, unspecified"
3,"Typhoid fever, unspecified"
4,Typhoid meningitis


In [None]:
data_keyword_df.to_csv("icd_10_keywords.csv", index=False)

In [None]:
data_code_df = pd.read_csv("icd_10_codes.csv")
data_code_df.head()

Unnamed: 0,ICD-10
0,A00.0
1,A00.1
2,A00.9
3,A01.00
4,A01.01


In [None]:
data_code_df["ICD-10"].head()

0     A00.0
1     A00.1
2     A00.9
3    A01.00
4    A01.01
Name: ICD-10, dtype: object

In [None]:
data_keyword_df["Keyword"].head()

0    Cholera due to Vibrio cholerae 01, biovar chol...
1      Cholera due to Vibrio cholerae 01, biovar eltor
2                                 Cholera, unspecified
3                           Typhoid fever, unspecified
4                                   Typhoid meningitis
Name: Keyword, dtype: object

In [None]:
icd_code_kerword_df = pd.DataFrame().assign(Code=data_code_df['ICD-10'], Keyword=data_keyword_df['Keyword'])
icd_code_kerword_df.head()

Unnamed: 0,Code,Keyword
0,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,A00.9,"Cholera, unspecified"
3,A01.00,"Typhoid fever, unspecified"
4,A01.01,Typhoid meningitis


In [None]:
icd_code_kerword_df.to_csv("icd_10_code_keywords.csv", index=False)

In [None]:
icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")
icd_code_kerword_df.head()

Unnamed: 0,Code,Keyword
0,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,A00.9,"Cholera, unspecified"
3,A01.00,"Typhoid fever, unspecified"
4,A01.01,Typhoid meningitis


In [None]:
def make_icd_code_pattern(icd_code_df, code_type="ICD-10"):
  patterns = []
  for _, row in icd_code_df.iterrows():

    # add default pattern
    patterns.append({"label": code_type, "pattern": row["Code"]})

    # create alternate pattern
    code_patterns = []
    code_arr = row["Code"].split(".")
    if len(code_arr) > 1:
      code1 = f"{code_arr[0]}. {code_arr[1]}"
      code2 = f"{code_arr[0]} .{code_arr[1]}"
      code3 = f"{code_arr[0]} . {code_arr[1]}"
      code4 = f"{code_arr[0]} {code_arr[1]}"
      code_patterns.extend([code1, code2, code3, code4])
      # handle if the first char of code is missing
      alphabats = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6"}
      for key, val in alphabats.items():
        if code4.split()[0].startswith(key):
          code5 = code4.replace(key, val)
          code6 = row["Code"].replace(key, val)
          code_patterns.extend([code5, code6])

    # handle if the "." is missing
    if code_type == "ICD-9":
      code_arr = row["Code"].split(".")
      if len(code_arr) > 1:
        code7 = row["Code"].replace(".", "")
        code_patterns.extend([code7])

    for code_pattern in code_patterns:
      if len(code_pattern) > 1:
        patterns.append({"label": code_type, "pattern": code_pattern})
  return patterns

In [None]:
nlp = English()

In [None]:
icd_code_v2_df = pd.read_csv("icd_10_codes-v2.csv")
icd_code_v2_df = icd_code_v2_df.drop_duplicates()
patterns = make_icd_code_pattern(icd_code_v2_df)

ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# save to json file
ruler.to_disk("./icd10_code_patterns-v2.jsonl")

In [None]:
nlp.remove_pipe("entity_ruler")

In [None]:
icd9_code_v1_df = pd.read_csv("icd_9_codes-v1.csv")
patterns = make_icd_code_pattern(icd9_code_v1_df, "ICD-9")

ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# save to json file
ruler.to_disk("./icd9_code_patterns-v1.jsonl")

##All Steps Together

In [None]:
!rm -rf txt-files
!rm -rf pdf-files

In [None]:
# Step-1: spliting pdf file
pdf_file_name = "Redacted_Sample.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf
txt_list = extract_text_from_pdf(pdf_list)

In [None]:
txt_list[37]

'txt-files/page-37.txt'

In [None]:
nlp_code = English()
# Step-3: loading and updating patterns to Spacy
nlp_code.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v2.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x7fc6e8554730>

In [None]:
# Step-4: Searching ICD-10 code
page_code10_dict = search_icd_code(txt_list, nlp_code)

In [None]:
# Step-5: Creating ICD-10 keyword pattern
nlp_keyword = spacy.load('en_core_web_sm')
icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")
phrase_matcher = make_icd_keyword_pattern(icd_code_kerword_df, nlp_keyword)

In [None]:
# Step-6: Searching ICD-10 keyword
page_keyword_dict = search_icd_keyword(["txt-files/page-37.txt"], phrase_matcher, nlp_keyword)

In [None]:
page_keyword_dict

{37: {'Decreased white blood cell count. unspecified',
  'Migraine with aura, not intractable, without status migrainosus',
  'Palpitations',
  'lipomatosis, not elsewhere classified',
  'lower abdominal pain, unspecified',
  'overweight',
  'pain, unspecified'}}

In [None]:
wrong_keyword_list = [list(element.keys())[0] for element in json_arr]

In [None]:
wrong_keyword_list

['migraine with aura, not intractable, without status migrainosus',
 'decreased white blood cell count, unspecified',
 'polpitations',
 'lower abdominal pain, unspecified',
 'pain, unspecified',
 'lipamatosis, not elsewhere classified',
 'overweight',
 'overweight']

In [None]:
page_keyword_dict2 = {"37": set(wrong_keyword_list)}
page_keyword_dict2

{'37': {'decreased white blood cell count, unspecified',
  'lipamatosis, not elsewhere classified',
  'lower abdominal pain, unspecified',
  'migraine with aura, not intractable, without status migrainosus',
  'overweight',
  'pain, unspecified',
  'polpitations'}}

In [None]:
# Step-7: Highlighting ICD-10 code and keyword into pdf
output_file_name = highlight_icd_code_and_keyword(page_code10_dict, page_keyword_dict, pdf_file_name, code_type="ICD-10")
print(f"File[{output_file_name}] is saved after highlighting ICD-10 code and keyword")

File[('9929_final_output_output.pdf', '9929_final_output_cords.txt')] is saved after highlighting ICD-10 code and keyword


In [None]:
nlp_code.remove_pipe("entity_ruler")
del icd_code_kerword_df
del page_code10_dict

In [None]:
# Step-8: loading and updating patterns to Spacy
nlp_code.add_pipe("entity_ruler").from_disk("./icd9_code_patterns-v1.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x7fc6de1db230>

In [None]:
# Step-9: Searching ICD-9 code
page_code9_dict = search_icd_code(txt_list, nlp_code, code_type="ICD-9")

# Step-10: Highlighting ICD-9 code into pdf
output_file_name = highlight_icd_code_and_keyword(page_code9_dict, page_keyword_dict, output_file_name[0], code_type="ICD-9")
print(f"File[{output_file_name}] is saved after highlighting ICD-9 code")

File[('9929_final_output_output_output.pdf', '9929_final_output_output_cords.txt')] is saved after highlighting ICD-9 code


In [None]:
!tar -czvf text_files.tar.gz txt-files/

##Highlight Multiple Files

In [None]:
# Step-0: Load prerequisite instance
# create nlp instance
nlp_keyword = spacy.load('en_core_web_sm')

# Load icd_10_code_keywords.csv file
icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")

# loading and updating patterns for ICD-10 code
nlp_code10 = English()
nlp_code10.add_pipe("entity_ruler").from_disk("./icd10_code_patterns-v2.jsonl")

# loading and updating patterns for ICD-9 code
nlp_code9 = English()
nlp_code9.add_pipe("entity_ruler").from_disk("./icd9_code_patterns-v1.jsonl")

# Creating ICD-10 keyword pattern
phrase_matcher = make_icd_keyword_pattern(icd_code_kerword_df, nlp_keyword)

In [None]:
%%time

ocr_pdf_files_path = "ocr-pdf-files"
for pdf_file in os.listdir(ocr_pdf_files_path):
  pdf_file_name = f"{ocr_pdf_files_path}/{pdf_file}"
  # Step-1: splitting pdf file
  pdf_list = split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = extract_text_from_pdf(pdf_list)
  print(txt_list[:5])

  # Step-3: Searching ICD-10 code
  page_code_dict = search_icd_code(txt_list, nlp_code10, code_type="ICD-10")

  # Step-4: Searching ICD-10 keyword
  page_keyword_dict = search_icd_keyword(txt_list, phrase_matcher, nlp_keyword)

  # Step-7: Highlighting ICD-10 code and keyword into pdf
  pdf_output_file, txt_output_file = highlight_icd_code_and_keyword(page_code_dict, page_keyword_dict, pdf_file_name)
  print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code and keyword")
  print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

  # Step-8: Searching ICD-9 code
  page_code9_dict = search_icd_code(txt_list, nlp_code9, code_type="ICD-9")

  # Step-9: Highlighting ICD-9 code into pdf
  output_file_name = highlight_icd_code_and_keyword(page_code9_dict, page_keyword_dict, pdf_output_file, code_type="ICD-9")
  print(f"File[{output_file_name}] is saved after highlighting ICD-9 code")

  # remove all pdf and text files
  purge("pdf-files/*.pdf")
  purge("txt-files/*.txt")
  pdf_list = []
  txt_list = []

In [None]:
!rm -rf ocr-pdf-files

In [None]:
!mkdir -p ocr-pdf-files

In [None]:
purge("ocr-pdf-files/*.txt")
purge("ocr-pdf-files/*_output.pdf")
purge("pdf-files/*.pdf")
purge("txt-files/*.txt")

In [None]:
!zip output.zip ocr-pdf-files/*_output_cords.txt ocr-pdf-files/*_output_output.pdf

In [None]:
!zip output_txt_pdf_files.zip pdf-files/* txt-files/*

##Text Extraction

In [None]:
def split_pdf(pdf_path, pdf_file_name):
  pdf_in_file = open(pdf_path, "rb")
  pdf = PdfFileReader(pdf_in_file)
  pdf_list = []
  for page in range(pdf.numPages):
      inputpdf = PdfFileReader(pdf_in_file)
      output = PdfFileWriter()
      output.addPage(inputpdf.getPage(page))
      with open(f"{pdf_files_path}/{pdf_file_name}-{page}.pdf", "wb") as outputStream:
          output.write(outputStream)
          pdf_list.append(f"{pdf_file_name}-{page}.pdf")
  return pdf_list

def extract_text_from_pdf(pdf_list, file_name):
  txt_file_list = []
  i = 0
  for pdf_file in pdf_list:
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)
    
    # Read all the text into one string
    pdf_text = "\n\n".join(pdf)

    # write text into file
    with open(f"{txt_files_path}/{file_name}-{str(i)}.txt", "a") as f:
      f.write(pdf_text)
    txt_file_list.append(f"{txt_files_path}/{file_name}-{str(i)}.txt")
    i += 1
  return txt_file_list

In [None]:
ocr_pdf_files_path = "ocr-pdf-files"
for pdf_file in os.listdir(ocr_pdf_files_path):
  pdf_file_path = f"{ocr_pdf_files_path}/{pdf_file}"
  file_name = pdf_file.split(".")[0]
  # Step-1: splitting pdf file
  pdf_list = split_pdf(pdf_file_path, file_name)

  # Step-2: Extracting text from pdf
  txt_list = extract_text_from_pdf(pdf_list, file_name)

In [None]:
!zip output_txt_pdf_files.zip pdf-files/* txt-files/*

##Spacy

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

doc = nlp("""
Graham Greene is his favorite author. He wrote his first book when he was a hundred and fifty years old.
While writing this book, he had to fend off aliens and dinosaurs. Greene's second book might not have been written by him. 
Greene's cat in its deathbed testimony alleged that it was the original writer of the book. The fact that plot of the book revolves around 
rats conquering the world, lends credence to the idea that only a cat could have been the true writer of such an inane book.""")

matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA": "write"},{"OP": "*"},{"LEMMA": "book"}]
matcher.add("testy", [pattern])

print("----- Using Matcher -----")
for sent in doc.sents:
    if matcher(sent):
        print(sent.text)

----- Using Matcher -----
He wrote his first book when he was a hundred and fifty years old.

While writing this book, he had to fend off aliens and dinosaurs.


In [None]:
print("----- Using Dependency Matcher -----")

deppattern = [
        {"RIGHT_ID": "wrote", "RIGHT_ATTRS": {"LEMMA": "write"}},
        {"LEFT_ID": "wrote", "REL_OP": ">", "RIGHT_ID": "book", 
            "RIGHT_ATTRS": {"LEMMA": "book"}}
        ]

from spacy.matcher import DependencyMatcher

dmatcher = DependencyMatcher(nlp.vocab)

dmatcher.add("BOOK", [deppattern])

for _, (start, end) in dmatcher(doc):
    print(doc[start].sent)

In [None]:
regex = re.compile('[@_!#$%^&*()<>?/\|}{~:.,]')
True if(regex.search("Decreased white blood cell count, unspecified") == None) else False

False

In [None]:
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")

expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
for match in re.finditer(expression, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)

In [None]:
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")

expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
for match in re.finditer(expression, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)

Found match: United States
Found match: United States
Found match: U.S.
Found match: US
