<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/15_keyword_correction_and_highliting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

[PDF-to-TEXT](https://pypi.org/project/pdftotext/)

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF
!pip install textdistance

Just restart the colab environment.

In [1]:
import pandas as pd
import numpy as np
import re
import time
import os
import string

import pdb

import fitz
import cv2 
import pdftotext
from PyPDF2 import PdfFileReader, PdfFileWriter
from io import BytesIO
from PIL import Image
from matplotlib import pyplot as plt

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from collections import Counter
import textdistance

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [3]:
!mkdir pdf-files
!mkdir txt-files
!mkdir correct-txt-files

In [15]:
pdf_files_path = "pdf-files"
txt_files_path = "txt-files"
correct_txt_files_path = "correct-txt-files"

In [4]:
nlp = spacy.load('en_core_web_sm')

##Define some functions

In [5]:
def split_pdf(pdf_path):
  pdf_in_file = open(pdf_path, "rb")
  pdf = PdfFileReader(pdf_in_file)
  pdf_list = []
  for page in range(pdf.numPages):
      inputpdf = PdfFileReader(pdf_in_file)
      output = PdfFileWriter()
      output.addPage(inputpdf.getPage(page))
      with open(f"{pdf_files_path}/page-{page}.pdf", "wb") as outputStream:
          output.write(outputStream)
          pdf_list.append(f"page-{page}.pdf")
  return pdf_list

In [6]:
def extract_text_from_pdf(pdf_list):
  txt_file_list = []
  i = 0
  for pdf_file in pdf_list:
    with open(os.path.join(pdf_files_path, pdf_file), "rb") as f:
      pdf = pdftotext.PDF(f)
    
    # Read all the text into one string
    pdf_text = "\n\n".join(pdf)

    # write text into file
    with open(f"{txt_files_path}/page-{str(i)}.txt", "a") as f:
      f.write(pdf_text)
    txt_file_list.append(f"{txt_files_path}/page-{str(i)}.txt")
    i += 1
  return txt_file_list

In [9]:
def highlight_icd10_keyword(page_keyword_dict, pdf_file_name):
  pdf_file = fitz.open(pdf_file_name)
  for page_num, page in enumerate(pdf_file):
    if page_num in page_keyword_dict:
      #print(f"Page dict: {pdf_page_dict[page_num]}")
      for keyword in page_keyword_dict[page_num]:
        text_to_be_highlighted = keyword
        highlight = page.search_for(text_to_be_highlighted)
        for inst in highlight:
          highlight = page.add_highlight_annot(inst)
          highlight.update()
          highlight = page.search_for(text_to_be_highlighted)
          print(f"Page-{page_num}: ", keyword, highlight, end='\n')
  pdf_file.save(f"{pdf_file_name.split('.')[0]}_output.pdf", garbage=4, deflate=True, clean=True)

In [8]:
def make_icd_10_keyword_pattern(icd_10_keyword_df):
  patterns = []
  for _, row in icd_10_keyword_df.iterrows():
    patterns.append(row["Keyword"])
  return patterns

In [None]:
split_str = "Decreased white blood cell count, unspecified".split(",")
split_str

['Decreased white blood cell count', ' unspecified']

In [None]:
for ks in split_str:
  print(ks.strip().lower())

decreased white blood cell count
unspecified


In [None]:
split_str[0]

'Enterocolitis due to Clostridium difficile not specified as recurrent'

In [None]:
exclude_keywords_list = ["pain", "activity", "sister", "stress", "counseling", "injury", "wife", "male", "female", "low", "high", "medium",
							"other", "never", "current", "none", "running", "to pain", "right", "left", "complete", "active",
							"face", "neck", "and neck", "stable", "primary", "shoulder", "etc.", "x", "head", "multiple", "45", "46", "stage 1", "grade 1",
              "slight", "adult", "jaw", "hip", "right hip", "left hip", "thigh", "hand", "left arm", "right arm", "right foot", "left foot",
              "type 2", "mild", "severe"
						]
exclude_keywords_list.append(list(nlp.Defaults.stop_words))

def make_icd_10_keyword_pattern(icd_10_keyword_df):
  patterns = []
  for _, row in icd_10_keyword_df.iterrows():
    split_keywords = row["Keyword"].split(",")
    for keyword in split_keywords:
      if keyword.strip().lower() not in exclude_keywords_list:
        patterns.append(keyword.strip())
  return patterns

##Data preprocessing

In [None]:
columns = []
with open("icd_10_keywords.txt", "r") as f:
  txt_lines = f.readlines()
  for line in txt_lines:
    columns.append(line.strip("\n"))
print(columns[:10])

['Cholera due to Vibrio cholerae 01, biovar cholerae', 'Cholera due to Vibrio cholerae 01, biovar eltor', 'Cholera, unspecified', 'Typhoid fever, unspecified', 'Typhoid meningitis', 'Typhoid fever with heart involvement', 'Typhoid pneumonia', 'Typhoid arthritis', 'Typhoid osteomyelitis', 'Typhoid fever with other complications']


In [None]:
data_keyword_df = pd.DataFrame(columns, columns=["Keyword"])
data_keyword_df.head()

Unnamed: 0,Keyword
0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,"Cholera, unspecified"
3,"Typhoid fever, unspecified"
4,Typhoid meningitis


In [None]:
data_keyword_df.to_csv("icd_10_keywords.csv", index=False)

In [None]:
data_code_df = pd.read_csv("icd_10_codes.csv")
data_code_df.head()

Unnamed: 0,ICD-10
0,A00.0
1,A00.1
2,A00.9
3,A01.00
4,A01.01


In [None]:
data_code_df["ICD-10"].head()

0     A00.0
1     A00.1
2     A00.9
3    A01.00
4    A01.01
Name: ICD-10, dtype: object

In [None]:
data_keyword_df["Keyword"].head()

0    Cholera due to Vibrio cholerae 01, biovar chol...
1      Cholera due to Vibrio cholerae 01, biovar eltor
2                                 Cholera, unspecified
3                           Typhoid fever, unspecified
4                                   Typhoid meningitis
Name: Keyword, dtype: object

In [None]:
icd_code_kerword_df = pd.DataFrame().assign(Code=data_code_df['ICD-10'], Keyword=data_keyword_df['Keyword'])
icd_code_kerword_df.head()

Unnamed: 0,Code,Keyword
0,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,A00.9,"Cholera, unspecified"
3,A01.00,"Typhoid fever, unspecified"
4,A01.01,Typhoid meningitis


In [None]:
icd_code_kerword_df.to_csv("icd_10_code_keywords.csv", index=False)

In [None]:
icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")
icd_code_kerword_df.head()

Unnamed: 0,Code,Keyword
0,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,A00.9,"Cholera, unspecified"
3,A01.00,"Typhoid fever, unspecified"
4,A01.01,Typhoid meningitis


##Keyword tokenization

In [24]:
def tokenize_keyword(text_file):
  stop_words = set(stopwords.words("english"))
  with open(text_file, "r") as f:
    txt_lines = f.readlines()
    keyword_list = [line.strip("\n") for line in txt_lines]
    keyword_text = " ".join(keyword_list)
    keyword_raw_tokens = word_tokenize(keyword_text)
    keyword_tokens = [token for token in keyword_raw_tokens if token.lower() not in stop_words # remove stop-words
                      and re.sub(r"\b[0-9]+\b\s*", "", token)                                  # remove numeric value
                      and token.isalpha()                                                      # remove alpha numeric value
                      and not token.isdigit()]                                                 # remove digit value
  return set(list(keyword_tokens))

In [31]:
keyword_tokens = tokenize_keyword("icd_10_keywords.txt")
print(list(keyword_tokens)[:10])

['Adverse', 'cochlear', 'Biological', 'Subcorneal', 'Webbed', 'Complication', 'rubella', 'blunt', 'Chromosomal', 'leaving']


In [14]:
"cholera" in keyword_tokens

False

**Keyword Autocorrection**

In [32]:
# vocabulary
vocab = set(keyword_tokens)
print(f"The first 10 words in our dictionary are: \n{list(keyword_tokens)[:10]}")
print(f"The dictionary has {len(vocab)} words.")

The first 10 words in our dictionary are: 
['Adverse', 'cochlear', 'Biological', 'Subcorneal', 'Webbed', 'Complication', 'rubella', 'blunt', 'Chromosomal', 'leaving']
The dictionary has 8234 words.


In [33]:
# Let's calculate the frequency of those words
word_freq_dict = {}
word_freq_dict = Counter(keyword_tokens)
print(word_freq_dict.most_common()[0:10])

[('Adverse', 1), ('cochlear', 1), ('Biological', 1), ('Subcorneal', 1), ('Webbed', 1), ('Complication', 1), ('rubella', 1), ('blunt', 1), ('Chromosomal', 1), ('leaving', 1)]


In [34]:
# get the probability of occurrence of each word 
probs = {}
total = sum(word_freq_dict.values())
for k in word_freq_dict.keys():
  probs[k] = word_freq_dict[k] / total

In [10]:
def get_autocorrect(word):
  #input_word = word.lower()
  if word not in vocab:
    similarities = [1 - (textdistance.Levenshtein(qval=1).distance(v, word)) for v in word_freq_dict.keys()]
    df = pd.DataFrame.from_dict(probs, orient="index").reset_index()
    df = df.rename(columns={"index": "word", 0: "Prob"})
    df["Similarity"] = similarities
    output = df.sort_values(["Similarity", "Prob"], ascending=False).head(1)

    pred_word = output["word"].iloc[0] 
    print(pred_word)
    print(f"Found match > {pred_word}")

In [11]:
"Migraine" in vocab

True

In [12]:
get_autocorrect("Megraine")

Migraine
Found match > Migraine


In [46]:
def clean_and_tokenize(text):
  stop_words = set(stopwords.words("english"))
  #with open(text_file, "r") as f:
  #txt_lines = text.readlines()
  keyword_list = [text.strip("\n") for line in text.split()]
  print(keyword_list)
  keyword_text = " ".join(keyword_list)
  keyword_raw_tokens = word_tokenize(keyword_text)
  keyword_tokens = [token.lower() for token in keyword_raw_tokens if token.lower() not in stop_words # remove stop-words
                    and not re.sub(r"\b[0-9]+\b\s*", "", token)                                  # remove numeric value
                    and not token.isalpha()                                                      # remove alpha numeric value
                    and not token.isdigit()]                                                 # remove digit value
  return set(list(keyword_tokens))

In [101]:
def clean_string(text, stem="None"):
  final_string = ""

  # Make lower
  #text = text.lower()

  # Remove line breaks
  text = re.sub(r'\n', '', text)

  # Remove puncuation
  translator = str.maketrans('', '', string.punctuation)
  text = text.translate(translator)

  # Remove stop words
  text = text.split()
  useless_words = nltk.corpus.stopwords.words("english")
  useless_words = useless_words + ['hi', 'im']

  text_filtered = [word for word in text if not word in useless_words]

  # Remove numbers
  text_filtered = [re.sub(r'\w*\d\w*', '', w) for w in text_filtered]

  # Stem or Lemmatize
  if stem == 'Stem':
      stemmer = PorterStemmer() 
      text_stemmed = [stemmer.stem(y) for y in text_filtered]
  elif stem == 'Lem':
      lem = WordNetLemmatizer()
      text_stemmed = [lem.lemmatize(y) for y in text_filtered]
  elif stem == 'Spacy':
      text_filtered = nlp(' '.join(text_filtered))
      text_stemmed = [y.lemma_ for y in text_filtered]
  else:
      text_stemmed = text_filtered

  final_string = ' '.join(text_stemmed)

  return final_string

In [57]:
with open("txt-files/page-1.txt", "r") as f:
  wrong_text = f.read()
my_text = clean_string(wrong_text, "Spacy")
my_text

'male   history relative status age medical history onset cod note father live   insamnia p    mother live   insamnia p    retinal disease p    megraine p    cataract p    maternal stroke p    grandfatherapsextract ® right reservedthis document contain proprietary confidential personal datum intend view use authorize user document may print distribute unauthorized purposewhatsoever copyright c   right reserve innodata synodex llc'

In [None]:
text_one  = "Your word seems to be correct".replace("word", "words")
text_one

'Your words seems to be correct'

In [None]:
text_one  = "Your word seems to be correct"
word_list = ["words", "seem"]
for i, word in enumerate(["word", "seems"]):
  text_one  = text_one.replace(word, word_list[i])
text_one

'Your words seem to be correct'

In [19]:
def replace_content(dict_replace, target):
  """Based on dict, replaces key with the value on the target."""
  for check, replacer in list(dict_replace.items()):
    target = re.sub(check, replacer, target)
    # target = target.replace(check, replacer)
  return target

In [None]:
for _ in range(1):
  file_open = open("foo.txt", 'r')
  file_read = file_open.read()
  file_open.close()

  new_file_open = open("bar.txt", 'w')


  def replace_content(dict_replace, target):
      """Based on dict, replaces key with the value on the target."""

      for check, replacer in list(dict_replace.items()):
          target = re.sub(check, replacer, target)
          # target = target.replace(check, replacer)

      return target

  # check : replacer
  dict_replace = {
      'Insamnia': 'Insomnia',
      'Megraine': 'Migraine',
      'dolor': '$$$$$'
  }

  new_content = replace_content(dict_replace, file_read)
  new_file_open.write(new_content)
  new_file_open.close()

  # Test
  print(file_read)
  # Lorem ipsum dolor sit amet, lorem ipsum dolor sit amet

  print(new_content)
  # Lorem XXXXXXX $$$$$ sit *********** lorem XXXXXXX $$$$$ sit amet

In [98]:
def get_autocorrect(cleaned_text):
  wrong_correct_word_dict = {}
  correct_list = []
  wrong_list = []

  for word in cleaned_text.split():
    if word not in vocab and len(word) > 1:
      try:
        similarities = [1 - (textdistance.Levenshtein(qval=1).distance(v, word)) for v in word_freq_dict.keys()]
        df = pd.DataFrame.from_dict(probs, orient="index").reset_index()
        df = df.rename(columns={"index": "word", 0: "Prob"})
        df["Similarity"] = similarities
        output = df.sort_values(["Similarity", "Prob"], ascending=False).head()
        # get most similarity score word
        pred_word = output["word"].iloc[0] 
      except Exception  as err:
        print(err)
      if pred_word in vocab:
        wrong_correct_word_dict.update({word: pred_word})
        correct_list.append(pred_word)
        wrong_list.append(word)
  return wrong_correct_word_dict, correct_list, wrong_list

def replace_content(dict_replace, target):
  """Based on dict, replaces key with the value on the target."""
  for check, replacer in list(dict_replace.items()):
    print(check, ">>", replacer)
    target = re.sub(check, replacer, target)
    # target = target.replace(check, replacer)
  return target

def autocorrect_keyword(txt_list):
  auto_correct_page_dict = {}
  correct_txt_file_list = []

  for i, txt_file_name in enumerate(txt_list):
    file_open = open(txt_file_name, "r")
    wrong_text = file_open.read()
    file_open.close()
    cleaned_text = clean_string(wrong_text, "Spacy")

    new_file_name = f"{correct_txt_files_path}/page-{str(i)}.txt"
    new_file_open = open(new_file_name, "w")
    
    # get auto correct word
    wrong_correct_word_dict, correct_list, wrong_list = get_autocorrect(cleaned_text)

    wrong_correct_word_dict2 = {
      'Insamnia': 'Insomnia',
      'Megraine': 'Migraine'
    }
    
    print(wrong_correct_word_dict)
    new_content = replace_content(wrong_correct_word_dict, wrong_text)
    print(new_content)

    # write corrected text into directory
    new_file_open.write(new_content)
    new_file_open.close()
    #with open(f"{correct_txt_files_path}/page-{str(i)}.txt", "a") as out_f:
    #  out_f.write(correct_text)

    # update correct_page_dict 
    auto_correct_page_dict.update({
      "page": i, 
      "wrong_list": wrong_list, 
      "correct_list": correct_list
    })
    correct_txt_file_list.append(new_file_name)

  return auto_correct_page_dict, correct_txt_file_list

In [84]:
"llc" in vocab

False

In [None]:
clean_text = """
male   history relative status age medical history onset cod note father live   insamnia p    mother live   insamnia p    retinal disease p    megraine p    cataract p    maternal stroke p    grandfatherapsextract ® right reservedthis document contain proprietary confidential personal datum intend view use authorize user document may print distribute unauthorized purposewhatsoever copyright c   right reserve innodata synodex llc
"""

get_autocorrect(clean_text)

In [None]:
if "Migraine" in correct_list:
  index_pos = correct_list.index("Migraine")
  print(wrong_list[index_pos])

Megraine


In [None]:
"Alcohol Use".split(" ")

['Alcohol']

In [23]:
def get_wrong_keyword_dict(page_keyword_dict, auto_correct_page_dict):
  page_wrong_keyword_dict = {}
  for i in range(len(page_keyword_dict)):
    wrong_keyword_list = []
    for keyword in page_keyword_dict[i]:
      original_keyword = keyword
      for word in keyword.split(" "):
        if word in auto_correct_page_dict["correct_list"]:
          # get wrong word w.r.t to correct one
          wrong_word = auto_correct_page_dict["wrong_list"][auto_correct_page_dict["correct_list"].index(word)]
          original_keyword = original_keyword.replace(word, wrong_word)
      wrong_keyword_list.append(original_keyword)
    page_wrong_keyword_dict[i] = wrong_keyword_list
  return page_wrong_keyword_dict

##Create Phrase matcher

In [12]:
icd_code_kerword_df = pd.read_csv("icd_10_code_keywords.csv")
keywords = make_icd_10_keyword_pattern(icd_code_kerword_df)
keywords[:10]

['Cholera due to Vibrio cholerae 01, biovar cholerae',
 'Cholera due to Vibrio cholerae 01, biovar eltor',
 'Cholera, unspecified',
 'Typhoid fever, unspecified',
 'Typhoid meningitis',
 'Typhoid fever with heart involvement',
 'Typhoid pneumonia',
 'Typhoid arthritis',
 'Typhoid osteomyelitis',
 'Typhoid fever with other complications']

In [13]:
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = list(nlp.tokenizer.pipe(keywords))
phrase_matcher.add('keywords', patterns)
#phrase_matcher.add('keywords', p1)

##Text extraction

In [16]:
# Step-1: spliting pdf file
pdf_file_name = "Redacted_Sample.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf
txt_list = extract_text_from_pdf(pdf_list)

##Keyword Autocorrection

In [102]:
auto_correct_page_dict = autocorrect_keyword(["txt-files/page-1.txt"])

{'Relative': 'relative', 'no': 'two', 'Age': 'age', 'COD': 'WMD', 'Note': 'rate', 'Father': 'Bather', 'Insamnia': 'Insomnia', 'Mother': 'mother', 'Megraine': 'Migraine', 'Cataracts': 'Cataract', 'grandfatherapsextract': 'therapeutic', 'all': 'ball', 'Rights': 'Right', 'ReservedThis': 'mesenteritis', 'document': 'documented', 'contain': 'mountain', 'proprietary': 'parietal', 'confidential': 'incidental', 'datum': 'latum', 'intend': 'intent', 'view': 'diet', 'authorize': 'authority', 'user': 'used', 'this': 'chin', 'may': 'Day', 'print': 'Wrist', 'distribute': 'vestibule', 'unauthorized': 'motorized', 'purposewhatsoever': 'purposes', 'Copyright': 'right', 'reserve': 'desire', 'Innodata': 'nodosa', 'Synodex': 'nodes', 'LLC': 'AL'}
Relative >> relative
no >> two
Age >> age
COD >> WMD
Note >> rate
Father >> Bather
Insamnia >> Insomnia
Mother >> mother
Megraine >> Migraine
Cataracts >> Cataract
grandfatherapsextract >> therapeutic
all >> ball
Rights >> Right
ReservedThis >> mesenteritis
docu

In [59]:
auto_correct_page_dict

({'page': 0,
  'wrong_list': ['cod',
   'note',
   'insamnia',
   'insamnia',
   'megraine',
   'grandfatherapsextract',
   'reservedthis',
   'document',
   'contain',
   'proprietary',
   'confidential',
   'datum',
   'intend',
   'view',
   'authorize',
   'user',
   'document',
   'may',
   'print',
   'distribute',
   'unauthorized',
   'purposewhatsoever',
   'copyright',
   'reserve',
   'innodata',
   'synodex',
   'llc'],
  'correct_list': ['cow',
   'none',
   'insomnia',
   'insomnia',
   'migraine',
   'therapeutic',
   'detergents',
   'documented',
   'mountain',
   'parietal',
   'incidental',
   'latum',
   'intent',
   'diet',
   'authority',
   'used',
   'documented',
   'Day',
   'Wrist',
   'vestibule',
   'motorized',
   'purposes',
   'right',
   'desire',
   'innocent',
   'nodes',
   'leg']},
 ['correct-txt-files/page-0.txt'])

In [124]:
with open("my.txt", "r") as f:
  wrong_text = f.read()
#my_text = clean_string(wrong_text, "Spacy")

my_text1 = ' '.join([word_dict[idx] if idx in word_dict else idx for idx in wrong_text.split()])
#my_text1 = replace_all(my_text, word_dict)
my_text1

', Male, SF0063198200 Family History Relative No. Status Age Medical History Onset COD Note Father living 66 Insomnia P 3, L30 Mother living 69 Insomnia P 3, L29 Retinal Disease P 3, L29 Migraine P 52, L30 Cataracts P 87, L13 maternal stroke P 52, L31 grandfather APS.Extract® All Rights Reserved This document contains proprietary, confidential and personal data and is intended for viewing and use only by authorized users. This document pay not be printed or distributed for any unauthorized purpose whatsoever. Copyright (c) 2020. All rights reserved. Innodata Synodex, LLC'

In [118]:
my_text = """
, Male,                                                                                                                                                                              SF0063198200
Family History
      Relative           No.         Status          Age                    Medical History                      Onset        COD                              Note
 Father                         living            66                                                                                      Insamnia                                                 P 3, L30
 Mother                         living            69                                                                                      Insamnia                                                 P 3, L29
                                                                                                                                          Retinal Disease                                          P 3, L29
                                                                                                                                          Megraine                                               P 52, L30
                                                                                                                                          Cataracts                                              P 87, L13
 maternal                                                    stroke                                                                                                                              P 52, L31
 grandfather
APS.Extract® All Rights Reserved
This document contains proprietary, confidential and personal data and is intended for viewing and use only by authorized users. This document pay not be printed or distributed for any unauthorized purpose
whatsoever. Copyright (c) 2020. All rights reserved. Innodata Synodex, LLC
"""
word_dict = {"Insamnia": "Insomnia", "Megraine": "Migraine"}
p_word = "Insomnia"

my_text1 = ' '.join([word_dict[idx] if idx in word_dict else idx for idx in wrong_text.split()])
#my_text1 = replace_all(my_text, word_dict)
my_text1

', Male, SF0063198200 Family History Relative No. Status Age Medical History Onset COD Note Father living 66 Insomnia P 3, L30 Mother living 69 Insomnia P 3, L29 Retinal Disease P 3, L29 Migraine P 52, L30 Cataracts P 87, L13 maternal stroke P 52, L31 grandfather APS.Extract® All Rights Reserved This document contains proprietary, confidential and personal data and is intended for viewing and use only by authorized users. This document pay not be printed or distributed for any unauthorized purpose whatsoever. Copyright (c) 2020. All rights reserved. Innodata Synodex, LLC'

##ICD-10 code searching

In [None]:
%%time

page_keyword_dict = {}
# Step-4: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    page_txt = f.read()
    # filter the page that have line number instead of code
    #if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
    doc = nlp(page_txt)
    matches = phrase_matcher(doc)

    keyword_list = []
    for match_id, start, end in matches:
      span = doc[start: end]
      keyword_list.append(f"{span}")

    if len(keyword_list) != 0:
      page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
      page_keyword_dict[page_number] = keyword_list
      print(f"Page[{txt_file.split('/')[1]}]: {keyword_list}")

Page[page-0.txt]: ['Alcohol Use', 'alcohol use', 'Anxiety Disorder', 'Aphasia', 'Cerebral Infarction', 'Cerebrovascular Disease', 'Headache', 'Insomnia', 'Spleen', 'Migraine', 'Palpitations']
Page[page-1.txt]: ['Insomnia', 'Insomnia', 'Migraine']
Page[page-2.txt]: ['Lipomatosis', 'Chronic', 'Spleen', 'Unspecified', 'Unspecified', 'Chronic', 'chronic', 'chronic', 'Moderate']
Page[page-3.txt]: ['depression', 'Radiculopathy', 'Low back pain', 'Multiple sites', 'Sciatica', 'Palpitations', 'Palpitations', 'palpitations', 'Palpitations', 'Palpitations', 'Palpitations', 'Incomplete', 'Palpitations', 'Incomplete']
Page[page-4.txt]: ['Surgical Procedure', 'Surgical Procedure', 'Surgical Procedure', 'Surgical Procedure', 'Surgical Procedure', 'Surgical Procedure', 'Aphasia', 'Migraine', 'Wrestling', 'left upper extremity', 'wrestling', 'Constipation', 'Surgical Procedure', 'Surgical Procedure', 'Alcohol Use', 'Alcohol use', 'Surgical Procedure', 'Surgical Procedure', 'Lifting', 'Insomnia', 'Over

##ICD-10 code highlighting

In [None]:
%%time

# Step-4: Highlighting ICD-10 code into pdf
highlight_icd10_keyword(page_keyword_dict, pdf_file_name)

Page-0:  Alcohol Use [Rect(314.5531005859375, 263.89727783203125, 357.23309326171875, 274.8892822265625), Rect(408.955322265625, 263.89727783203125, 449.4193115234375, 274.8892822265625)]
Page-0:  Alcohol Use [Rect(314.5531005859375, 263.89727783203125, 357.23309326171875, 274.8892822265625), Rect(408.955322265625, 263.89727783203125, 449.4193115234375, 274.8892822265625)]
Page-0:  alcohol use [Rect(314.5531005859375, 263.89727783203125, 357.23309326171875, 274.8892822265625), Rect(408.955322265625, 263.89727783203125, 449.4193115234375, 274.8892822265625)]
Page-0:  alcohol use [Rect(314.5531005859375, 263.89727783203125, 357.23309326171875, 274.8892822265625), Rect(408.955322265625, 263.89727783203125, 449.4193115234375, 274.8892822265625)]
Page-0:  Anxiety Disorder [Rect(31.346466064453125, 397.4663391113281, 90.02647399902344, 408.4583435058594)]
Page-0:  Aphasia [Rect(31.346466064453125, 409.7332458496094, 60.250465393066406, 420.7252502441406)]
Page-0:  Cerebral Infarction [Rect(3

In [None]:
True if re.search("[0-9],L", "3, L14") else False

False

In [None]:
True if re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", "P 13, L17") else False

True

In [None]:
pattern = re.compile("(P[ ][0-9]+)(,\s)(L[0-9]+)")

for i, line in enumerate(open("txt-files/pdf-page-3.txt")):
  for match in re.finditer(pattern, line):
    print('Found on line %s: %s' % (i+1, match.group()))

Found on line 2: P 60, L23
Found on line 3: P 61, L14
Found on line 4: P 56, L19
Found on line 7: P 54, L19
Found on line 8: P 49, L5
Found on line 9: P 39, L6
Found on line 10: P 37, L9
Found on line 11: P 35, L14
Found on line 12: P 27, L24
Found on line 13: P 21, L23
Found on line 15: P 11, L20
Found on line 16: P 5, L39
Found on line 18: P 3, L24
Found on line 22: P 77, L38
Found on line 27: P 86, L19
Found on line 29: P 66, L19
Found on line 30: P 29, L19
Found on line 31: P 21, L16
Found on line 32: P 12, L18
Found on line 33: P 3, L22
Found on line 34: P 3, L22
Found on line 38: P 3, L17
Found on line 41: P 79, L31


In [None]:
# Step-5: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    #one_txt = f.read()
    doc = nlp(f.read())
    code_list = [ent.text for ent in doc.ents]
    if len(code_list) != 0:
      print(f"Page[{txt_file.split('/')[1]}]: {code_list}")
    #print([f"Page[{txt_file.split('/')[1]}]: {(ent.text, ent.label_)}" for ent in doc.ents if ent is not None])

Page[pdf-page-2.txt]: ['L14']
Page[pdf-page-3.txt]: ['L14', 'L14', 'L22', 'L22']
Page[pdf-page-4.txt]: ['L26', 'L26', 'L26', 'L26', 'L26', 'L22', 'L14', 'L22', 'L14']
Page[pdf-page-6.txt]: ['L14', 'L14']
Page[pdf-page-7.txt]: ['L14', 'L14', 'L14', 'L14']
Page[pdf-page-8.txt]: ['L26', 'L26', 'L26']
Page[pdf-page-9.txt]: ['L26', 'L26', 'L22', 'L26', 'L22', 'L26', 'L22', 'L22', 'L22']
Page[pdf-page-10.txt]: ['L22', 'L26', 'L14']
Page[pdf-page-11.txt]: ['L22', 'L22', 'L22']
Page[pdf-page-12.txt]: ['L14']
Page[pdf-page-18.txt]: ['M54.40']
Page[pdf-page-19.txt]: ['G43.109']
Page[pdf-page-26.txt]: ['D17.1', 'F43.9']
Page[pdf-page-37.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30', 'E88.2']
Page[pdf-page-39.txt]: ['G43.109', 'D72.819', 'R00.2', 'R10.30']
Page[pdf-page-42.txt]: ['G43.109', 'R00.2', 'G43.109', 'D72.819', 'R00.2']
Page[pdf-page-74.txt]: ['M54.2', 'R42']
Page[pdf-page-82.txt]: ['G43.109', 'R00.2', 'L25.9', 'D72.819']
Page[pdf-page-84.txt]: ['G43.109']
Page[pdf-page-85.txt]: ['L25.9

##Tessrect text extraction

In [None]:
%%time

# Step-1: spliting pdf file
pdf_file_name = "Redacted_Sample_2.pdf"
pdf_list = split_pdf(pdf_file_name)

# Step-2: converting pdf to image
img_list = pdf_to_image(pdf_list)

# Step-4: Extracting text from pdf
txt_list = extract_text_from_image(img_list)

# Step-3: loading and updating patterns to Spacy
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns.jsonl")

In [None]:
# Step-4: Extracting text from pdf
txt_list = extract_text_from_image(img_list)

# Step-3: loading and updating patterns to Spacy
nlp.add_pipe("entity_ruler").from_disk("./icd10_code_patterns.jsonl")

<spacy.pipeline.entityruler.EntityRuler at 0x7f4ac6e88af0>

In [None]:
%%time

pdf_page_dict = {}
# Step-4: Searching ICD-10 code
for txt_file in txt_list:
  with open(txt_file, "r") as f:
    page_txt = f.read()
    # filter the page that have line number instead of code
    if not re.search("(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt):
      doc = nlp(page_txt)
      code_list = [ent.text for ent in doc.ents]
      if len(code_list) != 0:
        page_number = int(txt_file.split("/")[1].split(".")[0].split("-")[1])
        pdf_page_dict[page_number] = code_list
        print(f"Page[{txt_file.split('/')[1]}]: {code_list}")

Page[page-17.txt]: ['R64']
Page[page-18.txt]: ['D17.1']
Page[page-19.txt]: ['G43.109']
Page[page-24.txt]: ['M50.30']
Page[page-26.txt]: ['F43.9']
Page[page-37.txt]: ['G43.109', 'D72.819', 'R10.30', 'E88.2']
Page[page-39.txt]: ['G43.109', 'D72.819', 'R10.30']
Page[page-42.txt]: ['G43.109', 'D72.819']
Page[page-74.txt]: ['M50.323', 'M46.92', 'R42']
Page[page-82.txt]: ['G43.109', 'L25.9', 'D72.819']
Page[page-85.txt]: ['R00.2', 'L25.9', 'D72.819']
Page[page-102.txt]: ['G43.109', 'L25.9']
Page[page-105.txt]: ['G43.109', 'L25.9']
CPU times: user 676 ms, sys: 15.5 ms, total: 691 ms
Wall time: 694 ms


In [None]:
%%time

# Step-4: Highlighting ICD-10 code into pdf
highlight_icd10_code(pdf_page_dict, pdf_file_name)

Page dict: ['R64']
Page dict: ['D17.1']
Page dict: ['G43.109']
Page-19:  G43.109 [Rect(227.2899932861328, 145.4748077392578, 256.4548034667969, 156.68040466308594)]
Page dict: ['M50.30']
Page dict: ['F43.9']
Page-26:  F43.9 [Rect(267.1199951171875, 294.4976806640625, 288.10455322265625, 306.6370849609375)]
Page dict: ['G43.109', 'D72.819', 'R10.30', 'E88.2']
Page-37:  D72.819 [Rect(70.31879425048828, 230.57130432128906, 100.93679809570312, 241.70071411132812)]
Page-37:  R10.30 [Rect(70.31879425048828, 248.33460998535156, 96.43319702148438, 259.4640197753906)]
Page-37:  E88.2 [Rect(70.31879425048828, 257.4552307128906, 91.48410034179688, 268.5846252441406)]
Page dict: ['G43.109', 'D72.819', 'R10.30']
Page-39:  G43.109 [Rect(68.39816284179688, 293.9032287597656, 98.70294952392578, 304.7579650878906)]
Page-39:  D72.819 [Rect(69.1170654296875, 311.899658203125, 98.97944641113281, 322.75439453125)]
Page-39:  R10.30 [Rect(69.1170654296875, 329.643310546875, 94.58699035644531, 340.498046875)]

In [None]:
with open(f"{txt_files_path}/page-37.txt", "r") as f:
  one_txt = f.read()
  print(one_txt)

, Male,

SF0063198200

Patient Name (005 - Phone Number:

7

4of6

Office/Outpatient Visit

Visit Date: Mon, Jun 24, 2019 9:00 am

Provider: Josephs, Barry, MD (Assistant: Coppage, Jasmine, )

Location: Barry Josephs, M.D. LLG

Electronically signed by Barry Josephs, MD on 06/24/2019 02:46:23 PM

Printed on 06/24/2019 at 3:21 pm.

Assessment:

¥70.0 Preventive Physical Exam (Mild)

434.91 Cerebrovascular accident (Severe)

G43.109 Migraine with aura, not intractable, without status migrainosus

722.91

Cervical disc disorder (Mild)

D72.819 Decreased white blood cell count, unspecified

ROO.2 Palpitations

R10.30 Lower abdominal pain, unspecified

E88.2 Lipomatosis, not elsewhere classified

Plan:

Preventive Physical Exam

PREVENTION & RECOMMENDATIONS:

Overall Health and Lifestyle:

Thank you for coming for the Wellness Program. It was a pleasure to update the history and physical as well as discuss

your laboratory results

Wellness anc Prevention begins with a healthy lifestyle and