<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/13_icd_10_code_highlight_with_keyword_match_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

In [None]:
!pip install fuzzywuzzy

In [20]:
import pandas as pd
import re
import os
import sys
import glob
import difflib
import pickle
from pathlib import Path
from difflib import SequenceMatcher

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfReader, PdfFileWriter, PdfWriter

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk
from nltk.tokenize import sent_tokenize
from string import punctuation
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
nltk.download('punkt')

In [2]:
!mkdir -p input_files

## Core Classes

In [3]:
class Highlighter:
  """Find ICD-10 codes in a PDF and highlight them, colour-coded by keyword match.

  Pipeline (each step feeds the next):
    1. ``split_pdf``             - write every page as its own PDF file.
    2. ``extract_text_from_pdf`` - dump the text of every page PDF to a text file.
    3. ``search_icd_code``       - run the ICD-10 entity ruler over every page text.
    4. ``highlight_icd_code``    - highlight the found codes in the source PDF and
                                   log one line per code to a coordinates file.

  Relies on the module-level helpers ``create_directory``,
  ``get_similarity_score`` and ``reverse_code_pattern``.
  """

  # Matches references of the form "P <digits>, L<digits>"; a page containing one
  # is treated as an index page whose "L<digits>" entities are not codes.
  _INDEX_PAGE_PATTERN = re.compile(r"(P[ ][0-9]+)(,\s)(L[0-9]+)")
  _LINE_REFERENCE_PATTERN = re.compile(r"(L[0-9]+)")

  def __init__(self, code_df):
    """Build the ICD-10 matcher and create the working directories.

    Args:
      code_df: table with ``Code`` and ``Keyword`` columns (see ``get_keyword``).
    """
    # loading and updating patterns for ICD-10 code
    self.nlp_code10 = English()
    self.nlp_code10.add_pipe("entity_ruler").from_disk("icd10_code_patterns-v5.jsonl")

    # define icd-10 code dataset
    self.code_df = code_df
    # per-page text file paths; filled by extract_text_from_pdf()
    self.text_list = None

    # define required directory path
    self.PDF_FILES_PATH = "pdf-files"
    self.TXT_FILES_PATH = "txt-files"
    self.OUTPUT_FILES_PATH = "output"
    create_directory(self.PDF_FILES_PATH)
    create_directory(self.TXT_FILES_PATH)
    create_directory(self.OUTPUT_FILES_PATH)

  def split_pdf(self, pdf_path):
    """Write each page of ``pdf_path`` to ``PDF_FILES_PATH/page-<n>.pdf``.

    Returns:
      The list of page file names (without directory), in page order.
    """
    pdf_list = []
    # the reader pulls page data from the stream on demand, so every page must
    # be written while the input file is still open
    with open(pdf_path, "rb") as pdf_in_file:
      reader = PdfReader(pdf_in_file)
      for page_index, pdf_page in enumerate(reader.pages):
        writer = PdfWriter()
        writer.add_page(pdf_page)
        page_name = f"page-{page_index}.pdf"
        with open(f"{self.PDF_FILES_PATH}/{page_name}", "wb") as output_stream:
          writer.write(output_stream)
        pdf_list.append(page_name)
    return pdf_list

  def extract_text_from_pdf(self, pdf_list):
    """Extract the text of every page PDF into ``TXT_FILES_PATH/page-<n>.txt``.

    Args:
      pdf_list: page file names as returned by ``split_pdf``.

    Returns:
      The list of text file paths; it is also stored in ``self.text_list``.
    """
    txt_file_list = []
    for index, pdf_file in enumerate(pdf_list):
      with open(os.path.join(self.PDF_FILES_PATH, pdf_file), "rb") as f:
        pdf = pdftotext.PDF(f)

      # Read all the text into one string
      pdf_text = "\n\n".join(pdf)

      # "w" rather than "a": a leftover file from an earlier document must be
      # replaced, not extended, or its codes would leak into this page
      txt_path = f"{self.TXT_FILES_PATH}/page-{index}.txt"
      with open(txt_path, "w") as f:
        f.write(pdf_text)
      txt_file_list.append(txt_path)
    self.text_list = txt_file_list
    return txt_file_list

  def highlight_icd_code(self, icd10_code_dict, pdf_file_name=None, cords_file_name=None):
    """Highlight the given codes in the PDF and save the annotated copy.

    Args:
      icd10_code_dict: mapping of zero-based page number -> list of code strings.
      pdf_file_name: path of the PDF to annotate.
      cords_file_name: name of the log file created (appended to) inside
        ``OUTPUT_FILES_PATH``.

    Returns:
      ``(path of the annotated PDF, cords_file_name)``.
    """
    pdf_file = fitz.open(pdf_file_name)
    try:
      with open(f"{self.OUTPUT_FILES_PATH}/{cords_file_name}", "a") as cords_file:
        for page_num, page in enumerate(pdf_file):
          if page_num not in icd10_code_dict:
            continue
          for code in icd10_code_dict[page_num]:
            hits = page.search_for(code)
            if len(hits) > 0:
              # highlight pdf for main pattern
              self._highlight_hits(page, page_num, hits, code, cords_file)
              continue
            # main pattern not found: try the spaced variants, and report each
            # under the variant that actually matched
            for alt_code in self.get_opt_pattern(code):
              alt_hits = page.search_for(alt_code)
              self._highlight_hits(page, page_num, alt_hits, alt_code, cords_file)

      # basename keeps this working for paths with no or several directories
      base_name = os.path.basename(pdf_file_name).split(".")[0]
      pdf_output_file_name = f"{self.OUTPUT_FILES_PATH}/{base_name}_output.pdf"
      pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)
    finally:
      pdf_file.close()

    return pdf_output_file_name, cords_file_name

  def _highlight_hits(self, page, page_num, hits, icd10_code, cords_file):
    """Highlight every rectangle in ``hits`` and log one line for the code.

    The highlight is light green when the code's keyword is similar enough to
    some phrase of the page text (score > 0.10), light red when it is not, and
    keeps the default colour when the code has no keyword in the dataset.
    """
    if not hits:
      return

    # keyword and score depend only on the code and the page, not on the hit
    keyword = self.get_keyword(icd10_code)
    score = 0.0
    if len(keyword) > 0:
      score = get_similarity_score(keyword, self.text_list[page_num])

    for rect in hits:
      annot = page.add_highlight_annot(rect)
      if len(keyword) > 0:
        if score > 0.10:
          annot.set_colors(stroke=[0.66, 1, 0.07])  # light green
        else:
          annot.set_colors(stroke=[1, 0.8, 0.8])  # light red
      annot.update()

    code_cors_output = f"Page-{page_num + 1} | {icd10_code} | {reverse_code_pattern(icd10_code)} | {keyword if keyword else 'Not available'} | {round(score, 3)}  | \n"
    cords_file.write("%s\n" % code_cors_output)

  def get_opt_pattern(self, icd_10_code):
    """Return alternate spellings of a code to search for.

    A dotted code ``A.B`` yields ``["A. B", "A .B", "A . B"]``. A code without a
    dot has no variants and is returned as a one-element list, so callers can
    always iterate over whole patterns.
    """
    code_arr = icd_10_code.split(".")
    if len(code_arr) > 1:
      head, tail = code_arr[0], code_arr[1]
      return [f"{head}. {tail}", f"{head} .{tail}", f"{head} . {tail}"]
    return [icd_10_code]

  def search_icd_code(self, txt_list):
    """Run the ICD-10 entity ruler over every page text file.

    Args:
      txt_list: text file paths named ``.../page-<n>.txt``.

    Returns:
      Mapping of page number ``<n>`` -> de-duplicated codes in first-seen
      order. Pages without any code are omitted.
    """
    pdf_page_vocab = {}
    for txt_file in txt_list:
      with open(txt_file, "r") as f:
        page_txt = f.read()

      # check the page that have line number instead of code
      index_page = self._INDEX_PAGE_PATTERN.search(page_txt) is not None

      doc = self.nlp_code10(page_txt)
      code_list = []
      for ent in doc.ents:
        # on an index page, entities containing "L<digits>" are skipped
        if index_page and self._LINE_REFERENCE_PATTERN.search(ent.text):
          continue
        code_list.append(ent.text)

      if code_list:
        page_number = int(os.path.basename(txt_file).split(".")[0].split("-")[1])
        # dict.fromkeys de-duplicates while keeping a stable order
        pdf_page_vocab[page_number] = list(dict.fromkeys(code_list))
    return pdf_page_vocab

  def get_keyword(self, p_code):
    """Return the first dataset keyword for ``p_code``, or ``""`` if none.

    The code is normalised with ``reverse_code_pattern`` before the lookup.
    """
    keyword = ""
    # reverse code if required
    code = reverse_code_pattern(p_code)
    # get keyword from dataset
    keyword_list = list(self.code_df.loc[self.code_df["Code"] == code]["Keyword"])
    if len(keyword_list) > 0:
      keyword = keyword_list[0]
    return keyword

def get_similarity_score(keyword, text_file):
  """Return the best similarity between ``keyword`` and any phrase of a text file.

  The file is split into lines and each line into comma-separated phrases;
  every non-empty phrase is compared with ``keyword`` using
  ``SequenceMatcher.ratio()``.

  Args:
    keyword: the text to look for.
    text_file: path of the text file to scan.

  Returns:
    The highest ratio found (0.0 to 1.0), or 0.0 when the file holds no phrase.
  """
  # load text file
  with open(text_file, "r") as f:
    my_text = f.read()

  # prepare key phrase
  key_phrase_list = []
  for text_line in my_text.split("\n"):
    key_phrase_list.extend(phrase for phrase in text_line.split(",") if len(phrase) > 0)

  # default=0.0 keeps an empty page from raising ValueError in max()
  return max(
    (SequenceMatcher(None, k_phrase, keyword).ratio() for k_phrase in key_phrase_list),
    default=0.0,
  )

def reverse_code_pattern(p_code):
  """Normalise an extracted ICD-10 code string to the dotted ``head.tail`` form.

  Handles, in this order:
    * a space used as separator       ("G47 33"  -> "G47.33")
    * stray spaces around the dot     ("G47. 33" -> "G47.33")
    * a comma used instead of the dot ("G47,39"  -> "G47.39"; with a doubled
      comma the empty middle part is dropped: "G47,,39" -> "G47.39")
    * look-alike characters: a digit in the first position is replaced by its
      look-alike letter, and a look-alike letter in the second (then third)
      position is replaced by its digit ("RO6.3" -> "R06.3").

  Args:
    p_code: the code text as found on the page.

  Returns:
    The normalised code string.
  """
  orig_code = p_code

  # check for code contains space(" ")
  tmp_code = orig_code.split(" ")
  if len(tmp_code) > 1:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

  # check for code contains dot("."); works on the raw input so a code such as
  # "G47. 33" overrides the result of the space split above
  tmp_code = p_code.split(".")
  if len(tmp_code) > 1:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

  # check for code contains comma(",")
  tmp_code = p_code.split(",")
  if len(tmp_code) == 2:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"
  elif len(tmp_code) == 3:
    # three parts: join the first and last, skipping the middle one
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[2].strip()}"

  # look-alike letter -> digit pairs used to repair misread characters
  alphabats = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6", "o": "9", "i": "1"}
  for key, val in alphabats.items():
    # replace char on 0 index: the code must start with a letter
    if orig_code.find(val) == 0:
      orig_code = replacer(orig_code, key, 0)
    # replace char on 1 index: the second character must be a digit
    if orig_code.find(key) == 1:
      orig_code = replacer(orig_code, val, 1)
      # replace char on 2 index
      if orig_code.find(key) == 2:
        orig_code = replacer(orig_code, val, 2)
      # only the first letter found at index 1 is repaired
      break

  return orig_code

def replacer(s, newstring, index, nofail=False):
  """Return ``s`` with the character at ``index`` replaced by ``newstring``.

  Args:
    s: the source string.
    newstring: text put in place of the character at ``index``.
    index: position of the character to replace.
    nofail: when False (default) an index outside ``s`` raises; when True a
      negative index prepends ``newstring`` and an index past the end appends it.

  Raises:
    ValueError: if ``nofail`` is False and ``index`` is outside ``s``.
  """
  if not nofail:
    if index not in range(len(s)):
      raise ValueError("index outside given string")

  # only reachable with nofail=True: clamp to either end of the string
  if index < 0:
    return newstring + s
  if index > len(s):
    return s + newstring

  # splice: everything before the index, the replacement, everything after it
  head = s[:index]
  tail = s[index + 1:]
  return head + newstring + tail

def create_directory(dir_name):
  """Create ``dir_name`` (including parents) unless the path already exists."""
  if os.path.exists(dir_name):
    return
  os.makedirs(dir_name)

## Class-based Searching & Highlighting

In [None]:
def purge(file_path):
  """Delete every file whose path matches the glob pattern ``file_path``."""
  matched_paths = glob.glob(file_path)
  for matched_path in matched_paths:
    os.remove(matched_path)

In [None]:
# Step-0: Define prerequisite instance
INPUT_PDF_FILES_PATH = "input_pdf_files_path"

# Highlighter.__init__ requires the ICD-10 code/keyword table; calling it
# without an argument raises TypeError.
code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")
highlighter = Highlighter(code_df)

In [None]:
%%time

# Run the full split -> extract -> search -> highlight pipeline for every PDF
# in the input directory.
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  # the directory can hold non-PDF entries (e.g. notebook checkpoint folders)
  # that the PDF readers cannot open
  if not pdf_file.lower().endswith(".pdf"):
    continue

  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"
  # derived from the bare file name so it does not depend on the directory depth
  cords_file_name = f"{pdf_file.split('.')[0]}_cords.txt"

  # Step-1: splitting pdf file
  pdf_list = highlighter.split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = highlighter.extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  icd10_code_dict = highlighter.search_icd_code(txt_list)

  # Step-4: Highlighting ICD-10 code into pdf
  pdf_output_file, txt_output_file = highlighter.highlight_icd_code(icd10_code_dict,
                                                                    pdf_file_name=pdf_file_name,
                                                                    cords_file_name=cords_file_name)
  print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
  print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

  # remove the per-page pdf and text files so the next document starts clean
  purge(f"{highlighter.PDF_FILES_PATH}/*.pdf")
  purge(f"{highlighter.TXT_FILES_PATH}/*.txt")

In [None]:
!rm -rf input_pdf_files_path
!mkdir -p input_pdf_files_path
!rm -rf output

In [None]:
!rm -rf output

In [None]:
!zip output.zip output/*.*

## Keyword Matching & Highlighting

- Step 1 - Z87.5
- Step 2 - Personal history of complications of pregnancy, childbirth and the puerperium
- Step 3 - Page keyword
- Step 4 - Calculate cosine similarity
- Step 5 - "Green" > 60% otherwise "Yellow"

In [4]:
def purge(file_path):
  """Remove all files matched by the glob pattern ``file_path``."""
  targets = glob.glob(file_path)
  for target in targets:
    os.remove(target)

In [5]:
# Step-0: create highlighter instance
INPUT_PDF_FILES_PATH = "input_files"
code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")

highlighter = Highlighter(code_df)

In [None]:
%%time

# Run the full split -> extract -> search -> highlight pipeline for every PDF
# in the input directory.
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  # the directory can hold non-PDF entries (e.g. notebook checkpoint folders)
  # that the PDF readers cannot open
  if not pdf_file.lower().endswith(".pdf"):
    continue

  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"
  # derived from the bare file name so it does not depend on the directory depth
  cords_file_name = f"{pdf_file.split('.')[0]}_cords.txt"

  # Step-1: splitting pdf file
  pdf_list = highlighter.split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = highlighter.extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  icd10_code_dict = highlighter.search_icd_code(txt_list)

  # Step-4: Highlighting ICD-10 code into pdf
  pdf_output_file, txt_output_file = highlighter.highlight_icd_code(icd10_code_dict,
                                                                    pdf_file_name=pdf_file_name,
                                                                    cords_file_name=cords_file_name)
  print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
  print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

  # remove the per-page pdf and text files so the next document starts clean
  purge(f"{highlighter.PDF_FILES_PATH}/*.pdf")
  purge(f"{highlighter.TXT_FILES_PATH}/*.txt")

In [None]:
!rm -rf input_files
!mkdir -p input_files
!rm -rf output
!mkdir -p output

In [None]:
!zip output.zip output/*.*

In [6]:
# Step-1: spliting pdf file
pdf_file_name = "01_final.pdf"
pdf_list = highlighter.split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf
txt_list = highlighter.extract_text_from_pdf(pdf_list)

In [7]:
# Step-3: Searching ICD-10 code
page_code10_dict = highlighter.search_icd_code(txt_list)

### Clean Code

In [8]:
page_code10_dict[7]

['R06.83', 'G47.34', 'RO6.3', 'G47.8', 'G47.33', 'G47.10', 'G47.31', 'G47,39']

In [8]:
clean_icd10_code = [reverse_code_pattern(code) for code in page_code10_dict[7]]
clean_icd10_code

['G47.33', 'G47.10', 'G47.34', 'G47.39', 'R06.83', 'G47.8', 'R06.3', 'G47.31']

### Fetch Keyword

In [9]:
code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")

In [15]:
keyword_df = code_df.loc[code_df["Code"] == "R06.83"]
keyword_df

Unnamed: 0,Code,Keyword
29843,R06.83,Snoring


In [17]:
list(code_df.loc[code_df["Code"] == "R06.83"]["Keyword"])

['Snoring']

In [10]:
def get_keyword(p_code):
  """Look up the keyword of an ICD-10 code in the global ``code_df`` table.

  The code is first normalised with ``reverse_code_pattern``. Returns the
  keyword of the first matching row, or ``""`` when no row matches.
  """
  normalized_code = reverse_code_pattern(p_code)
  matching_rows = code_df.loc[code_df["Code"] == normalized_code]
  keyword_list = list(matching_rows["Keyword"])
  if len(keyword_list) > 0:
    return keyword_list[0]
  return ""

In [11]:
get_keyword("R06.83")

'Snoring'

### Find keyword match

In [21]:
def get_best_match(text_file, keyword):
  """Print the line of ``text_file`` that is most similar to ``keyword``.

  Lines are ranked with ``fuzz.ratio``; the printed line shows the 1-based line
  number, the keyword, the ``fuzz.partial_ratio`` of the winner and its text.
  Returns nothing.
  """
  with open(text_file) as handle:
    lines = [raw_line.rstrip('\n') for raw_line in handle]

    scores = [fuzz.ratio(keyword, candidate) for candidate in lines]
    top_score = max(scores)
    best_match = lines[scores.index(top_score)]
    line_number = lines.index(best_match) + 1
    partial_score = fuzz.partial_ratio(keyword, best_match)
    print(f"{line_number}:{keyword}, {partial_score}: {best_match}")

In [66]:
with open(f"{txt_list[7]}", "r") as f:
  lines = [line.rstrip('\n') for line in f]
  lines = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
lines

['Redacted person nine, Male, 48 SF0102456023',
 '_',
 'S (R| S] slslels(elalele/s|slslel™i[~lrrrl"tel"l>',
 '2/=|e]™',
 'Reb/2 1/2022 10:18:05 AM North Texas Neuroscience Cente 4603725317 THO',
 ']',
 '[ ~]',
 '_= Sunil Mathews, M.D., P.A.',
 '. e ’ . . ;',
 'NeTessc',
 'BI“|[8l/slalslalalez|/e/a]sla]e,"=',
 'NerthyTaxas Neurmacienee ane sinap Gantmr, BA Diplomate in American Board of Psychiatry and Neurolagy (ABPN)',
 'Diplomate in ABPN « Clinical Neurophysiology (EMG/EEG)',
 'Diplomate in ABPN - Sleep Medicine',
 'PHYSICIAN’S INTERPRETATION OF SLEEP STUDY',
 "pT . Chart Number: 0073 . Height: 5'10”",
 's',
 'Sex: Male ] Weight: 238 Ibs',
 'Referring Physiclan: Sunil Mathews, MD Epworth Score: 8 Neck Size: 16”',
 'Clinical Reason for Study: Suspected Sleep Apnea | Study Type: WatchPAT HST Study Date: 12/02/2021',
 'Methods/Technique: The standards of the American Acadenty of Sleep. medicine were followéd in monitoring the patient. The. Home Sleep Test',
 'included.rerording.of periph

In [None]:
def get_similarity_score(keyword, text_file):
  """Return the best similarity score and the phrase that produced it.

  The file is split into lines and each line into comma-separated phrases;
  every non-empty phrase is compared with ``keyword`` using
  ``SequenceMatcher.ratio()``.

  NOTE: this definition replaces the earlier ``get_similarity_score`` that
  returns a bare float. ``Highlighter`` compares the result with a float, so
  re-run the cell that defines the class helpers before highlighting again.

  Args:
    keyword: the text to look for.
    text_file: path of the text file to scan.

  Returns:
    ``(max_score, most_similar_phrase)``; ``(0.0, "")`` when the file holds no
    phrase. On ties the first phrase with the top score wins.
  """
  # load text file
  with open(text_file, "r") as f:
    my_text = f.read()

  # prepare key phrase
  key_phrase_list = []
  for text_line in my_text.split("\n"):
    key_phrase_list.extend(phrase for phrase in text_line.split(",") if len(phrase) > 0)

  # nothing to compare against: max() would raise ValueError on an empty list
  if not key_phrase_list:
    return 0.0, ""

  # get max similarity score
  score_list = [SequenceMatcher(None, k_phrase, keyword).ratio() for k_phrase in key_phrase_list]
  max_score = max(score_list)
  # get most similar phrase (first one reaching the max score)
  most_similar_phrase = key_phrase_list[score_list.index(max_score)]
  return max_score, most_similar_phrase

In [None]:
get_similarity_score("Snoring", txt_list[7])

(0.3333333333333333, ' octigraphy')

In [None]:
# Print the similarity of every comma-separated chunk of page 7 to "Snoring"
# and flag the chunks scoring above 0.70.
with open(f"{txt_list[7]}", "r") as f:
  my_text = f.read()
for keyword in my_text.split(","):
  seq = SequenceMatcher(None, keyword, "Snoring")
  rounded_ratio = round(seq.ratio(), 3)
  print(f"{rounded_ratio} : {keyword}")
  if rounded_ratio > .70:
    print(f"Max ratio found: {rounded_ratio} : {keyword}")

## Text clean up

In [32]:
with open(f"{txt_list[7]}", "r") as f:
  my_text = f.read()

In [26]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(my_text)
sentences = [sentence.text for sentence in doc.sents]

In [None]:
sentences

In [27]:
def text_preprocess(text_file):
  """Split text into sentences and reduce each to its alphabetic words.

  Args:
    text_file: the raw text to clean. Despite the name this is the text
      itself, not a path; the name is kept for existing callers.

  Returns:
    A list with one cleaned string per sentence; sentences that end up empty
    are dropped.
  """
  sentence_list = []
  stopwords = ["is", "a"]
  # use the argument: the previous version silently read the global `my_text`
  doc = nlp(text_file)
  sentences = [sentence.text for sentence in doc.sents]
  for sent in sentences:
    clean_text = " ".join(sent.split())  # Remove extra spaces, tabs, and line breaks
    clean_text = re.sub(f"[{re.escape(punctuation)}]", "", clean_text)  # Remove punctuation
    clean_text = re.sub(r"\b[0-9]+\b\s*", "", clean_text)  # Remove numbers
    clean_text = " ".join([w for w in clean_text.split() if not w.isdigit()])  # Remove digit-only tokens
    clean_text = " ".join([w for w in clean_text.split() if w.isalpha()])  # Keep purely alphabetic tokens
    clean_text = re.sub(r"[^A-Za-z0-9\s]+", "", clean_text)  # Remove remaining non-ASCII / special characters
    # Remove stopwords and short tokens (3 characters or fewer)
    clean_tokens = [t for t in clean_text.split() if t not in stopwords and len(t) > 3]
    clean_text = " ".join(clean_tokens)
    # Collapse a character repeated four or more times into one
    clean_text = re.sub(r'(.)\1{3,}', r'\1', clean_text)
    if len(clean_text) > 0:
      sentence_list.append(clean_text)
  return sentence_list

In [33]:
sentence_list = text_preprocess(my_text)
sentence_list

['Redacted person nine Male North Texas Neuroscience Cente',
 'Sunil Mathews',
 'NeTessc NerthyTaxas Neurmacienee sinap Gantmr Diplomate American Board Psychiatry Neurolagy ABPN Diplomate ABPN Clinical Neurophysiology EMGEEG',
 'Diplomate ABPN Sleep Medicine INTERPRETATION SLEEP STUDY',
 'Chart Number',
 'Height Male Weight Referring Physiclan Sunil Mathews Epworth Score Neck Size Clinical Reason Study Suspected Sleep Apnea Study Type WatchPAT Study Date MethodsTechnique standards American Acadenty Sleep',
 'medicine were followd monitoring patient',
 'Home Sleep Test includedrerordingof peripheral arterial tone oxygen saturation heart rata octigraphy body position snoring',
 'RBland were caleuleted from this data',
 'Tatal slecp awake time NREM sleap were assassad',
 'Minimal oxygen saturation Moderate Obstructive Sleep Apnea',
 'Severe Related Respiratory Events Hypoxemia Loud Disruptive Snoring glalsslelelselelslelelssleles Obesity Reduced Sleep Ffficiancy minutes hurts',
 'Excessiv

In [19]:
with open(f"{txt_list[7]}", "r") as f:
  lines = [line.rstrip('\n') for line in f]
  sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
sentence_list

['Redacted person nine, Male, 48 SF0102456023',
 '_',
 'S (R| S] slslels(elalele/s|slslel™i[~lrrrl"tel"l>',
 '2/=|e]™',
 'Reb/2 1/2022 10:18:05 AM North Texas Neuroscience Cente 4603725317 THO',
 ']',
 '[ ~]',
 '_= Sunil Mathews, M.D., P.A.',
 '. e ’ . . ;',
 'NeTessc',
 'BI“|[8l/slalslalalez|/e/a]sla]e,"=',
 'NerthyTaxas Neurmacienee ane sinap Gantmr, BA Diplomate in American Board of Psychiatry and Neurolagy (ABPN)',
 'Diplomate in ABPN « Clinical Neurophysiology (EMG/EEG)',
 'Diplomate in ABPN - Sleep Medicine',
 'PHYSICIAN’S INTERPRETATION OF SLEEP STUDY',
 "pT . Chart Number: 0073 . Height: 5'10”",
 's',
 'Sex: Male ] Weight: 238 Ibs',
 'Referring Physiclan: Sunil Mathews, MD Epworth Score: 8 Neck Size: 16”',
 'Clinical Reason for Study: Suspected Sleep Apnea | Study Type: WatchPAT HST Study Date: 12/02/2021',
 'Methods/Technique: The standards of the American Acadenty of Sleep. medicine were followéd in monitoring the patient. The. Home Sleep Test',
 'included.rerording.of periph

In [13]:
def get_best_match(sentence_list, keyword):
  """Print the best fuzzy match for ``keyword`` together with its neighbours.

  Two reports are printed: one ranked by ``fuzz.ratio`` and one ranked by
  ``fuzz.partial_ratio``. Each shows the best-scoring sentence plus the
  sentence before and after it. A neighbour that does not exist (the best
  match is the first or last sentence) is reported as ``N/A`` instead of
  wrapping around to the other end of the list or raising ``IndexError``.

  Args:
    sentence_list: the candidate sentences.
    keyword: the text to match.
  """
  def report(scorer, label_prefix):
    # Print best / before / after lines for one scoring function.
    if not sentence_list:
      print(f"{label_prefix}Best match: N/A | <none>")
      print()
      return
    scores = [scorer(keyword, sentence) for sentence in sentence_list]
    best_index = scores.index(max(scores))
    print(f"{label_prefix}Best match: {scores[best_index]} | {sentence_list[best_index]}")
    for label, index in (("Before", best_index - 1), ("After", best_index + 1)):
      if 0 <= index < len(sentence_list):
        print(f"{label_prefix}{label} Match: {scores[index]} | {sentence_list[index]}")
      else:
        print(f"{label_prefix}{label} Match: N/A | <none>")
    print()

  print("#"*10)
  print(f"Matching for : {keyword}")
  print()
  report(fuzz.ratio, "")
  report(fuzz.partial_ratio, "Partial ")

In [14]:
keyword_list = [get_keyword(code) for code in clean_icd10_code]
keyword_list

['Obstructive sleep apnea (adult) (pediatric)',
 'Hypersomnia, unspecified',
 'Idio sleep related nonobstructive alveolar hypoventilation',
 'Other sleep apnea',
 'Snoring',
 'Other sleep disorders',
 'Periodic breathing',
 'Primary central sleep apnea']

In [17]:
[get_best_match(sentence_list, keyword) for keyword in keyword_list]

##########
Matching for : Obstructive sleep apnea (adult) (pediatric)

Best match: 60 | obstructive sleep apnea (G47.33) (]Steep-related hypoventilation (G47.34)
Before Match: 4 | DIAGNOSIS:
After Match: 23 | ClPrimary snoring (R06.83) $Sieap eclated breathing disorder (G47,39)

Partial Best match: 100 | s
Partial Before Match: 19 | pT . Chart Number: 0073 . Height: 5'10”
Partial After Match: 30 | Sex: Male ] Weight: 238 Ibs

##########
Matching for : Hypersomnia, unspecified

Best match: 54 | (central sleep apnea (G47.31) CoHypersomnia, unspecified (G47.10)
Before Match: 18 | Cl Upper airway resistance (G47.8) fPerladic Limb. Movement in Sleep (G47,64)
After Match: 15 | [Cheyne-Stokes respiration (RO6.3) [JOther: Click hare to enter text,

Partial Best match: 100 | s
Partial Before Match: 19 | pT . Chart Number: 0073 . Height: 5'10”
Partial After Match: 12 | Sex: Male ] Weight: 238 Ibs

##########
Matching for : Idio sleep related nonobstructive alveolar hypoventilation

Best match: 4

[None, None, None, None, None, None, None, None]

In [36]:
def get_best_match(sentence_list, keyword_list):
  """Print the three best ``fuzz.ratio`` matches in ``sentence_list`` per keyword.

  For every keyword a header is printed, followed by one
  ``<sentence> | <score>`` line per match and a blank line.
  """
  separator = "#"*10
  for keyword in keyword_list:
    print(separator)
    print(f"Matching for : {keyword}")
    top_matches = process.extract(keyword, sentence_list, scorer = fuzz.ratio, limit = 3)
    for candidate in top_matches:
      # each result starts with the matched text followed by its score
      matched_text, match_score, *_ = candidate
      print(f"{matched_text} | {match_score}")
    print()

In [37]:
get_best_match(sentence_list, keyword_list)
#process.extract(query, choices, scorer = fuzz.partial_ratio, limit = 2)

##########
Matching for : Obstructive sleep apnea (adult) (pediatric)
Moderate Obstructive Sleep Apnea . | 62
obstructive sleep apnea (G47.33) (]Steep-related hypoventilation (G47.34) | 61
(central sleep apnea (G47.31) CoHypersomnia, unspecified (G47.10) | 44

##########
Matching for : Hypersomnia, unspecified
(central sleep apnea (G47.31) CoHypersomnia, unspecified (G47.10) | 55
Diplomate in ABPN - Sleep Medicine | 45
Hypoxemia | 42

##########
Matching for : Idio sleep related nonobstructive alveolar hypoventilation
Moderate Obstructive Sleep Apnea . | 49
obstructive sleep apnea (G47.33) (]Steep-related hypoventilation (G47.34) | 49
BI“|[8l/slalslalalez|/e/a]sla]e,"= | 34

##########
Matching for : Other sleep apnea
Moderate Obstructive Sleep Apnea . | 61
Diplomate in ABPN - Sleep Medicine | 47
s Excessive Daytime Sleepiness | 43

##########
Matching for : Snoring
Loud Disruptive Snoring | 47
DIAGNOSIS: | 38
[SIS[RBlS | 27

##########
Matching for : Other sleep disorders
Diplomate in

## Fuzzy

In [None]:
!pip install fuzzywuzzy

In [None]:
from fuzzywuzzy import fuzz

In [None]:
str1 = 'This document printed distributed unauthorized purpose whatsoever'
str2 = 'Other disorders of amniotic fluid and membranes'

ratio = fuzz.ratio(str1, str2)
partial_ratio = fuzz.partial_ratio(str1, str2)

print(ratio)
print(partial_ratio)

30
28


In [None]:
for keyword in sentence_list:
  ratio = fuzz.partial_ratio(keyword, "Snoring")
  print(f"{round(ratio, 3)} : {keyword}")

43 : Redacted person nine Male North Texas Neuroscience Cente
43 : Sunil Mathews
43 : NeTessc NerthyTaxas Neurmacienee sinap Gantmr Diplomate American Board Psychiatry Neurolagy ABPN Diplomate ABPN Clinical Neurophysiology EMGEEG
29 : Diplomate ABPN Sleep Medicine INTERPRETATION SLEEP STUDY
14 : Chart Number
29 : Height Male Weight Referring Physiclan Sunil Mathews Epworth Score Neck Size Clinical Reason Study Suspected Sleep Apnea Study Type WatchPAT Study Date MethodsTechnique standards American Acadenty Sleep
71 : medicine were followd monitoring patient
86 : Home Sleep Test includedrerordingof peripheral arterial tone oxygen saturation heart rata octigraphy body position snoring
29 : RBland were caleuleted from this data
14 : Tatal slecp awake time NREM sleap were assassad
29 : Minimal oxygen saturation Moderate Obstructive Sleep Apnea
100 : Severe Related Respiratory Events Hypoxemia Loud Disruptive Snoring glalsslelelselelslelelssleles Obesity Reduced Sleep Ffficiancy minutes hur

In [None]:
def get_best_match(text_file, keyword):
  """Return the line of ``text_file`` that is most similar to ``keyword``.

  Lines are ranked with ``fuzz.ratio``. The winner's ``fuzz.partial_ratio`` and
  its text are printed before it is returned.

  Args:
    text_file: path of the text file to scan.
    keyword: the text to match.

  Raises:
    ValueError: if the file contains no lines.
  """
  with open(text_file) as f:
    names = [line.rstrip('\n') for line in f]
  if not names:
    raise ValueError(f"{text_file} contains no lines to match against")

  ratios = [fuzz.ratio(keyword, name) for name in names]
  best_match = names[ratios.index(max(ratios))]
  # score against the `keyword` argument; the previous version read an
  # undefined name here and only worked when a global of that name existed
  print(fuzz.partial_ratio(keyword, best_match))
  print(best_match)
  return best_match

In [None]:
get_best_match(txt_list[7], "Primary central sleep apnea")

34
               APS.Extract® All Rights Reserved


'               APS.Extract® All Rights Reserved'

In [None]:
# Read page 7 line by line (trailing newlines stripped).
with open(txt_list[7]) as f:
    names = [line.rstrip('\n') for line in f]

# For each ICD keyword print: all line ratios, the top ratio, the partial and
# token-set ratios of the best line, and the best line itself.
icdkeywords = ['Obstructive sleep apnea','Primary central sleep apnea']
for name_to_match in icdkeywords:
    ratios = [fuzz.ratio(name_to_match, name) for name in names]
    print(ratios)
    print(max(ratios))
    best_match = names[ratios.index(max(ratios))]
    print(fuzz.partial_ratio(name_to_match, best_match))
    print(fuzz.token_set_ratio(name_to_match, best_match))
    print(best_match)

[3, 0, 19, 1, 9, 0, 0, 3, 3, 4, 4, 13, 6, 11, 3, 3, 1, 2, 7, 15, 10, 1, 15, 1, 9, 23, 1, 3, 6, 5, 3, 12, 1, 7, 13, 4, 15, 12, 2, 0, 8, 11, 19, 13, 8, 19, 5, 12, 7, 17, 2, 22, 10, 13, 15, 10, 5, 8, 4, 5, 5, 4, 2, 3, 20, 1, 20]
23
91
100
                                                                                                    Moderate Obstructive Sleep Apnea                       .
[3, 0, 21, 1, 7, 0, 0, 5, 4, 4, 3, 9, 10, 13, 5, 6, 2, 6, 11, 14, 15, 2, 19, 2, 14, 16, 2, 5, 10, 6, 3, 4, 1, 4, 13, 6, 11, 11, 6, 1, 7, 7, 13, 14, 7, 17, 13, 15, 9, 17, 6, 15, 15, 14, 20, 11, 7, 9, 3, 7, 6, 4, 2, 5, 27, 4, 22]
27
34
34
               APS.Extract® All Rights Reserved


In [None]:
# Remove extra spaces, tabs, and line breaks
clean_text = " ".join(my_text.split())
clean_text

In [None]:
# Remove punctuation
clean_text = re.sub(f"[{re.escape(punctuation)}]", "", clean_text)
clean_text

In [None]:
# Remove numbers
clean_text = re.sub(r"\b[0-9]+\b\s*", "", clean_text)
clean_text

In [None]:
# Remove digits
clean_text = " ".join([w for w in clean_text.split() if not w.isdigit()]) # Side effect: removes extra spaces
clean_text

In [None]:
# Remove non-alphabetic characters
clean_text = " ".join([w for w in clean_text.split() if w.isalpha()]) # Side effect: removes extra spaces
clean_text

In [None]:
# Remove all special characters and punctuation
clean_text = re.sub(r"[^A-Za-z0-9\s]+", "", clean_text)
clean_text

In [None]:
# Remove stopwords from a list
stopwords = ["is", "a"]
tokens = clean_text.split()
clean_tokens = [t for t in tokens if not t in stopwords]
clean_text = " ".join(clean_tokens)

In [None]:
clean_text

In [None]:
# Remove short tokens
tokens = clean_text.split()
clean_tokens = [t for t in tokens if len(t) > 1]
clean_text = " ".join(clean_tokens)
clean_text

In [None]:
# Remove repeated characters
clean_text = re.sub(r'(.)\1{3,}',r'\1', clean_text)
clean_text