<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/13_icd_10_code_highlight_with_keyword_match_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

!pip install pillow

!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install -U pdftotext
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF

In [None]:
!pip install fuzzywuzzy

In [1]:
import pandas as pd
import re
import os
import sys
import glob
import difflib
import pickle
from pathlib import Path
from difflib import SequenceMatcher

import fitz
import pdftotext
from PyPDF2 import PdfFileReader, PdfReader, PdfFileWriter, PdfWriter

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk
from nltk.tokenize import sent_tokenize
from string import punctuation
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [None]:
nltk.download('punkt')

In [2]:
!mkdir -p input_files

## Core Classes

In [2]:
class Highlighter:
  """Find ICD-10 codes in a PDF and highlight the ones backed by a matching description.

  The notebook drives it as ``split_pdf`` -> ``extract_text_from_pdf`` ->
  ``search_icd_code`` -> ``highlight_icd_code``.
  """

  def __init__(self, code_df):
    """Load the ICD-10 entity patterns and create the working directories.

    Args:
      code_df: DataFrame with a "Code" column and a "Keyword" column; the
        keyword is the description looked up for each code.
    """
    # blank English pipeline whose only component is an entity ruler loaded
    # with the ICD-10 code patterns
    self.nlp_code10 = English()
    self.nlp_code10.add_pipe("entity_ruler").from_disk("icd10_code_patterns-v5.jsonl")

    # ICD-10 code dataset and the per-page text files of the current PDF
    self.code_df = code_df
    self.text_list = None

    # working directories
    self.PDF_FILES_PATH = "pdf-files"
    self.TXT_FILES_PATH = "txt-files"
    self.OUTPUT_FILES_PATH = "output"
    create_directory(self.PDF_FILES_PATH)
    create_directory(self.TXT_FILES_PATH)
    create_directory(self.OUTPUT_FILES_PATH)

  def split_pdf(self, pdf_path):
    """Write every page of ``pdf_path`` as ``page-<n>.pdf`` (0-based) into the PDF directory.

    Returns:
      The list of created file names (without directory), in page order.
    """
    pdf_list = []
    # the context manager closes the input file, and the PDF is parsed once
    # instead of once per page
    with open(pdf_path, "rb") as pdf_in_file:
      reader = PdfReader(pdf_in_file)
      for page_index, pdf_page in enumerate(reader.pages):
        writer = PdfWriter()
        writer.add_page(pdf_page)
        page_name = f"page-{page_index}.pdf"
        with open(f"{self.PDF_FILES_PATH}/{page_name}", "wb") as output_stream:
          writer.write(output_stream)
        pdf_list.append(page_name)
    return pdf_list

  def extract_text_from_pdf(self, pdf_list):
    """Extract the text of each single-page PDF into ``page-<n>.txt``.

    Args:
      pdf_list: file names inside the PDF directory, as returned by ``split_pdf``.

    Returns:
      The list of text file paths; it is also stored in ``self.text_list``.
    """
    txt_file_list = []
    for page_index, pdf_file in enumerate(pdf_list):
      with open(os.path.join(self.PDF_FILES_PATH, pdf_file), "rb") as f:
        pdf = pdftotext.PDF(f)

      # Read all the text into one string
      pdf_text = "\n\n".join(pdf)

      # "w" instead of "a": a leftover file from a previous document must be
      # replaced, not extended with the new page text
      txt_path = f"{self.TXT_FILES_PATH}/page-{page_index}.txt"
      with open(txt_path, "w") as f:
        f.write(pdf_text)
      txt_file_list.append(txt_path)
    self.text_list = txt_file_list
    return txt_file_list

  def highlight_icd_code(self, icd10_code_dict, w_ratio=False, match_threshold=30, coordinate=False, pdf_file_name=None, cords_file_name=None):
    """Highlight the found ICD-10 codes in the PDF and write a match report.

    Args:
      icd10_code_dict: mapping of 0-based page number to the codes found on it.
      w_ratio: score with ``fuzz.WRatio`` when True, else ``fuzz.token_set_ratio``.
      match_threshold: minimum score a page line must reach for a code to be highlighted.
      coordinate: when True, the highlighted rectangles are added to each report row.
      pdf_file_name: path of the PDF to annotate.
      cords_file_name: name of the report file created in the output directory.

    Returns:
      ``(path of the highlighted PDF, cords_file_name)``.
    """
    pdf_file = fitz.open(pdf_file_name)
    try:
      with open(f"{self.OUTPUT_FILES_PATH}/{cords_file_name}", "a") as report:
        # header line; the trailing newline keeps the first row off the header
        report.write("| Page | Found Code | Actual ICD10-Code | Code Line # | ICD 10 description | Matched Line| Matched Line # | confidence | \n")

        for page_num, page in enumerate(pdf_file):
          if page_num not in icd10_code_dict:
            continue
          for code in icd10_code_dict[page_num]:
            instances = page.search_for(code)
            if instances:
              self._highlight_code_on_page(page, page_num, instances, code, report, w_ratio, match_threshold, coordinate)
              continue
            # The code as extracted is not on the page: try the spaced variants.
            # Each variant is handled exactly once, so its hits are not
            # annotated and logged a second time under the main code.
            for alt_code in self.get_opt_pattern(code):
              self._highlight_code_on_page(page, page_num, page.search_for(alt_code), alt_code, report, w_ratio, match_threshold, coordinate)

      # basename() keeps this working for paths with any number of directories
      pdf_output_file_name = f"{self.OUTPUT_FILES_PATH}/{os.path.basename(pdf_file_name).split('.')[0]}_output.pdf"
      pdf_file.save(pdf_output_file_name, garbage=4, deflate=True, clean=True)
    finally:
      pdf_file.close()

    return pdf_output_file_name, cords_file_name

  def _highlight_code_on_page(self, page, page_num, instances, icd10_code, report, w_ratio, match_threshold, coordinate):
    """Highlight all hits of one code on one page and log its matching lines.

    Nothing is highlighted or written when the code has no hit, no keyword in
    the dataset, or no page line scoring at least ``match_threshold``.
    """
    if not instances:
      return
    keyword = self.get_keyword(icd10_code)
    if len(keyword) == 0:
      return
    # keyword and scores depend only on the code and the page, so they are
    # computed once instead of once per hit
    match_list, sentence_list, line_list = self.get_best_token_match(icd10_code, page_num, w_ratio, match_threshold)
    if not match_list or match_list[0][1] < match_threshold:
      return

    for inst in instances:
      annot = page.add_highlight_annot(inst)
      annot.set_colors(stroke=[0.66, 1, 0.07])  # light green
      annot.update()
    # one rectangle per highlighted hit (previously the whole hit list was
    # repeated once per hit)
    cords_list = list(instances)

    num_page = page_num + 1  # the report uses 1-based page numbers
    for match_found in match_list:
      if match_found[1] < match_threshold:
        continue
      # | Page | Found Code | Actual ICD10-Code | Code Line # | ICD 10 description | Matched Line| Matched Line # | confidence |
      code_cors_output = f"|Page-{num_page} | {icd10_code} | {reverse_code_pattern(icd10_code)} | {line_list} | {keyword if keyword else 'Not available'} | {match_found[0]}  | {sentence_list.index(match_found[0]) + 1} | {match_found[1]} |"
      if coordinate:
        code_cors_output = f"{code_cors_output} \n {cords_list}"
      report.write("%s\n" % code_cors_output)

    report.write("\n")  # add extra line on every match code

  def get_opt_pattern(self, icd_10_code):
    """Return the spaced spellings of a dotted code to try when the exact text is not found.

    A code without a dot has no variants; it is returned as a one-element list
    so callers can always iterate over whole candidate strings.
    """
    code_arr = icd_10_code.split(".")
    if len(code_arr) > 1:
      code1 = f"{code_arr[0]}. {code_arr[1]}"
      code2 = f"{code_arr[0]} .{code_arr[1]}"
      code3 = f"{code_arr[0]} . {code_arr[1]}"
      return [code1, code2, code3]
    return [icd_10_code]

  def search_icd_code(self, txt_list):
    """Run the ICD-10 entity ruler over each page text file.

    Args:
      txt_list: paths named ``.../page-<n>.txt``; ``<n>`` becomes the dictionary key.

    Returns:
      Mapping of page number to the de-duplicated list of codes found on that
      page; pages without codes are omitted.
    """
    pdf_page_vocab = {}
    for txt_file in txt_list:
      with open(txt_file, "r") as f:
        page_txt = f.read()

      # pages carrying "P <n>, L<n>" references hold line numbers, not codes
      index_page = re.search(r"(P[ ][0-9]+)(,\s)(L[0-9]+)", page_txt) is not None

      doc = self.nlp_code10(page_txt)
      code_list = []
      for ent in doc.ents:
        # on such pages anything shaped like "L<digits>" is treated as a line
        # reference and dropped
        if index_page and re.search(r"(L[0-9]+)", ent.text):
          continue
        code_list.append(ent.text)

      if code_list:
        page_number = int(os.path.basename(txt_file).split(".")[0].split("-")[1])
        pdf_page_vocab[page_number] = list(set(code_list))
    return pdf_page_vocab

  def get_keyword(self, p_code):
    """Return the dataset keyword for ``p_code`` after normalisation, or "" if unknown."""
    # reverse code if required
    code = reverse_code_pattern(p_code)
    # get keyword from dataset
    keyword_list = list(self.code_df.loc[self.code_df["Code"] == code]["Keyword"])
    return keyword_list[0] if len(keyword_list) > 0 else ""

  def get_best_token_match(self, p_code, num_page, w_ratio, match_threshold):
    """Score every line of a page against the keyword of ``p_code``.

    Returns:
      ``(match_list, sentence_list, line_list)``: the ``(line, score)`` pairs
      scoring above ``match_threshold`` sorted best first, all page lines, and
      the 1-based numbers of the lines containing ``p_code``.
    """
    # Step 1: reverse code pattern
    reversed_icd_code = reverse_code_pattern(p_code)
    # Step 2: fetch keyword based on code
    keyword = self.get_keyword(reversed_icd_code)
    # Step 3: prepare sentence list
    sentence_list = get_sentence_list(self.text_list, num_page)
    # Step 4: score each line once with the selected scorer
    scorer = fuzz.WRatio if w_ratio else fuzz.token_set_ratio
    match_list = []
    for sentence in sentence_list:
      score = scorer(keyword, sentence)
      if score > match_threshold:
        match_list.append((sentence, score))
    # Step 5: get sentence line
    line_list = get_sentence_line(p_code, sentence_list)
    return sort_tuple(match_list), sentence_list, line_list

  def get_match_sentence_and_line_data(self, p_code, num_page):
    """Return the three best ``fuzz.ratio`` matches for the keyword of ``p_code`` on a page.

    Returns:
      ``(match_list, sentence_list, line_list)`` with the same meaning as in
      ``get_best_token_match``, except that ``match_list`` is the result of
      ``process.extract`` limited to three entries.
    """
    # Step 1: reverse code pattern
    reversed_icd_code = reverse_code_pattern(p_code)
    # Step 2: fetch keyword based on code
    keyword = self.get_keyword(reversed_icd_code)
    # Step 3: prepare sentence list
    sentence_list = get_sentence_list(self.text_list, num_page)
    # Step 4: get best 3 match ratio
    match_list = process.extract(keyword, sentence_list, scorer = fuzz.ratio, limit = 3)
    # Step 5: get sentence line
    line_list = get_sentence_line(p_code, sentence_list)
    return match_list, sentence_list, line_list

def get_sentence_line(p_code, sentence_list):
  """Return the 1-based positions of all sentences that contain ``p_code``."""
  line_list = []
  for position, sent in enumerate(sentence_list, start=1):
    if p_code in sent:
      line_list.append(position)
  return line_list

def get_sentence_list(text_list, num_page):
  """Read the text file of page ``num_page`` and return its lines with whitespace collapsed.

  Every line of the file yields one entry (blank lines become ""); runs of
  spaces and tabs are reduced to a single space and the line break is dropped.
  """
  page_path = f"{text_list[num_page]}"
  with open(page_path, "r") as f:
    # split() without arguments also discards the trailing newline
    return [" ".join(raw_line.split()) for raw_line in f]

def reverse_code_pattern(p_code):
  """Normalise an extracted ICD-10 code to the ``<head>.<tail>`` form.

  Two clean-ups are applied:

  1. A space, dot or comma separating the two parts is replaced by a single
     dot, e.g. ``"G47. 10"`` and ``"G47,10"`` both become ``"G47.10"``. A
     comma-separated code with three pieces keeps the first and the last one.
  2. Look-alike characters are swapped using the table below: a listed digit
     in the first position becomes its letter (``"268.25"`` -> ``"Z68.25"``),
     and a listed letter whose first occurrence is in the second position
     becomes its digit.

  Args:
    p_code: code text as found on the page.

  Returns:
    The normalised code string.
  """
  orig_code = p_code

  # check for code contains space(" ")
  tmp_code = orig_code.split(" ")
  if len(tmp_code) > 1:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

  # check for code contains dot(".")
  tmp_code = p_code.split(".")
  if len(tmp_code) > 1:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"

  # check for code contains comma(",")
  tmp_code = p_code.split(",")
  if len(tmp_code) == 2:
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[1].strip()}"
  elif len(tmp_code) == 3:
    # Two commas: keep the outer pieces. This branch used to repeat the
    # "== 2" test and therefore could never run.
    orig_code = f"{tmp_code[0].strip()}.{tmp_code[2].strip()}"

  # letter -> look-alike digit
  # NOTE(review): "l" maps a leading "1" to a lower-case "l" and "o" is paired
  # with "9" - confirm both are intended.
  alphabats = {"Z": "2", "B": "8", "O": "0", "S": "5", "l": "1", "G": "6", "o": "9", "i": "1"}
  for key, val in alphabats.items():
    # digit in the first position -> letter
    if orig_code.find(val) == 0:
      orig_code = key + orig_code[1:]
    # letter first seen in the second position -> digit
    if orig_code.find(key) == 1:
      orig_code = orig_code[:1] + val + orig_code[2:]
      # same letter now first seen in the third position -> digit
      if orig_code.find(key) == 2:
        orig_code = orig_code[:2] + val + orig_code[3:]
      break

  return orig_code

def replacer(s, newstring, index, nofail=False):
  """Return ``s`` with the character at ``index`` replaced by ``newstring``.

  Args:
    s: source string.
    newstring: text put in place of the single character at ``index``.
    index: position of the character to replace.
    nofail: when False an out-of-range ``index`` raises; when True a negative
      index prepends ``newstring`` and an index past the end appends it.

  Raises:
    ValueError: if ``nofail`` is False and ``index`` is not a valid position in ``s``.
  """
  index_is_valid = index in range(len(s))
  if not (nofail or index_is_valid):
    raise ValueError("index outside given string")

  # only reachable with nofail=True: clamp to one of the two ends
  if index < 0:
    return newstring + s
  if index > len(s):
    return s + newstring

  head = s[:index]
  tail = s[index + 1:]
  return head + newstring + tail

def sort_tuple(p_tup):
  """Return a new list of the items of ``p_tup`` ordered by their second element, highest first."""
  ordered = list(p_tup)
  ordered.sort(key=lambda pair: pair[1], reverse=True)
  return ordered

def create_directory(dir_name):
  """Create ``dir_name`` (including missing parents) unless the path already exists."""
  if os.path.exists(dir_name):
    return
  os.makedirs(dir_name)

## Keyword Matching & Highlighting

- Step 1 - Z87.5
- Step 2 - Personal history of complications of pregnancy, childbirth and the puerperium
- Step 3 - Page keyword
- Step 4 - calculate cosine similarity
- Step 5 - "Green" > 60% otherwise "Yellow"

In [3]:
def purge(file_path):
  """Delete every file whose path matches the glob pattern ``file_path``."""
  matched_paths = glob.glob(file_path)
  for matched_path in matched_paths:
    os.remove(matched_path)

In [6]:
# Step-0: create highlighter instance
# INPUT_PDF_FILES_PATH is the folder scanned for the PDFs to process;
# code_df is the lookup table handed to Highlighter, which reads its "Code"
# and "Keyword" columns.
INPUT_PDF_FILES_PATH = "input_files"
code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")

highlighter = Highlighter(code_df)

In [7]:
%%time

# Run the whole pipeline for every PDF in the input folder.
for pdf_file in os.listdir(INPUT_PDF_FILES_PATH):
  # os.listdir() returns every entry of the folder; anything that is not a
  # PDF (sub-folders, stray files) would make the PDF reader fail, so skip it.
  if not pdf_file.lower().endswith(".pdf"):
    continue

  pdf_file_name = f"{INPUT_PDF_FILES_PATH}/{pdf_file}"
  # report name is derived from the file name itself, not from the position
  # of "/" in the joined path
  cords_file_name = f"{pdf_file.split('.')[0]}_cords.txt"

  # Step-1: splitting pdf file
  pdf_list = highlighter.split_pdf(pdf_file_name)

  # Step-2: Extracting text from pdf
  txt_list = highlighter.extract_text_from_pdf(pdf_list)

  # Step-3: Searching ICD-10 code
  icd10_code_dict = highlighter.search_icd_code(txt_list)

  # Step-4: Highlighting ICD-10 code into pdf
  pdf_output_file, txt_output_file = highlighter.highlight_icd_code(icd10_code_dict,
                                                                    w_ratio=True,
                                                                    match_threshold=40,
                                                                    coordinate=False,
                                                                    pdf_file_name=pdf_file_name,
                                                                    cords_file_name=cords_file_name)
  print(f"File[{pdf_output_file}] is saved after highlighting ICD-10 code")
  print(f"Highlighted coordinates are saved into [{txt_output_file}] file.")

  # remove all pdf and text files so pages of this document cannot leak into
  # the next one
  purge("pdf-files/*.pdf")
  purge("txt-files/*.txt")
  pdf_list = []
  txt_list = []

File[output/APS_38600000R_final_output.pdf] is saved after highlighting ICD-10 code
Highlighted coordinates are saved into [APS_38600000R_final_cords.txt] file.
File[output/01_final_output.pdf] is saved after highlighting ICD-10 code
Highlighted coordinates are saved into [01_final_cords.txt] file.
CPU times: user 1min 42s, sys: 1.2 s, total: 1min 43s
Wall time: 1min 45s


In [4]:
!rm -rf input_files
!mkdir -p input_files
!rm -rf output
!mkdir -p output

In [5]:
purge("pdf-files/*.pdf")
purge("txt-files/*.txt")

In [8]:
!zip output.zip output/*.*

  adding: output/01_final_cords.txt (deflated 86%)
  adding: output/01_final_output.pdf (deflated 9%)
  adding: output/APS_38600000R_final_cords.txt (deflated 87%)
  adding: output/APS_38600000R_final_output.pdf (deflated 2%)


In [10]:
# Step-1: splitting pdf file into one PDF per page
pdf_file_name = "APS_38600000R_final.pdf"
pdf_list = highlighter.split_pdf(pdf_file_name)

# Step-2: Extracting text from pdf (one text file per page)
txt_list = highlighter.extract_text_from_pdf(pdf_list)

In [11]:
# Step-3: Searching ICD-10 code
page_code10_dict = highlighter.search_icd_code(txt_list)

In [12]:
code_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")

## Find keyword match

In [None]:
def get_best_match(text_file, keyword):
  """Print the line of ``text_file`` with the highest ``fuzz.ratio`` against ``keyword``.

  The output shows the 1-based line number, the keyword, the partial ratio of
  the winning line and the line itself. Nothing is returned.
  """
  with open(text_file) as f:
    lines = [raw_line.rstrip('\n') for raw_line in f]

  scores = [fuzz.ratio(keyword, candidate) for candidate in lines]
  top_score = max(scores)
  best_match = lines[scores.index(top_score)]
  print(f"{lines.index(best_match) + 1}:{keyword}, {fuzz.partial_ratio(keyword, best_match)}: {best_match}")

In [None]:
with open(f"{txt_list[7]}", "r") as f:
  lines = [line.rstrip('\n') for line in f]
  lines = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
lines

In [None]:
def get_similarity_score(keyword, text_file):
  # load text file
  with open(text_file, "r") as f:
    my_text = f.read()

  # prepare key phrase
  key_phrase_list = []
  for textlist in my_text.split("\n"):
    for key_phrase in textlist.split(","):
      if len(key_phrase) > 0:
        key_phrase_list.append(key_phrase)
  # get max similarity score
  score_list = [SequenceMatcher(None, k_phrase, keyword).ratio() for k_phrase in key_phrase_list]
  max_score = max(score_list)
  # get index position
  index_pos = score_list.index(max_score)
  # get most similar phrase
  most_similar_phrase = key_phrase_list[index_pos]
  return max_score, most_similar_phrase

In [None]:
get_similarity_score("Snoring", txt_list[7])

(0.3333333333333333, ' octigraphy')

In [None]:
with open(f"{txt_list[7]}", "r") as f:
  my_text = f.read()
for keyword in my_text.split(","):
  seq = SequenceMatcher(None, keyword, "Snoring")
  print(f"{round(seq.ratio(), 3)} : {keyword}")
  if round(seq.ratio(), 3) > .70:
    print(f"Max ratio found: {round(seq.ratio(), 3)} : {keyword}")

## Text clean up

In [None]:
with open(f"{txt_list[7]}", "r") as f:
  my_text = f.read()

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(my_text)
sentences = [sentence.text for sentence in doc.sents]

In [None]:
sentences

In [None]:
def text_preprocess(text_file):
  """Split raw text into sentences and return the cleaned, non-empty ones.

  Args:
    text_file: the raw text to clean. Despite the name, the notebook passes the
      page text itself, not a file path.

  Returns:
    A list of cleaned sentences. Punctuation, numbers, non-alphabetic tokens,
    the listed stopwords and tokens of three characters or fewer are removed,
    and runs of four or more identical characters are reduced to one.
  """
  sentence_list = []
  stopwords = ["is", "a"]
  # Parse the argument: this used to parse the global `my_text`, so the
  # function silently ignored whatever text it was given.
  doc = nlp(text_file)
  for sentence in doc.sents:
    clean_text = " ".join(sentence.text.split())  # Remove extra spaces, tabs, and line breaks
    clean_text = re.sub(f"[{re.escape(punctuation)}]", "", clean_text) # Remove punctuation
    clean_text = re.sub(r"\b[0-9]+\b\s*", "", clean_text)     # Remove numbers
    clean_text = " ".join([w for w in clean_text.split() if not w.isdigit()]) # Remove digit-only tokens
    clean_text = " ".join([w for w in clean_text.split() if w.isalpha()]) # Keep alphabetic tokens only
    clean_text = re.sub(r"[^A-Za-z0-9\s]+", "", clean_text) # Remove all special characters and punctuation
    # Remove stopwords and short tokens
    clean_tokens = [t for t in clean_text.split() if t not in stopwords]
    clean_tokens = [t for t in clean_tokens if len(t) > 3]
    clean_text = " ".join(clean_tokens)
    # Remove repeated characters
    clean_text = re.sub(r'(.)\1{3,}',r'\1', clean_text)
    if len(clean_text) > 0:
      sentence_list.append(clean_text)
  return sentence_list

In [None]:
sentence_list = text_preprocess(my_text)
sentence_list

In [None]:
with open(f"{txt_list[7]}", "r") as f:
  lines = [line.rstrip('\n') for line in f]
  sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
sentence_list

In [None]:
def get_best_match(sentence_list, keyword):
  """Print the best match for ``keyword`` and its neighbouring sentences.

  Two reports are printed, one scored with ``fuzz.ratio`` and one with
  ``fuzz.partial_ratio``. Each shows the best scoring sentence plus the
  sentence before and after it, when those exist. Nothing is returned.
  """
  print("#"*10)
  print(f"Matching for : {keyword}")
  print()
  if not sentence_list:
    # nothing to score; max() on an empty list would raise
    return

  def report(label, scorer):
    scores = [scorer(keyword, sentence) for sentence in sentence_list]
    best_index = scores.index(max(scores))
    print(f"{label}Best match: {scores[best_index]} | {sentence_list[best_index]}")
    # Index the neighbours explicitly: "best_index - 1" on the first sentence
    # would wrap around to the last one, and "best_index + 1" on the last
    # sentence would raise IndexError.
    if best_index > 0:
      print(f"{label}Before Match: {scores[best_index - 1]} | {sentence_list[best_index - 1]}")
    if best_index + 1 < len(sentence_list):
      print(f"{label}After Match: {scores[best_index + 1]} | {sentence_list[best_index + 1]}")
    print()

  report("", fuzz.ratio)
  report("Partial ", fuzz.partial_ratio)

In [None]:
keyword_list = [get_keyword(code) for code in clean_icd10_code]
keyword_list

In [None]:
[get_best_match(sentence_list, keyword) for keyword in keyword_list]

In [None]:
def get_best_match(sentence_list, keyword_list):
  """Print, for each keyword, its three best ``fuzz.ratio`` matches in ``sentence_list``."""
  banner = "#"*10
  for keyword in keyword_list:
    print(banner)
    print(f"Matching for : {keyword}")
    top_matches = process.extract(keyword, sentence_list, scorer=fuzz.ratio, limit=3)
    for match_found in top_matches:
      # each entry starts with the matched sentence followed by its score
      print(f"{match_found[0]} | {match_found[1]}")
    print()

In [None]:
get_best_match(sentence_list, keyword_list)
#process.extract(query, choices, scorer = fuzz.partial_ratio, limit = 2)

In [None]:
sentence_list.index("Moderate Obstructive Sleep Apnea .")

25

In [None]:
for idx, sent in enumerate(sentence_list):
  print(f"{idx}>{sent}")

## All Together

In [13]:
def get_keyword(p_code):
  """Return the keyword stored in ``code_df`` for ``p_code``, or "" when the code is unknown.

  The code is normalised with ``reverse_code_pattern`` before the lookup; the
  first matching row wins.
  """
  code = reverse_code_pattern(p_code)
  matching_rows = code_df.loc[code_df["Code"] == code]
  keyword_list = list(matching_rows["Keyword"])
  return keyword_list[0] if len(keyword_list) > 0 else ""

def get_best_match(code10_dict, txt_list, num_page):
  """Print the best ``fuzz.ratio`` matches for every code found on one page.

  Args:
    code10_dict: mapping of page number to the codes found on that page.
    txt_list: page text file paths, indexed by page number.
    num_page: page to inspect.

  For each code's keyword the top score is printed, followed by those of the
  three best matching lines that score above 40. Nothing is returned.
  """
  # Step 1: reverse code pattern
  # (reads the dictionary passed in; the global page_code10_dict was used
  # here before, which made the first argument meaningless)
  clean_icd10_code = [reverse_code_pattern(code) for code in code10_dict[num_page]]
  # Step 2: fetch keyword based on code
  keyword_list = [get_keyword(code) for code in clean_icd10_code]
  # Step 3: prepare sentence list
  with open(f"{txt_list[num_page]}", "r") as f:
    lines = [line.rstrip('\n') for line in f]
    sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks

  # Step 4: get best match
  for keyword in keyword_list:
    print("#"*10)
    print(f"Matching for : {keyword}")
    match_list = process.extract(keyword, sentence_list, scorer = fuzz.ratio, limit = 3)
    if match_list:  # an empty page yields no matches to index into
      print(match_list[0][1])
    for idx, match_found in enumerate(match_list):
      if match_found[1] > 40:
        print(f"{idx}> {match_found[0]} | {match_found[1]}")
    print()

In [None]:
def get_best_match(p_code, num_page):
    """Return the three best ``fuzz.ratio`` matches for the keyword of ``p_code`` on a page.

    The page text is read from the global ``txt_list``.

    Returns:
      ``(match_list, sentence_list)``: the result of ``process.extract`` and
      all whitespace-collapsed lines of the page.
    """
    keyword = get_keyword(reverse_code_pattern(p_code))

    page_path = f"{txt_list[num_page]}"
    with open(page_path, "r") as f:
      raw_lines = [line.rstrip('\n') for line in f]
    # Remove extra spaces, tabs, and line breaks
    sentence_list = [" ".join(raw_line.split()) for raw_line in raw_lines]

    match_list = process.extract(keyword, sentence_list, scorer=fuzz.ratio, limit=3)
    return match_list, sentence_list

In [None]:
clean_icd10_code = [reverse_code_pattern(code) for code in page_code10_dict[7]]
clean_icd10_code

['G47.10', 'G47.31', 'R06.83', 'G47.39', 'G47.33', 'G47.8', 'G47.34', 'R06.3']

In [None]:
match_list, sentence_list = get_best_match("G47.10", 7)
match_list

[('(central sleep apnea (G47.31) CoHypersomnia, unspecified (G47.10)', 55),
 ('Diplomate in ABPN - Sleep Medicine', 45),
 ('Hypoxemia', 42)]

Finding for `Hypersomnia, unspecified`:
44 > Hypoxemia
65 > (central sleep apnea (G47.31) CoHypersomnia, unspecified (G47.10)

In [14]:
with open(f"{txt_list[21]}", "r") as f:
  lines = [line.rstrip('\n') for line in f]
  sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks

In [15]:
def sort_tuple(tup):
  """Return a new list of the items of ``tup`` ordered by their second element, highest first."""
  ordered = list(tup)
  ordered.sort(key=lambda pair: pair[1], reverse=True)
  return ordered

def get_best_token_match(p_code, num_page):
  """Print and return the page lines whose ``token_set_ratio`` with the code's keyword is at least 45.

  The page text is read from the global ``txt_list``.

  Returns:
    A list of ``(sentence, score)`` pairs in page order.
  """
  # Step 1: reverse code pattern
  reversed_icd_code = reverse_code_pattern(p_code)
  # Step 2: fetch keyword based on code
  keyword = get_keyword(reversed_icd_code)
  # Step 3: prepare sentence list
  with open(f"{txt_list[num_page]}", "r") as f:
    lines = [line.rstrip('\n') for line in f]
    sentence_list = [" ".join(line.split()) for line in lines] # Remove extra spaces, tabs, and line breaks
  # Step 4: get best token match ratio
  print(f"Finding for `{keyword}`:")
  match_list = []
  for sentence in sentence_list:
    token_ratio = fuzz.token_set_ratio(keyword, sentence)
    if token_ratio >= 45:
      print(f"{p_code}: {token_ratio} > {sentence}")
      # append keeps the pair together; extend() used to flatten it into two
      # separate list items
      match_list.append((sentence, token_ratio))
  print()
  return match_list

In [None]:
[code for code in page_code10_dict[21]]

['Z78.9',
 '268.25',
 'Z00.00',
 'S93.411A',
 'R50.9',
 '287.5',
 '278.9',
 'R03',
 'R10.32',
 'K46.9',
 'D69.6',
 'Z68.25',
 'R05',
 'J01.00',
 'R53.83',
 'L73.9',
 'E78.3',
 '212.5',
 'M54.5']

In [None]:
[(code, get_keyword(code)) for code in page_code10_dict[21]]

[('S93.411A', 'Sprain of calcaneofibular ligament of right ankle, init'),
 ('J01.00', 'Acute maxillary sinusitis, unspecified'),
 ('Z00.00', 'Encntr for general adult medical exam w/o abnormal findings'),
 ('287.5', 'Personal history of comp of preg, chldbrth and the puerp'),
 ('E78.3', 'Hyperchylomicronemia'),
 ('R10.32', 'Left lower quadrant pain'),
 ('M54.5', 'Low back pain'),
 ('212.5', 'Encounter for screening for malignant neoplasm of prostate'),
 ('R03', 'Abnormal blood-pressure reading, without diagnosis'),
 ('Z68.25', 'Body mass index [BMI] 25.0-25.9, adult'),
 ('R50.9', 'Fever, unspecified'),
 ('R53.83', 'Other fatigue'),
 ('Z78.9', 'Other specified health status'),
 ('K46.9', 'Unspecified abdominal hernia without obstruction or gangrene'),
 ('D69.6', 'Thrombocytopenia, unspecified'),
 ('L73.9', 'Follicular disorder, unspecified'),
 ('278.9', 'Other specified health status'),
 ('R05', 'Cough'),
 ('268.25', 'Body mass index [BMI] 25.0-25.9, adult')]

In [None]:
[get_best_token_match(code, 21) for code in page_code10_dict[21]]

In [21]:
def get_best_token_match(p_code, num_page):
  """Return the page lines matching the keyword of ``p_code``, best first.

  A line is kept when its ``fuzz.token_set_ratio`` against the keyword is above
  40; the score reported for it is its ``fuzz.WRatio``. The page text is read
  from the global ``txt_list``.

  Returns:
    A list of ``(sentence, WRatio score)`` pairs sorted by descending score.
  """
  keyword = get_keyword(reverse_code_pattern(p_code))

  page_path = f"{txt_list[num_page]}"
  with open(page_path, "r") as f:
    raw_lines = [line.rstrip('\n') for line in f]
  # Remove extra spaces, tabs, and line breaks
  sentence_list = [" ".join(raw_line.split()) for raw_line in raw_lines]

  # NOTE(review): filtering uses token_set_ratio while scoring uses WRatio -
  # confirm the mix of scorers is intended.
  match_list = []
  for sentence in sentence_list:
    if fuzz.token_set_ratio(keyword, sentence) > 40:
      match_list.append((sentence, fuzz.WRatio(keyword, sentence)))
  return sort_tuple(match_list)

In [22]:
match_list = [get_best_token_match(code, 21) for code in page_code10_dict[21]]

In [23]:
match_list

[[('Diagnosis Encntr for general adult medical exam w/o abnormal findings (Z00.00), Screening for prostate cancer (212.5)',
   90),
  ('Diagnosis Routine general medical examination at a health care facility (Z00.00), Hypertriglyceridemia, sporadic (E78.3)',
   86)],
 [('Diagnosis Contact with and (suspected) exposureto other viral communicable diseases, Cough (R05)',
   86),
  ('Diagnosis Contact with/suspected exposure to COVID-19, Cough (R05), Fatigue (R53.83), Overweight (BMI 25.0 - 29.9), Body mass index (BMI)',
   57),
  ('Encounter 7 Date 12/12/2019', 56)],
 [('Diagnosis Contact with and (suspected) exposureto COVID-19, Body mass index (BMI) of 25.0-25.9 in adult (Z68.25), Non-smoker (278.9)',
   86),
  ('Diagnosis Contact with/suspected exposure to COVID-19, Cough (R05), Fatigue (R53.83), Overweight (BMI 25.0 - 29.9), Body mass index (BMI)',
   86),
  ('Diagnosis Folliculitis (L73.9), Non-smoker (Z78.9), Overweight (BMI 25.0 - 29.9), Body mass index (BMI) of 26.0-25.9 in adult 

In [None]:
match_list = get_best_token_match("R50.9", 21)
match_list

[('Diagnosis Fever (R50.9)', 47),
 ('Diagnosis Fever (R50.9), Sinusitis, acute, maxillary', 45)]

In [16]:
fuzz.token_set_ratio(get_keyword("E78.3"), "Diagnosis Hypertriglyceridemia, sporadic (E78.3)")

34

In [17]:
fuzz.token_set_ratio(get_keyword("R50.9"), "Diagnosis Fever (R50.9), Sinusitis, acute, maxillary")

45

In [18]:
fuzz.token_set_ratio("Fever, unspecified", "Diagnosis Fever (R50.9), Sinusitis, acute, maxillary")

45

In [19]:
fuzz.token_set_ratio(get_keyword("R50.9"), "Diagnosis Fever (R50.9)")

47

In [20]:
fuzz.WRatio(get_keyword("E78.3"), "Diagnosis Hypertriglyceridemia, sporadic (E78.3)")

50

## Fuzzy

In [None]:
!pip install fuzzywuzzy

In [None]:
from fuzzywuzzy import fuzz

In [None]:
str1 = 'This document printed distributed unauthorized purpose whatsoever'
str2 = 'Other disorders of amniotic fluid and membranes'

ratio = fuzz.ratio(str1, str2)
partial_ratio = fuzz.partial_ratio(str1, str2)

print(ratio)
print(partial_ratio)

30
28


In [None]:
for keyword in sentence_list:
  ratio = fuzz.partial_ratio(keyword, "Snoring")
  print(f"{round(ratio, 3)} : {keyword}")

43 : Redacted person nine Male North Texas Neuroscience Cente
43 : Sunil Mathews
43 : NeTessc NerthyTaxas Neurmacienee sinap Gantmr Diplomate American Board Psychiatry Neurolagy ABPN Diplomate ABPN Clinical Neurophysiology EMGEEG
29 : Diplomate ABPN Sleep Medicine INTERPRETATION SLEEP STUDY
14 : Chart Number
29 : Height Male Weight Referring Physiclan Sunil Mathews Epworth Score Neck Size Clinical Reason Study Suspected Sleep Apnea Study Type WatchPAT Study Date MethodsTechnique standards American Acadenty Sleep
71 : medicine were followd monitoring patient
86 : Home Sleep Test includedrerordingof peripheral arterial tone oxygen saturation heart rata octigraphy body position snoring
29 : RBland were caleuleted from this data
14 : Tatal slecp awake time NREM sleap were assassad
29 : Minimal oxygen saturation Moderate Obstructive Sleep Apnea
100 : Severe Related Respiratory Events Hypoxemia Loud Disruptive Snoring glalsslelelselelslelelssleles Obesity Reduced Sleep Ffficiancy minutes hur

In [None]:
def get_best_match(text_file, keyword):
  """Print and return the line of ``text_file`` with the highest ``fuzz.ratio`` against ``keyword``.

  The partial ratio of the winning line is printed first, then the line itself.

  Returns:
    The best matching line, without its trailing newline.
  """
  with open(text_file) as f:
    names = [line.rstrip('\n') for line in f]
  ratios = [fuzz.ratio(keyword, name) for name in names]
  best_match = names[ratios.index(max(ratios))]
  # score against the keyword argument; this line used to read the global
  # `name_to_match`, which is unrelated to the argument and may not exist
  print(fuzz.partial_ratio(keyword, best_match))
  print(best_match)
  return best_match

In [None]:
get_best_match(txt_list[7], "Primary central sleep apnea")

34
               APS.Extract® All Rights Reserved


'               APS.Extract® All Rights Reserved'

In [None]:
with open(txt_list[7]) as f:
    names = [line.rstrip('\n') for line in f]
#for names in lines:
#    print (names[0])
    icdkeywords = ['Obstructive sleep apnea','Primary central sleep apnea']
    #name_to_match = 'Sleep related hypoventilation'
    #name_to_match = 'ZZZZZ'
    #print ("Names:", names)
    #print ("name to match:", name_to_match)
    #ratios = [fuzz.ratio(name_to_match, names)]
    for name_to_match in icdkeywords:
        ratios = [fuzz.ratio(name_to_match, name) for name in names]
        print(ratios)
        print(max(ratios))
        best_match = names[ratios.index(max(ratios))]
        print(fuzz.partial_ratio(name_to_match, best_match))
        print(fuzz.token_set_ratio(name_to_match, best_match))
        print(best_match)

[3, 0, 19, 1, 9, 0, 0, 3, 3, 4, 4, 13, 6, 11, 3, 3, 1, 2, 7, 15, 10, 1, 15, 1, 9, 23, 1, 3, 6, 5, 3, 12, 1, 7, 13, 4, 15, 12, 2, 0, 8, 11, 19, 13, 8, 19, 5, 12, 7, 17, 2, 22, 10, 13, 15, 10, 5, 8, 4, 5, 5, 4, 2, 3, 20, 1, 20]
23
91
100
                                                                                                    Moderate Obstructive Sleep Apnea                       .
[3, 0, 21, 1, 7, 0, 0, 5, 4, 4, 3, 9, 10, 13, 5, 6, 2, 6, 11, 14, 15, 2, 19, 2, 14, 16, 2, 5, 10, 6, 3, 4, 1, 4, 13, 6, 11, 11, 6, 1, 7, 7, 13, 14, 7, 17, 13, 15, 9, 17, 6, 15, 15, 14, 20, 11, 7, 9, 3, 7, 6, 4, 2, 5, 27, 4, 22]
27
34
34
               APS.Extract® All Rights Reserved


In [None]:
# Remove extra spaces, tabs, and line breaks
clean_text = " ".join(my_text.split())
clean_text

In [None]:
# Remove punctuation
clean_text = re.sub(f"[{re.escape(punctuation)}]", "", clean_text)
clean_text

In [None]:
# Remove numbers
clean_text = re.sub(r"\b[0-9]+\b\s*", "", clean_text)
clean_text

In [None]:
# Remove digits
clean_text = " ".join([w for w in clean_text.split() if not w.isdigit()]) # Side effect: removes extra spaces
clean_text

In [None]:
# Remove non-alphabetic characters
clean_text = " ".join([w for w in clean_text.split() if w.isalpha()]) # Side effect: removes extra spaces
clean_text

In [None]:
# Remove all special characters and punctuation
clean_text = re.sub(r"[^A-Za-z0-9\s]+", "", clean_text)
clean_text

In [None]:
# Remove stopwords from a list
stopwords = ["is", "a"]
tokens = clean_text.split()
clean_tokens = [t for t in tokens if not t in stopwords]
clean_text = " ".join(clean_tokens)

In [None]:
clean_text

In [None]:
# Remove short tokens
tokens = clean_text.split()
clean_tokens = [t for t in tokens if len(t) > 1]
clean_text = " ".join(clean_tokens)
clean_text

In [None]:
# Remove repeated characters
clean_text = re.sub(r'(.)\1{3,}',r'\1', clean_text)
clean_text