In [1]:
from similarity_measures import preprocessing, cosine

from pdfstructure.hierarchy.parser import HierarchyParser
from pdfstructure.source import FileSource
from pdfstructure.printer import JsonFilePrinter
import json
import pathlib

import numpy as np
import re

In [2]:
def text_on_page(dict_var, id_json, list_res, page):
    """
    Recursively collect the "text" of every dict whose [id_json] entry equals [page].

    [dict_var]: node of the parsed JSON tree; anything that is not a dict is ignored
    [id_json]: key under which a node stores its page number (e.g. "page")
    [list_res]: list the matching texts are appended to (mutated in place)
    [page]: page number to match against the value stored under [id_json]
    return:
        [list_res]: the very same list object that was passed in
    raises:
        KeyError if a matching dict has no "text" entry
    """
    if not isinstance(dict_var, dict):
        return list_res
    for k, v in dict_var.items():
        if k == id_json and v == page:
            # The node itself sits on the requested page: keep its text.
            list_res.append(dict_var["text"])
        elif isinstance(v, dict):
            text_on_page(v, id_json, list_res, page)
        elif isinstance(v, list):
            # Only dict items are descended into; the recursive call ignores
            # everything else (including lists nested directly in lists).
            for item in v:
                text_on_page(item, id_json, list_res, page)
    return list_res


def get_page(data, page):
    """
    Gather the texts found on [page] across all top-level elements.

    [data]: parsed JSON document; its "elements" entry is iterated
    [page]: page number handed to text_on_page as the value to match
    return:
        flat list of the texts text_on_page finds, in element order
    """
    return [
        text
        for element in data["elements"]
        for text in text_on_page(element, "page", [], page)
    ]

In [3]:
# PDF to index, plus the first and last page to extract (both inclusive; the
# parsing cell converts them to 0-based indices via range(start - 1, end)).
file = 'pdfs/Nurse.pdf'
start, end = 30, 77

In [5]:
# Restrict parsing to the configured page range (presumably FileSource expects
# 0-based page indices, hence start - 1 -- confirm against pdfstructure).
source = FileSource(file, page_numbers=[*range(start - 1, end)])
parser = HierarchyParser()
document = parser.parse_pdf(source)

# Write the parsed hierarchy to pdf.json in the working directory; the call is
# left as the cell's last expression so its return value is displayed.
file_path = pathlib.Path('pdf.json')
printer = JsonFilePrinter()
printer.print(document, file_path=str(file_path.absolute()))

'c:\\Users\\james\\Documents\\Cornell\\2021SP\\CS4300\\Project\\CS4300_microGoogle\\pdf.json'

In [6]:
# Read back the JSON written by the printer. The handle gets its own name so
# the `file` path string from the configuration cell is not overwritten
# (re-running the parsing cell would otherwise receive a closed file object).
# The `with` block closes the handle, so no explicit close() is needed.
with open('pdf.json') as json_file:
    data = json.load(json_file)

# Key each page's paragraphs by its page number in the PDF; get_page itself is
# given the page's offset inside the extracted range (0 .. end - start).
pages = {start + offset: get_page(data, offset) for offset in range(end - start + 1)}

In [7]:
# Spot-check the last extracted page; keyed by `end` rather than a literal so
# the cell keeps working when the page range is changed.
pages[end]

['03doenges-03  2/2/04  11:57 AM  Page 59',
 'EXAMPLE 2. SAMPLE OF FOCUS CHARTING®\nFOR PROTOTYPE PLAN OF CARE (CONTINUED)',
 'D (cid:2) DATA A (cid:2) ACTION',
 'R (cid:2) RESPONSE',
 'DATE',
 'TIME',
 'FOCUS®',
 'The  following  is  an  example  of documentation  of a  client\nneed/concern that currently does not require identification as a\nclient  problem  (nursing  diagnosis)  or  inclusion  in  the  plan  of\ncare  and  therefore  is  not  easily  documented  in  the  SOAP\nformat:\n6/29/00',
 '2020',
 'Gastric \ndistress',
 'R: R. S. more confident in',
 'demonstration, performed\nactivity correctly without\nhesitation or hand tremors.\nHe explained steps of proce-\ndure and reasons for actions\nto wife.\nCouple identified resources \nto contact if questions/\nproblems arise.',
 'D: Awakened from light sleep\nby “indigestion/burning\nsensation.”\nPlaces hand over epigastric\narea. Skin warm/dry, color\npink, vital signs unchanged.',
 'A: Given Mylanta 30 ml PO.',
 'Head of bed e

In [9]:
def get_formatted_docs(pages, max_paragraphs = 0):
    """
    Format the pages extracted from pdf, by removing excessive whitespaces
    but preserving punctuations, capital cases, etc.

    [pages]: Dict{page_num: List[paragraph_text_string]}
    [max_paragraphs]: maximum number of paragraphs allowed per page; if the actual number of
                      paragraphs exceeds this number, the page's words are regrouped into at most
                      [max_paragraphs] paragraphs of (nearly) equal word count to improve performance.
                      A page with fewer words than [max_paragraphs] collapses into one paragraph.
                      if = 0, then no merging of paragraphs
    return:
        [formatted_docs]: Dict{paragraph_idx: paragraph_text_string}
        [paragraph_page_idxs]: Dict{paragraph_idx: page_num}
    """
    paragraphs = []
    for page_num, raw_paragraphs in pages.items():
        # Re-join words that were hyphenated across a line break ("proce-\ndure").
        arr = [re.sub(r'-\s+', '', s) for s in raw_paragraphs]
        # Collapse every run of line breaks, tabs and spaces into a single space.
        arr = [re.sub(r'\s+', ' ', s) for s in arr]
        if 0 < max_paragraphs < len(arr):
            words = ' '.join(arr).split()
            if len(words) < max_paragraphs:
                arr = [' '.join(words)]
            else:
                # Ceiling division: rounding the chunk size down would yield
                # more than max_paragraphs chunks whenever there is a remainder.
                k = int(-(-len(words) // max_paragraphs))
                arr = [' '.join(words[i:i+k]) for i in range(0, len(words), k)]
        paragraphs.extend((page_num, s) for s in arr)
    # Paragraph indices are global across pages, in page iteration order.
    formatted_docs = {i: text for i, (_, text) in enumerate(paragraphs)}
    paragraph_page_idxs = {i: page_num for i, (page_num, _) in enumerate(paragraphs)}
    return (formatted_docs, paragraph_page_idxs)

In [8]:
# Clean the extracted paragraphs, preprocess them, and fit a TF-IDF model with
# one row per paragraph (converted to a dense array).
formatted_docs, paragraph_page_idx = preprocessing.get_formatted_docs(pages)
preprocessed_docs = preprocessing.get_preprocessed_docs(formatted_docs)
tfidf_vectorizer = cosine.get_tfidf_vectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    [*preprocessed_docs.values()]
).toarray()

In [11]:
# Score every paragraph against a free-text query with the TF-IDF vectors and
# show the ranked paragraphs together with their page numbers.
# NOTE(review): the printed "cosine score" values exceed 1, so they are
# presumably not normalized cosines -- confirm in cosine.get_cosine_sim.
query = 'blurry vision'
q = cosine.get_query_vector(query, tfidf_vectorizer)
cos_sims = cosine.get_cosine_sim(q, tfidf_matrix)
rankings, scores = cosine.get_rankings(cos_sims)
cosine.display_rankings(rankings, scores, formatted_docs, paragraph_page_idx)

1,   cosine score: 4.026041911102011,   page: 61
Tingling/numbness: feet, once or twice a week (as noted) Eyes: Vision loss, farsighted, “Seems a little blurry now”


2,   cosine score: 4.009762252212964,   page: 39
Impaired vision/hearing:


3,   cosine score: 3.713524163659459,   page: 37
Eyes: Vision loss:


4,   cosine score: 2.572486048838581,   page: 47
Back problems: Changes in moles: Impaired vision: Prosthesis:


5,   cosine score: 1.0355248425920365,   page: 62
Allergies: 0 Sexually transmitted disease: none Fractures/dislocations: L clavicle, 1966, fell getting off tractor Arthritis/unstable joints: “I think I’ve got some in my knees.” Back problems: occ. lower back pain Vision impaired: requires glasses for reading Hearing impaired: slightly (R), compensates by turning “good


6,   cosine score: 0.0,   page: 45
Memory: Immediate: Comprehension: Thought processes (assessed through speech): Patterns of


7,   cosine score: 0.0,   page: 45
Person:


8,   cosine score: 0.0,   p

In [None]:
##### SVD: repeat the ranking using the SVD factors (U, s, Vh) of the TF-IDF matrix #####

In [10]:
# Rebuild the TF-IDF matrix from the extracted pages, then factor it with
# cosine.get_svd; the three factors feed the similarity call in the next cell.
formatted_docs, paragraph_page_idx = preprocessing.get_formatted_docs(pages)
preprocessed_docs = preprocessing.get_preprocessed_docs(formatted_docs)
tfidf_vectorizer = cosine.get_tfidf_vectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    [*preprocessed_docs.values()]
).toarray()
U, s, Vh = cosine.get_svd(tfidf_matrix)

In [11]:
# Same query as the plain TF-IDF run, but similarities are computed from the
# SVD factors instead of the raw TF-IDF matrix.
query = 'blurry vision'
q = cosine.get_query_vector(query, tfidf_vectorizer)
cos_sims = cosine.get_cosine_sim(q, U, s, Vh)
rankings, scores = cosine.get_rankings(cos_sims)
cosine.display_rankings(rankings, scores, formatted_docs, paragraph_page_idx)

1,   cosine score: 3.4758427315734775,   page: 37
Eyes: Vision loss:


2,   cosine score: 3.0439042754372907,   page: 61
Tingling/numbness: feet, once or twice a week (as noted) Eyes: Vision loss, farsighted, “Seems a little blurry now”


3,   cosine score: 2.6498260024222255,   page: 39
Impaired vision/hearing:


4,   cosine score: 2.222298445618994,   page: 47
Back problems: Changes in moles: Impaired vision: Prosthesis:


5,   cosine score: 1.11716860303346,   page: 62
Allergies: 0 Sexually transmitted disease: none Fractures/dislocations: L clavicle, 1966, fell getting off tractor Arthritis/unstable joints: “I think I’ve got some in my knees.” Back problems: occ. lower back pain Vision impaired: requires glasses for reading Hearing impaired: slightly (R), compensates by turning “good


6,   cosine score: 0.5087887332715003,   page: 39
When:


7,   cosine score: 0.5032480051211793,   page: 54
N U R S E ’ S P O C K E T G U I D E


8,   cosine score: 0.4235593654943664,   page: 52
263