In [1]:
from similarity_measures import preprocessing, cosine

from pdfstructure.hierarchy.parser import HierarchyParser
from pdfstructure.source import FileSource
from pdfstructure.printer import JsonFilePrinter
import json
import pathlib

import numpy as np
import re

In [2]:
def text_on_page(dict_var, id_json, list_res, page):
    """Recursively collect the "text" of every dict tagged with the given page.

    Walks `dict_var` depth-first. Whenever a dict has an entry whose key is
    `id_json` and whose value equals `page`, that dict's "text" value is
    appended to `list_res`. Nested dicts, and items of nested lists, are
    searched the same way.

    dict_var: node to search; anything that is not a dict is ignored.
    id_json:  key whose value identifies the page (e.g. "page").
    list_res: list the matches are appended to (mutated in place).
    page:     value to compare the `id_json` entry against.

    Returns `list_res`, so the call can be used as an expression.
    Raises KeyError if a matching dict has no "text" entry.
    """
    # isinstance (rather than an exact type check) also accepts dict subclasses.
    if not isinstance(dict_var, dict):
        return list_res
    for key, value in dict_var.items():
        if key == id_json and value == page:
            list_res.append(dict_var["text"])
        elif isinstance(value, dict):
            text_on_page(value, id_json, list_res, page)
        elif isinstance(value, list):
            # Lists nested directly inside lists are not descended into.
            for item in value:
                text_on_page(item, id_json, list_res, page)
    return list_res


def get_page(data, page):
    """Return every text fragment in `data["elements"]` tagged with `page`.

    Each top-level element is searched with `text_on_page` using the "page"
    key; the per-element results are concatenated in element order.
    """
    return [
        text
        for element in data["elements"]
        for text in text_on_page(element, "page", [], page)
    ]

In [3]:
# Input PDF to index.
file = 'pdfs/Nurse.pdf'
# First and last page to extract (inclusive); the parser cell passes
# range(start-1, end) and the resulting pages dict is keyed start..end.
start, end = 25, 50

In [4]:
# TODO
# Parse the selected pages of the PDF and dump the resulting structure to pdf.json.
parser = HierarchyParser()
# range(start-1, end) covers pages start..end — presumably FileSource expects
# 0-based page indices; TODO confirm against the pdfstructure docs.
source = FileSource(file, page_numbers=list(range(start-1, end)))
document = parser.parse_pdf(source)
printer = JsonFilePrinter()
# Relative path: the JSON lands in the notebook's current working directory.
file_path = pathlib.Path('pdf.json')
# Last expression of the cell, so whatever print() returns is shown as the cell output.
printer.print(document, file_path=str(file_path.absolute()))

'c:\\Users\\james\\Documents\\Cornell\\2021SP\\CS4300\\Project\\CS4300_microGoogle\\pdf.json'

In [5]:
# TODO
# Load the parsed document back from the JSON file written by the parser cell.
# The handle gets its own name so the global `file` (the PDF path) is not
# overwritten with a closed file object, which would break re-running the
# parser cell. The `with` block closes the handle; no explicit close() needed.
with open('pdf.json') as json_file:
    data = json.load(json_file)
# get_page is called with 0-based offsets (0..end-start); the resulting dict
# is keyed by the page numbers start..end.
pages = {i + start: get_page(data, i) for i in range(end - start + 1)}

In [6]:
# Inspect the paragraphs extracted for page 25 (the first page of the range, i.e. `start`).
pages[25]

['02doenges-02  2/2/04  11:56 AM  Page 7',
 'Table 2–1. NURSING DIAGNOSES',
 'Infection, risk for 307–310\nInjury, risk for\nInjury, risk for perioperative positioning 313–316\nIntracranial Adaptive Capacity, decreased 316–319',
 '310–313',
 'Knowledge, deficient [Learning Need] (specify)\n*Knowledge (specify), readiness for enhanced 323–325',
 '319–323',
 'Loneliness, risk for 326–328',
 'Memory, impaired 328–331\nMobility, impaired bed 331–333\nMobility, impaired physical 333–337\nMobility, impaired wheelchair 337–339',
 '(cid:2)Nausea 339–343',
 'Noncompliance [Adherence, ineffective] [specify] 343–347\nNutrition: imbalanced, less than body requirements\nNutrition: imbalanced, more than body requirements 352–355\nNutrition: imbalanced, risk for more than body requirements',
 '347–352',
 '356–358',
 '*Nutrition, readiness for enhanced 359–362',
 'Oral Mucous Membrane, impaired 362–365',
 '374–377',
 'Pain, acute 365–369\nPain, chronic 370–374\nParental Role Conflict\nParenting, impai

In [40]:
def get_formatted_docs(pages, max_paragraphs = 0):
    """
    Format the pages extracted from a pdf by removing excessive whitespace
    while preserving punctuation, capitalization, etc.

    [pages]: Dict{page_num: List[paragraph_text_string]}
    [max_paragraphs]: maximum number of paragraphs allowed per page; if the actual number of
                      paragraphs exceeds this number, the first max_paragraphs - 1 paragraphs
                      are kept and all remaining ones are merged into a single paragraph.
                      If <= 0, paragraphs are never merged.
    return:
        [formatted_docs]: Dict{paragraph_idx: paragraph_text_string}
        [paragraph_page_idxs]: Dict{paragraph_idx: page_num}
        Paragraph indices are consecutive integers starting at 0, in page order.
    """
    paragraphs = []  # (page_num, paragraph_text) pairs, in page order
    for page_num, raw_paragraphs in pages.items():
        # Raw-string patterns: a bare backslash-s inside a normal string literal
        # is an invalid escape sequence and triggers a warning on recent Pythons.
        # Re-join words that were hyphenated across a line break.
        arr = [re.sub(r'-\s+', '', s) for s in raw_paragraphs]
        # Collapse line breaks, tabs and runs of whitespace into single spaces.
        arr = [re.sub(r'\s+', ' ', s) for s in arr]
        if 0 < max_paragraphs < len(arr):
            # TODO better merging
            head = arr[:max_paragraphs - 1]
            # Join the overflow on a newline and re-normalize, so adjoining or
            # empty paragraphs never leave more than one space between words.
            tail = re.sub(r'\s+', ' ', '\n'.join(arr[max_paragraphs - 1:]))
            arr = head + [tail]
        paragraphs.extend((page_num, s) for s in arr)
    formatted_docs = {i: text for i, (_, text) in enumerate(paragraphs)}
    paragraph_page_idxs = {i: page_num for i, (page_num, _) in enumerate(paragraphs)}
    return (formatted_docs, paragraph_page_idxs)

In [60]:
# Format the extracted pages with the notebook-local get_formatted_docs, capping each page at 5 paragraphs.
docs, idx = get_formatted_docs(pages, max_paragraphs = 5)

In [61]:
# Position of the shortest paragraph; docs is keyed 0..n-1 in insertion order, so the position is also its key.
np.argmin([len(d) for d in docs.values()])

38

In [64]:
# Spot-check a single formatted paragraph by its index.
docs[33]

'Interventions are designed to specify the action of the nurse, the client, and/or SOs. Interventions need to promote the client’s movement toward health/independence in addition to achievement of physiologic stability. This requires involvement of the client in his or her own care, including participation in decisions about care activities and projected outcomes.'

In [7]:
# Build the TF-IDF matrix for the extracted pages.
# NOTE: this calls the packaged preprocessing.get_formatted_docs (without
# max_paragraphs), not the get_formatted_docs function defined in this notebook.
(formatted_docs, paragraph_page_idx) = preprocessing.get_formatted_docs(pages)
preprocessed_docs = preprocessing.get_preprocessed_docs(formatted_docs)
tfidf_vectorizer = cosine.get_tfidf_vectorizer()
# One row per preprocessed paragraph, in dict order; .toarray() presumably
# densifies a sparse matrix — TODO confirm the vectorizer type.
tfidf_matrix = tfidf_vectorizer.fit_transform(list(preprocessed_docs.values())).toarray()

In [8]:
# Rank the paragraphs against a free-text query and print the results.
query = 'many years ago the nursing profession'
# Vectorize the query with the vectorizer fitted on the paragraphs above.
q = cosine.get_query_vector(query, tfidf_vectorizer)
cos_sims = cosine.get_cosine_sim(q, tfidf_matrix)
# Presumably ordered best match first, as the printed output suggests — TODO confirm in cosine.get_rankings.
(rankings, scores) = cosine.get_rankings(cos_sims)
cosine.display_rankings(rankings, scores, formatted_docs, paragraph_page_idx)

1,   cosine score: 0.7692868436690928,   page: 41
Years in relationship:


2,   cosine score: 0.49085131502080787,   page: 27
Table 2–1. NURSING DIAGNOSES


3,   cosine score: 0.49085131502080787,   page: 25
Table 2–1. NURSING DIAGNOSES


4,   cosine score: 0.43523447746603067,   page: 28
The key to accurate diagnosis is collection and analysis of data. In Chapter 3, the NDs have been categorized into divisions (Diagnostic Divisions: Nursing Diagnoses Organized According to a Nursing Focus, Section 2) and an assessment tool designed to assist the nurse to identify appropriate NDs as the data are collected. Nurses may feel at risk in committing themselves to documenting a nursing diagnosis for fear they might be wrong. However, unlike medical diagnoses, NDs can change as the client progresses through various stages of illness/maladaptation to resolution of the condition/situation.


5,   cosine score: 0.41722324588943355,   page: 26
Table 2–1. NURSING DIAGNOSES (CONTINUED)


6,   cosine