In [6]:
file = './Nurse.pdf'

In [7]:
import pandas as pd
import numpy as np
import pdftotext
import re
import string
import matplotlib.pyplot as plt

In [8]:
import pdfplumber as pdf

In [9]:
with pdf.open(file) as raw:
    pdf_as_text = pdftotext.PDF(raw)

AttributeError: 'PDF' object has no attribute 'read'

In [10]:
with open(file, "rb") as f:
    pdf = pdftotext.PDF(f)

In [11]:
text_file_out = "\n\n".join(pdf)

In [12]:
file1 = open("pdftotext_result.txt","w")

In [13]:
file1.writelines(text_file_out)

In [14]:
def clean_string(text):
    '''
    Pre-process text
    input: string
    output: string
    '''
    # TODO better cleaning/pre-processing
    sub = ''
    text = text.lower()
    text = re.sub(',', ' ', text)
    text = re.sub('-', ' ', text)
    text = re.sub('\[.*?\]', sub, text) #brackets
    text = re.sub('[%s]' % re.escape(string.punctuation), sub, text) #punctions
    text = re.sub('\w*\d\w*', sub, text) #digits
    text = re.sub('[’’“”…]', sub, text) #quotes
    text = re.sub('\n', ' ', text) #newlines
    text = re.sub('♪', sub, text) #symbols
    text = re.sub('–', sub, text) #dashes
    return text

In [15]:
def pdf_to_listOfDicts(pdf):
    result = []
    for ind,i in enumerate(pdf):
        cleaned_string =  clean_string(i)
        result.append({'page':ind,'text':cleaned_string,'toks':list(set(cleaned_string.split()))})
    return result

In [18]:
pdf[0]

''

In [16]:
listOfDicts = pdf_to_listOfDicts(pdf)

In [17]:
listOfDicts

[{'page': 0, 'text': '', 'toks': []},
 {'page': 1,
  'text': '    gordons functional health                      patterns through  health perception health management pattern health maintenance  ineffective  therapeutic regimen effective management  therapeutic regimen ineffective management  therapeutic regimen readiness for enhanced management  therapeutic regimen family  ineffective management  therapeutic regimen community  ineffective management  noncompliance specify  health seeking behaviors specify  energy field  disturbed  falls  risk for  infection  risk for  injury trauma  risk for  protection  ineffective  poisoning  risk for  suffocation  risk for  perioperative positioning injury  risk for  sudden infant death syndrome  nutritional metabolic pattern nutrition more than body requirements  imbalanced  nutrition more than body requirements  risk for imbalanced  nutrition less than body requirements  imbalanced  nutrition  readiness for enhanced  breastfeeding  ineffective  b

In [10]:
pdf[0]

''

# Q1 Write a function to construct the inverted index (Code Completion)

In [59]:
def build_inverted_index(data):
    result = {}
    for ind, msg in enumerate(data):
        for token in set(msg['toks']):
            msg_count = msg['toks'].count(token)
            if token not in result.keys():
                result[token] = [(ind, msg_count)]
            else:
                result[token].append((ind, msg_count))
    return result

In [62]:
inv_idx = build_inverted_index(listOfDicts)

# Q2 Using the inverted index for boolean search

In [63]:
def boolean_search(query_word,excluded_word, inverted_index):
    return list(set(np.vstack(inverted_index[query_word.lower()])[:,0]).difference(set(np.vstack(inverted_index[excluded_word.lower()])[:,0])))


In [64]:
boolean_search("community","behaviors",inv_idx)

[3,
 6,
 526,
 15,
 19,
 532,
 533,
 22,
 534,
 536,
 25,
 538,
 544,
 550,
 41,
 563,
 54,
 567,
 56,
 570,
 574,
 63,
 102,
 121,
 131,
 134,
 144,
 149,
 169,
 179,
 180,
 181,
 182,
 183,
 213,
 725,
 221,
 233,
 237,
 753,
 756,
 245,
 757,
 761,
 763,
 255,
 770,
 783,
 279,
 287,
 291,
 294,
 300,
 317,
 326,
 327,
 329,
 339,
 342,
 344,
 355,
 356,
 360,
 368,
 372,
 375,
 378,
 382,
 389,
 390,
 393,
 398,
 401,
 412,
 413,
 414,
 420,
 439,
 462,
 466,
 474,
 477,
 503]

# Q3 Compute IDF *using* the inverted index (Code Completion)

In [66]:
len(listOfDicts)

788

In [65]:
def compute_idf(inv_idx, n_docs, min_df=15, max_df_ratio=0.90):
    id_frequencies = {}
    for term, documents in inv_idx.items():
        ratio = float(len(documents))/n_docs
        if ratio > max_df_ratio or min_df > len(documents):
            continue
        else:
            id_frequencies[term] = np.log2(n_docs/(len(documents)+1.))
    return id_frequencies

In [68]:
idf_dict = compute_idf(inv_idx, len(listOfDicts))

In [70]:
idf_dict

{'diarrhea': 4.229734396677616,
 'allergy': 5.622051819456376,
 'failure': 3.229734396677616,
 'bowel': 3.8146968973987723,
 'behaviors': 1.7765617685120012,
 'infection': 2.8018728570411886,
 'hyperthermia': 4.264499814838293,
 'volume': 2.85386749467945,
 'syndrome': 2.8018728570411886,
 'effective': 3.452126818014064,
 'more': 2.85386749467945,
 'community': 2.840692105931717,
 'body': 1.7640708243288041,
 'pattern': 2.345927414182139,
 'response': 1.5776577000979228,
 'ineffective': 2.0221389772692486,
 'for': 0.16057237217022044,
 'poisoning': 5.03708931873522,
 'integrity': 3.1301987231267017,
 'seeking': 4.921612101315284,
 'energy': 3.2822018165717517,
 'm': 5.5345889782060365,
 'trauma': 2.764070824328804,
 'falls': 5.452126818014064,
 'constipation': 4.814696897398772,
 'oral': 3.764070824328804,
 'from': 1.7955033321654612,
 'deficient': 2.5135273626782073,
 'patterns': 2.85386749467945,
 'imbalanced': 3.4125984538274268,
 'aspiration': 5.162620200819079,
 'temperature': 3.1

# Q4 Compute the norm of each document using the inverted index (Code Completion)

In [71]:
def compute_doc_norms(index, idf, n_docs):
    eq = lambda i, freq: np.square(i * np.array(freq))
    doc_norms = np.zeros(n_docs)
    for term, i in idf.items():
        doc, frequency = zip(*index[term])
        val = eq(i, frequency)
        for j in range(len(doc)):
            doc_norms[doc[j]] += val[j]
    doc_norms = np.sqrt(doc_norms)
    return doc_norms

In [72]:
doc_norms = compute_doc_norms(inv_idx, idf_dict, len(listOfDicts))

In [73]:
doc_norms

array([ 0.        , 33.3333087 , 35.61706138, 31.11514866,  6.403369  ,
        0.        , 11.30612901, 31.02152288,  3.17619844,  0.        ,
       28.0885967 ,  0.        , 21.94840427, 12.3155842 , 34.36123748,
       39.11814645, 36.8667065 , 27.25129961, 37.56636341, 42.16997651,
       35.2782095 , 35.04823904, 30.70002214, 32.87852083, 29.12498866,
       31.07468315, 27.49071422, 36.41561136, 28.39479681, 33.3363959 ,
       39.4528242 , 23.5957079 , 20.99415792, 28.03361044, 32.52631257,
       27.34337987, 29.38907402, 29.77458614, 30.88185836, 23.21575254,
       27.23452792, 33.73640306, 17.88203527, 30.11605016, 31.93596956,
       12.33954983, 33.56469393, 30.14042616, 32.33005938, 21.14318147,
       33.48920532, 31.09052699, 31.70409145, 33.86252973, 29.32114854,
       22.49270032, 32.26718688, 34.57493499, 35.25997382, 29.5155821 ,
       34.43632382, 32.64475723, 35.68008155, 33.09568081, 31.20497474,
       32.62462288, 34.02576297, 38.57842079, 34.16396371, 34.87

# Q5 Find the most similar messages to the quotes (Code Completion)

In [82]:
from nltk.tokenize import TreebankWordTokenizer
treebank_tokenizer = TreebankWordTokenizer()

In [83]:
def index_search(query, index, idf, doc_norms, tokenizer=treebank_tokenizer):
    tokens = tokenizer.tokenize(query.lower())
    q_term_freq = {token: tokens.count(token) for token in set(tokens)}
    norm = 0
    for t, f in q_term_freq.items():
        if t in idf.keys():
            norm = norm+np.square(f*idf[t])
    norm = np.sqrt(norm)
    
    results = np.zeros((np.shape(doc_norms)))
    if norm != 0:
        for t,f in q_term_freq.items():
            if t in idf.keys():
                ID, freq = zip(*index[t])
                num = f * idf[t]**2 * np.array(freq)
                for ind, d in enumerate(ID):
                    results[d] += num[ind]
        results = [r/(norm*doc_norms[ind]) if (r!=0 and doc_norms[ind] != 0) else 0 
                   for ind, r in enumerate(results)]
        results = sorted(list(zip(results, range(len(results)))), key= lambda l:l[0], reverse=True)
        return results
    
    else:
        return list(zip(results, range(len(results))))

In [126]:
query = "many years ago the nursing profession"

In [127]:
treebank_tokenizer.tokenize(query.lower())

['many', 'years', 'ago', 'the', 'nursing', 'profession']

In [128]:
ind_search = index_search(query, inv_idx, idf_dict, doc_norms)
ind_search

[(0.08030545787928534, 8),
 (0.017994055699712616, 755),
 (0.016270631367490414, 407),
 (0.01607313737355034, 32),
 (0.01537432882003561, 12),
 (0.015334000500833703, 561),
 (0.014680255116177723, 748),
 (0.014465136360897853, 539),
 (0.014396193676235139, 575),
 (0.014300990067858135, 31),
 (0.014294673177708791, 222),
 (0.014294389400598077, 225),
 (0.014253444776302582, 532),
 (0.014129579232983818, 534),
 (0.01385602279817329, 595),
 (0.013806061857709074, 354),
 (0.013737477225955207, 330),
 (0.013713634580604005, 360),
 (0.013623152459393604, 86),
 (0.013561784257643527, 164),
 (0.013486482371127735, 241),
 (0.013463369018376695, 511),
 (0.013457136949763669, 80),
 (0.01344770179049802, 450),
 (0.013443597054149515, 300),
 (0.013423699559451666, 546),
 (0.013326933342044205, 340),
 (0.01326253183978096, 494),
 (0.013248488716876783, 143),
 (0.013197443291071198, 434),
 (0.013130466548285744, 348),
 (0.013100041503679249, 295),
 (0.013097665968797335, 288),
 (0.013072399029930595,

In [131]:
def best_page_for(query):
    confidence,page_number = index_search(query, inv_idx, idf_dict, doc_norms)[0]
    text=listOfDicts[page_number]['text']
    print(f'confidence:{confidence}, page:{page_number}, text:{text}')

In [132]:
best_page_for(query)

confidence:0.08030545787928534, page:8, text:                                       contributor sheila marquez executive director vice presidentchief operating officer the colorado sids program  inc denver  colorado 
