In [1]:
# Relative path of the PDF that a later cell opens and parses.
file = './Nurse.pdf'

In [22]:
import pandas as pd
import numpy as np
import numpy.linalg as npl
import pdftotext
import re
import string
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Parse the PDF from a binary file handle. The resulting `pdf` object is
# iterated page by page in a later cell, after this handle has been closed.
with open(file, "rb") as f:
    pdf = pdftotext.PDF(f)

In [7]:
def clean_string(text):
    '''
    Normalise one page of raw text before tokenisation.

    The text is lower-cased, then stripped of bracketed spans, ASCII
    punctuation, words containing a digit and a few typographic symbols.
    Commas, hyphens and newlines are each replaced by a single space so the
    neighbouring words stay separated; everything else is deleted outright.
    Runs of spaces are not collapsed.

    input: string
    output: string
    '''
    # TODO better cleaning/pre-processing
    sub = ''
    text = text.lower()
    text = re.sub(',', ' ', text)
    text = re.sub('-', ' ', text)
    # Raw strings below: '\[' and '\w' are invalid escapes in a plain literal.
    text = re.sub(r'\[.*?\]', sub, text)  # bracketed spans ('.' does not cross a newline)
    text = re.sub('[%s]' % re.escape(string.punctuation), sub, text)  # ASCII punctuation
    text = re.sub(r'\w*\d\w*', sub, text)  # any word containing a digit
    text = re.sub('[‘’“”…]', sub, text)  # curly quotes (left and right) and ellipsis
    text = re.sub('\n', ' ', text)  # newlines
    text = re.sub('♪', sub, text)  # symbols
    text = re.sub('–', sub, text)  # en dashes (deleted, so the words on either side join)
    return text

In [8]:
def pdf_to_listOfDicts(pdf):
    """Turns each page of a PDF into a record of cleaned text and unique tokens.

    Params: {pdf: iterable of page strings (e.g. a pdftotext.PDF)}
    Returns: List of Dicts, one per page in page order, with keys
             'page' (0-based position in the iteration),
             'text' (the page passed through clean_string) and
             'toks' (unique whitespace-separated tokens of 'text', sorted)
    """
    result = []
    for page_number, page_text in enumerate(pdf):
        cleaned_string = clean_string(page_text)
        # sorted() rather than list(set(...)): iteration order of a set of
        # strings changes between interpreter runs, which made 'toks'
        # irreproducible across kernel restarts.
        unique_tokens = sorted(set(cleaned_string.split()))
        result.append({'page': page_number,
                       'text': cleaned_string,
                       'toks': unique_tokens})
    return result

In [9]:
# One record per PDF page, with keys 'page', 'text' and 'toks'.
listOfDicts = pdf_to_listOfDicts(pdf)

# Question 1 (Code Completion): Using scikit-learn to get tf-idf vectors

In [11]:
# Show the cleaned text of every page.
[record['text'] for record in listOfDicts]

['',
 '    gordons functional health                      patterns through  health perception health management pattern health maintenance  ineffective  therapeutic regimen effective management  therapeutic regimen ineffective management  therapeutic regimen readiness for enhanced management  therapeutic regimen family  ineffective management  therapeutic regimen community  ineffective management  noncompliance specify  health seeking behaviors specify  energy field  disturbed  falls  risk for  infection  risk for  injury trauma  risk for  protection  ineffective  poisoning  risk for  suffocation  risk for  perioperative positioning injury  risk for  sudden infant death syndrome  nutritional metabolic pattern nutrition more than body requirements  imbalanced  nutrition more than body requirements  risk for imbalanced  nutrition less than body requirements  imbalanced  nutrition  readiness for enhanced  breastfeeding  ineffective  breastfeeding  effective  breastfeeding  interrupted  in

In [23]:
def build_vectorizer(max_n_terms=5000, max_prop_docs=0.8, min_n_docs=10):
    """Constructs the (unfitted) TfidfVectorizer used to embed the documents.

    Params: {max_n_terms: Integer, forwarded as max_features,
             max_prop_docs: Float, forwarded as max_df,
             min_n_docs: Integer, forwarded as min_df}
    Returns: TfidfVectorizer configured with English stop words and L2 norm
    """
    vectorizer_options = dict(
        max_features=max_n_terms,
        stop_words="english",
        min_df=min_n_docs,
        max_df=max_prop_docs,
        norm='l2',
    )
    return TfidfVectorizer(**vectorizer_options)

In [24]:
# Fit the vectorizer on the cleaned page texts and keep the result as a
# dense array (converted from the sparse output via .toarray()).
tfidf_vec = build_vectorizer()
tfidf_mat = tfidf_vec.fit_transform(
    [record['text'] for record in listOfDicts]
).toarray()

In [25]:
# Map each tf-idf column index to its vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
index_to_vocab = dict(enumerate(
    tfidf_vec.get_feature_names_out()
    if hasattr(tfidf_vec, 'get_feature_names_out')
    else tfidf_vec.get_feature_names()
))

In [26]:
# Display the column-index -> term mapping.
index_to_vocab

{0: 'abandonment',
 1: 'abdomen',
 2: 'abdominal',
 3: 'abgs',
 4: 'abilities',
 5: 'ability',
 6: 'able',
 7: 'abnormal',
 8: 'abnormalities',
 9: 'absence',
 10: 'absent',
 11: 'abuse',
 12: 'accept',
 13: 'acceptable',
 14: 'acceptance',
 15: 'accepted',
 16: 'access',
 17: 'accessory',
 18: 'accident',
 19: 'according',
 20: 'accumulation',
 21: 'accurate',
 22: 'achieve',
 23: 'achieving',
 24: 'acidosis',
 25: 'acknowledge',
 26: 'act',
 27: 'acting',
 28: 'action',
 29: 'actions',
 30: 'actionsinterventions',
 31: 'active',
 32: 'activi',
 33: 'activities',
 34: 'activity',
 35: 'activityexercise',
 36: 'activityrest',
 37: 'actual',
 38: 'acuity',
 39: 'acute',
 40: 'acutechronic',
 41: 'adaptation',
 42: 'adaptive',
 43: 'added',
 44: 'addiction',
 45: 'addition',
 46: 'additional',
 47: 'address',
 48: 'addressed',
 49: 'adequate',
 50: 'adjust',
 51: 'adjustment',
 52: 'adls',
 53: 'administer',
 54: 'administration',
 55: 'adolescence',
 56: 'adolescent',
 57: 'adult',
 58:

# Question 2: Cosine similarity

In [None]:
def get_cos_sim(mov1, mov2, input_doc_mat,
                movie_name_to_index=None):
    """Returns the cosine similarity of two documents' row vectors.

    Params: {mov1: String, key of the first document,
             mov2: String, key of the second document,
             input_doc_mat: np.ndarray, indexed by row to get a document vector,
             movie_name_to_index: Dict mapping each key to its row index in
                 input_doc_mat. When omitted, the module-level
                 `movie_name_to_index` is looked up at call time.}
    Returns: Float; 0.0 when either row has zero norm
    Raises: ValueError if no mapping is passed and no module-level one exists;
            KeyError if mov1 or mov2 is missing from the mapping
    """
    if movie_name_to_index is None:
        # Late-bind the global rather than capturing it as a default value:
        # a default is evaluated when the cell defining the function runs, so
        # the old signature raised NameError if the mapping did not exist yet.
        movie_name_to_index = globals().get('movie_name_to_index')
        if movie_name_to_index is None:
            raise ValueError(
                "movie_name_to_index was not passed and no module-level "
                "movie_name_to_index is defined")

    vec_1 = input_doc_mat[movie_name_to_index[mov1]]
    vec_2 = input_doc_mat[movie_name_to_index[mov2]]

    den = npl.norm(vec_1) * npl.norm(vec_2)
    if den == 0:
        # An all-zero row has no direction; dividing would give nan (0/0).
        # Report "no similarity" instead.
        return 0.0
    return float((vec_1 @ vec_2) / den)