In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import io
import re
from nltk.corpus import stopwords

In [3]:
def get_pages(txt_file):
    """
    Read a utf8 text file and cut it into numbered, non-blank chunks.

    The text is split on blank lines ('\\n\\n'); newlines remaining inside a
    chunk are replaced by spaces, and chunks that are empty or contain only
    whitespace are discarded before numbering.

    [txt_file]: path to txt file
    return:
        Dict{page_num: page_text_string}, numbered consecutively from 1
    """
    with io.open(txt_file, encoding = 'utf8') as handle:
        raw_text = handle.read()

    # Flatten each chunk onto a single line.
    flattened = (re.sub('\n', ' ', chunk) for chunk in raw_text.split('\n\n'))

    # TODO doc_idx needs to correspond to pdf page_num
    kept = [text for text in flattened if text != '' and not text.isspace()]

    return {page_num: text for page_num, text in enumerate(kept, start = 1)}

# Load the raw text and split it into chunks keyed 1..N.
# NOTE(review): the file name suggests pdftotext output -- confirm the source.
pages = get_pages('../streamlit_testing/pdftotext_result.txt')

In [4]:
def preprocess(pages):
    """
    Pre-process the raw pages.

    Each page is lowercased, every run of characters outside [0-9a-z] is
    replaced by a space, and the remaining tokens are filtered: tokens that
    are not purely alphabetic, tokens shorter than 2 characters and English
    stopwords are dropped. Surviving tokens are re-joined with single spaces.

    [pages]: Dict{page_num: page_text_string}
    return:
        Dict{page_num: page_text_string} with the same keys as [pages]
    """
    if not pages:
        # Nothing to process; skip loading the stopword corpus.
        return {}

    # Loop-invariant: build the stopword set once instead of once per page.
    stopwords_set = set(stopwords.words('english'))

    processed_docs = {}
    for page_num, text in pages.items():
        # replace non-alphanumeric chars with space, then tokenize
        tokens = re.sub('[^0-9a-z]+', ' ', text.lower()).split(' ')
        kept = [
            w for w in tokens
            if w.isalpha()                  # remove non-letter tokens
            and len(w) >= 2                 # remove tokens below a certain length
            and w not in stopwords_set      # remove stopwords
        ]
        # TODO stemming

        processed_docs[page_num] = ' '.join(kept)
    return processed_docs

# Cleaned text per page; preprocess() keeps the keys of `pages`, so
# docs have same indices as pages
docs = preprocess(pages)

In [5]:
def get_tfidf_vectorizer(max_df = 0.9, min_df = 1, max_features = None):
    """
    Build an (unfitted) TfidfVectorizer.

    [max_df], [min_df], [max_features]: forwarded unchanged to the
        TfidfVectorizer constructor as keyword arguments
    return:
        tf-idf vectorizer
    """
    options = {
        'max_df': max_df,
        'min_df': min_df,
        'max_features': max_features,
    }
    return TfidfVectorizer(**options)

# Fit the vectorizer on the pre-processed pages. Rows follow the order of
# docs.values(); .toarray() converts the result to a dense array so it can be
# used with plain numpy operations later on.
tfidf_vectorizer = get_tfidf_vectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(list(docs.values())).toarray()

In [6]:
def get_query_vector(query, tfidf_vectorizer):
    """
    Represent a free-text query as a binary vector over the fitted vocabulary.

    The query is tokenized with the vectorizer's own analyzer so it receives
    the same normalisation as the documents the vectorizer was fitted on.
    Terms are looked up in the fitted `vocabulary_` mapping (term -> column
    index) rather than through `get_feature_names()`, which no longer exists
    in scikit-learn >= 1.2. Terms not in the vocabulary are ignored.

    [query]: string
    [tfidf_vectorizer]: tfidf vectorizer after fit_transform
    return:
        1d numpy.array of length = num_features(tfidf_vectorizer) representing the query as binary vector
    """
    # TODO non-binary representation of query vector
    vocabulary = tfidf_vectorizer.vocabulary_
    analyze = tfidf_vectorizer.build_analyzer()

    query_vec = np.zeros((len(vocabulary), ))
    for term in analyze(query):
        idx = vocabulary.get(term)
        if idx is not None:
            query_vec[idx] = 1

    if not np.any(query_vec): # query vector is all zeros
        print('invalid query') # TODO better way to notify user
    return query_vec

In [7]:
def get_cosine_sim(query_vec, tfidf_matrix):
    """
    Score every document against the query.

    [query_vec]: query vector of shape (num_features, )
    [tfidf_matrix]: tf-idf matrix of shape (num_docs, num_features)
    return:
        1d numpy array of shape (num_docs, ) containing cosine similarity scores for query with each doc
    note: norm(query) is removed from equation since it's constant for all docs
    note: a document whose tf-idf row is all zeros gets a score of 0 rather
        than NaN (0/0); NaN scores would be sorted last by np.argsort and so
        be treated as the best matches by a ranking built on these scores.
    """
    norms_docs = np.linalg.norm(tfidf_matrix, axis = 1)
    dot_prods = np.asarray(np.dot(tfidf_matrix, query_vec), dtype = float)

    # Divide only where the norm is non-zero; other entries keep the 0 from `out`.
    sims = np.zeros_like(dot_prods)
    np.divide(dot_prods, norms_docs, out = sims, where = norms_docs > 0)
    return sims

In [13]:
def get_ranked_page_nums(cos_sims, top = 10, page_nums = None):
    """
    Rank pages by similarity score, best first.

    [cos_sims]: cosine similarity scores of shape (num_docs, )
    [top]: how many top results are returned (0 or less returns none)
    [page_nums]: optional sequence of page numbers aligned with [cos_sims];
        when omitted, the keys of the notebook-level `docs` dictionary are
        used, which is what this function always did before
    return:
        [rankings]: ranked list of page numbers based on similarity
        [scores]: cosine similarity scores
    """
    if page_nums is None:
        # Backward-compatible fallback to the notebook's global document dict.
        page_nums = docs.keys()

    cos_sims = np.asarray(cos_sims)
    # Reverse first, then slice: unlike [-top:], this yields nothing for top == 0.
    inds = np.argsort(cos_sims)[::-1][:max(top, 0)]
    rankings = np.array(list(page_nums))[inds]
    scores = cos_sims[inds]
    return (rankings, scores)

In [14]:
# End-to-end example: vectorize the query, score it against every page and
# print (page_num, score) pairs for the best matches, highest score first
# (get_ranked_page_nums returns at most 10 results by default).
query = 'many years ago the nursing profession'
q = get_query_vector(query, tfidf_vectorizer)
cos_sims = get_cosine_sim(q, tfidf_matrix)
(rankings, scores) = get_ranked_page_nums(cos_sims)
for item in zip(rankings, scores):
    print(item)

(18, 0.5733842783319805)
(19, 0.3181071361441875)
(20, 0.3175908459382807)
(437, 0.27549381331335565)
(63, 0.2553825998026559)
(203, 0.23825996680561531)
(16, 0.2035434348548798)
(10, 0.18878403232224875)
(21, 0.1813544599727478)
(210, 0.17317707032765625)


In [None]:
# TODO use json to get paragraphs as documents