In [None]:
%load_ext watermark

In [None]:
%watermark -a "Ruiyu Hu" -d -v -m

In [None]:
# standard library
import glob
import json
import operator
# create inverse index
from collections import Counter, defaultdict

# tokenize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# read the pdf file
import PyPDF2 

**Create tokenizer**

In [None]:
def clean_token(text):
    """Normalise raw text into a list of index terms.

    Steps, in order: lower-case the whole string, split it with NLTK's
    default word tokenizer, drop English stopwords, lemmatize each remaining
    token with WordNet, and finally keep only lemmas made up entirely of
    alphabetic characters.

    Args:
        text: The raw text to process.

    Returns:
        A list of cleaned tokens in their original order (duplicates kept).
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Load the stopword list once per call instead of once per token, and use
    # a set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))

    cleaned = []
    for word in word_tokenize(text.lower()):  # case-folding, then default tokenizer
        if word in english_stopwords:
            continue
        lemma = lemmatizer.lemmatize(word)
        # The alphabetic check is applied to the lemma, not the raw token.
        if lemma.isalpha():
            cleaned.append(lemma)
    return cleaned

def tokenize(path):
    """Extract the text of every page of a PDF and return its cleaned tokens.

    Args:
        path: Location of the PDF file; it is converted with ``str()`` before
            being opened.

    Returns:
        A flat list with the ``clean_token`` output of each page, concatenated
        in page order.
    """
    # NOTE(review): PdfFileReader / extractText are the legacy PyPDF2 names
    # (newer releases call them PdfReader / extract_text) - confirm against the
    # installed PyPDF2 version.
    # The text is pulled out while the file is still open, since the reader
    # works from the underlying stream; the handle is then closed reliably.
    with open(str(path), "rb") as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        page_texts = [page.extractText() for page in pdf.pages]

    # Clean each page separately and flatten the results into one token list.
    tokens = []
    for page_text in page_texts:
        tokens.extend(clean_token(page_text))
    return tokens

In [None]:
#lst = tokenize('../data/documents/resume test.pdf')
#set(lst)

**Create index**

In [None]:
def get_file_names():
    """Return the paths of all PDF files in ``../data/documents/``.

    The pattern is relative to the current working directory. ``glob`` yields
    matches in arbitrary order, so the result is sorted to make index
    generation reproducible from run to run.

    Returns:
        A sorted list of matching path strings (empty if nothing matches).
    """
    return sorted(glob.glob("../data/documents/*.pdf"))

def make_index(tokens, document_name, index, length):
    """Add one document's postings and length to the indexes (in place).

    Args:
        tokens: List of the document's tokens (duplicates included).
        document_name: Identifier stored in the postings and used as the key
            in ``length``.
        index: Mapping term -> list of ``[document_name, term_count]``
            postings; must create an empty list for unseen terms (the caller
            passes a ``defaultdict(list)``).
        length: Mapping document_name -> document length, updated in place.

    Returns:
        None. ``index`` and ``length`` are mutated.
    """
    # Count every term in a single pass instead of calling tokens.count()
    # once per distinct term.
    term_counts = Counter(tokens)
    for term, count in term_counts.items():
        index[term].append([document_name, count])

    # Recorded once, outside the loop, so that a document with no tokens still
    # gets an entry (length 0) and is counted as part of the corpus.
    # NOTE(review): the stored length is the number of DISTINCT terms, as in
    # the original code; BM25 length normalisation is usually defined on the
    # total token count (len(tokens)) - confirm which one is intended.
    length[document_name] = len(term_counts)

def write(inverted_index,length_index):
    """Serialise both indexes as JSON files under ``../data/indexes/``.

    Args:
        inverted_index: JSON-serialisable mapping written to
            ``inverted_index.json``.
        length_index: JSON-serialisable mapping written to
            ``length_index.json``.

    Returns:
        None.
    """
    # Context managers guarantee the files are flushed and closed before any
    # later cell reads them back.
    with open("../data/indexes/inverted_index.json","w") as inv_index_file:
        json.dump(inverted_index,inv_index_file)

    with open("../data/indexes/length_index.json","w") as length_index_file:
        json.dump(length_index,length_index_file)
    
def generator():
    """Build the inverted and length indexes for every PDF and write them out.

    Each path returned by ``get_file_names`` is tokenized and fed to
    ``make_index``; the two resulting indexes are then handed to ``write``
    and a confirmation message is printed.
    """
    inverted_index, length_index = defaultdict(list), defaultdict(list)
    for pdf_path in get_file_names():
        document_tokens = tokenize(pdf_path)
        # The file path doubles as the document identifier in both indexes.
        make_index(document_tokens, pdf_path, inverted_index, length_index)
    write(inverted_index, length_index)
    print ("Indexes generated")

In [None]:
# Build both indexes from the PDFs and write them to ../data/indexes/,
# where search() reads them back.
generator()

**Create retrieval: the BM25 weighting scheme**

In [None]:
from math import log

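'''
https://en.wikipedia.org/wiki/Okapi_BM25
BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document,
regardless of the inter-relationship between the query terms within a document.

The BM25 score contribution of one query term for one document is calculated as
((k2 + 1)q)/(k2 + q) * ((k1 + 1)f)/(K + f) * log(((r + 0.5)(N − n − R + r + 0.5))/((n − r + 0.5)(R − r + 0.5)))

where f is the term's frequency in the document, q its frequency in the query, n the number of documents
containing the term, N the total number of documents, r and R the relevance counts (both 0 here),
and K = k1 * ((1 − b) + b * docLen / avDocLen).
'''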

# Provisional BM25 parameter values (still being tuned)
k1 = 1.2   # scales the document term-frequency component (used in getK and BM25)
b = 0.75   # weight of document-length normalisation in getK (0 disables it)
k2 = 100   # scales the query term-frequency component in BM25
R = 0 #Since no relevance info is available

# MAIN METHOD

def BM25(docLen, avDocLen, n, N, f, q, r):
    """Return the BM25 score contribution of one query term for one document.

    Args:
        docLen: Length of the document being scored.
        avDocLen: Average document length over the corpus.
        n: Number of documents containing the term.
        N: Total number of documents in the corpus.
        f: Frequency of the term in the document.
        q: Frequency of the term in the query.
        r: Number of known relevant documents containing the term.

    Returns:
        The product of the query-frequency weight, the length-normalised
        document-frequency weight and the log relevance/IDF weight. The value
        is negative when the term occurs in more than about half of the
        documents (with r = R = 0).
    """
    query_weight = ((k2 + 1) * q) / (k2 + q)
    doc_weight = ((k1 + 1) * f) / (getK(docLen, avDocLen) + f)
    # The logarithm must cover the whole ratio; taking log of the numerator
    # only and then dividing yields a different (incorrect) weight.
    relevance_weight = log(
        ((r + 0.5) * (N - n - R + r + 0.5)) / ((n - r + 0.5) * (R - r + 0.5))
    )
    return query_weight * doc_weight * relevance_weight

def getK(docLen, avDocLen):
    """Return the length-normalisation factor K = k1 * ((1 - b) + b * docLen / avDocLen).

    Raises:
        ZeroDivisionError: If ``avDocLen`` is zero.
    """
    return k1 * ((1 - b) + b * (float(docLen) / float(avDocLen)))

**Create ranker**

In [None]:
def get_avdl(length_index):
    """Return the average document length as a float.

    Args:
        length_index: Mapping of document name -> numeric document length.

    Raises:
        ZeroDivisionError: If ``length_index`` is empty.
    """
    total_length = sum(length_index.values())
    document_count = len(length_index)
    return float(total_length) / float(document_count)

def search(query):
    """Rank the indexed documents against ``query`` with BM25.

    The indexes are loaded from the JSON files under ``../data/indexes/``.
    The query is normalised with ``clean_token`` - the same pipeline used for
    the indexed text - so that its terms can match the index keys.

    Args:
        query: Free-text query string.

    Returns:
        A list of ``(document_name, score)`` tuples sorted by score, highest
        first. Documents containing none of the query terms are omitted; the
        list is empty if the corpus is empty or no term matches.
    """
    with open("../data/indexes/inverted_index.json","r") as inv_index_file:
        inverted_index = json.load(inv_index_file)

    with open("../data/indexes/length_index.json","r") as length_index_file:
        length_index = json.load(length_index_file)

    if not length_index:
        # Nothing indexed: avoid averaging over zero documents.
        return []

    # Corpus-wide values are the same for every posting, so compute them once.
    total_docs = len(length_index)
    avdl = get_avdl(length_index)

    # A repeated query term is scored once, with its count as query frequency.
    query_term_counts = Counter(clean_token(query))

    scores = defaultdict(float)
    for term, query_freq in query_term_counts.items():
        # A term absent from the index simply contributes nothing.
        postings = inverted_index.get(term, [])
        doc_freq = len(postings)
        for doc_name, term_freq in postings:
            # Sum the contributions of all query terms; assigning here would
            # keep only the last matching term's score.
            scores[doc_name] += BM25(length_index[doc_name], avdl, doc_freq,
                                     total_docs, term_freq, query_freq, 0)
    return sorted(scores.items(),key=operator.itemgetter(1),reverse=True)

In [None]:
def matching(keyword):
    """Print every ``search(keyword)`` result, one per line, in ranked order."""
    for ranked_hit in search(keyword):
        print(ranked_hit)

# Example query: print the ranked search results for these terms.
keyword = 'machine learning'
matching(keyword)