In [None]:
%load_ext watermark

In [None]:
%watermark -a "Ruiyu Hu" -d -v -m

In [31]:
# read the pdf file
import PyPDF2 

# tokenize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# create inverse index
from collections import defaultdict

import glob
import json
import operator
from collections import Counter

#for google api 
# set up env locally
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'C:\Users\RayHu\ruiyu-gcp-4ac10836d3b1.json' 

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types


import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer


import warnings
warnings.filterwarnings('ignore')



**Create Tokenize**

In [205]:
def clean_token(text):
    #porter = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = text.lower() # case-folding (of the whole text string)
    tokens = word_tokenize(tokens) # default tokenizer
    tokens = [w for w in tokens if w not in stopwords.words('english')] # filter English stopwords
    #tokens = [w for w in tokens if len(w) > 2]
    #tokens = [porter.stem(tok) for tok in tokens] # apply stemmer
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
    tokens = [w for w in tokens if w.isalpha()] # filter tokens that contain non-alphabetic character(s)
    return tokens

def tokenize(path):
    # open PDF
    pdf = PyPDF2.PdfFileReader(open(str(path),"rb"))
    #stopword_list = list(stopwords.words("english"))

    #read PDF file in a list
    pdf_content = []
    for page in pdf.pages:
        pdf_content.append(page.extractText())
    
    # create a list of token
    
    tokens = [None] * len(pdf_content)
    for i in range(len(pdf_content)):
        tokens[i] = clean_token(pdf_content[i])
    tokens = [t for tok in tokens for t in tok] 
    
    return tokens

**create index**

In [206]:
def get_file_names():
    files = []
    #'../data/solarhrm*.pdf'
    for file in glob.glob("../data/documents/*.pdf"):
    #for file in glob.glob('../data/documents/*.txt'):
        files.append(file)
    return files

def make_index(tokens, document_name, index, length):
    for term in set(tokens):
        index[term].append([document_name,tokens.count(term)])
        length[document_name] = len(set(tokens))

# saving index into json        
def write(inverted_index,length_index):
    inv_index_file = open("../data/indexes/inverted_index.json","w")
    json.dump(inverted_index,inv_index_file)

    length_index_file = open("../data/indexes/length_index.json","w")
    json.dump(length_index,length_index_file)
    
def generator():
    resume_files = get_file_names()
    inverted_index = defaultdict(list)
    length_index = defaultdict(list)
    for file in resume_files:
        make_index(tokenize(file), file, inverted_index, length_index)
    write(inverted_index,length_index)
    print ("Indexes generated")

In [207]:
dirname, filename = os.path.split(path)

Indexes generated


**create retrieval-The BM25 Weighting Scheme**

**Formula**


For query Q and document d, we have BM25 d of Q:

$ score(D,Q)=\frac{(k_2+1)qf_i}{k_2+qf_i}\times \frac{(k_1+1)\times f_i}{f_i+K}\times ln(\frac{(r_i+0.5)/(R-r_i+0.5)}{(n_i-r_i+0.5)/(N-n_i-R+r_i+0.5)}+1)$

> Reference: Elasticsearch and IR Text Book Fomula 11.33

* $r_i$ is the # of relevant documents containing term i 
* $n_i$  is the # of docs containing term i
* $N$ is the total # of docs in the collection
* $R$ is the number of relevant documents for this query  (set to 0 if no relevancy info is known)
* $f_i$  is the frequency of term i in the doc under consideration
* $qf_i$ is the frequency of term i in the query
* $k_1$ determines how the tf component of the term weight changes as $f_i$
  increases. (if 0, then tf component is ignored.) 
* $k_2$ Typical values make the equation less sensitive to k2 than k1 because query term frequencies are much lower and less variable than doc term frequencies.
* $K$ It equals to ($k_1(1-b+b\times l_d/avg\_l)$). Its role is basically to normalize the tf component by document length.
* $b$ regulates the impact of length normalization. (0 means none; 1 is full normalization.) 
* $ln$: if $n_i$ > $\frac{N}{2}$, the result of $ln$ could be negative. Therefore, we plus 1 here


In [352]:
from math import log

'''
IR Book: 11.4.3
Fomula: 11.33
'''
'''
typical TREC value (Text Retrieval Conference (TREC).)
f1 = 1.2
k2 varies from 0 to 1000
b = 0.75
'''

k1 = 1.2
b = 0.75
k2 = 100
R = 0 # (set it to 0 since no relevancy info is known)


def BM25(docLen, avDocLen, n, N, f, q, r):
    '''
        #docLen, avDocLen, n, N, f, q, r
        #𝑛  is the # of docs containing term i
        #N is the total # of docs in the collection
        #f is the frequency of term i in the doc under consideration
        #𝑞𝑓𝑖  is the frequency of term i in the query
        #𝑟𝑖  is the # of relevant documents containing term i
    '''
    p1 = ((k2 + 1) * q) / (k2 + q) #Relevance between term and query
    p2 = ((k1 + 1) * f) / (getK(docLen, avDocLen) + f) #Relevance between term and document
    p3 = log((((r + 0.5)/(R-r+0.5)) / ((n - r + 0.5)/(N - n - R + r + 0.5)))+1) # Term Weight
    return p1 * p2 * p3

def getK(docLen, avDocLen):
    return k1 * ((1 - b) + b * (float(docLen) / float(avDocLen)))

**create Ranker**

In [379]:
# get average document length
def get_avdl(length_index):
    corpus_length = 0
    for document in length_index:
        corpus_length += length_index[document]
    return float(corpus_length) / float(len(length_index))

import math
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

def search(query):
    inv_index_file = open("../data/indexes/inverted_index.json","r")
    inverted_index = json.load(inv_index_file)

    length_index_file = open("../data/indexes/length_index.json","r")
    length_index = json.load(length_index_file)

    scores = defaultdict(list)
    query_tokens = query.split()
    for token in query_tokens:
    #for token in query:
        if token in inverted_index.keys():
            for entry in inverted_index[token]:
                bm25_val = BM25(length_index[entry[0]],get_avdl(length_index),len(inverted_index[token]),len(length_index),entry[1],1,0)
                scores[entry[0].replace('../data/documents\\','')] = round((10* sigmoid(bm25_val)-5),2)
    result = sorted(scores.items(),key=operator.itemgetter(1),reverse=True)
    #result = sorted(norm_scores.items(), key = operator.itemgetter(1), reverse = True)
    return result

In [380]:
def matching(keyword):
    #generator()
    results = search(keyword)[:5]
    
    for result in results:
        print(result)

In [381]:
results = matching('Experience with supervised and unsupervised machine learning algorithms, and ensemble methods, such as: K-Means, PCA, Regression, Neural Networks, Decision Trees, Gradient Boosting')

('ml005.pdf', 2.73)
('ml002.pdf', 2.68)
('ml001.pdf', 2.54)
('ds004.pdf', 2.4)
('ds002.pdf', 2.32)


## Jd side

In [None]:
def clean_jd_1(dirt):
    '''
    Method 1: only keep noun in the job description
    '''
    lst = tokenize(dirt)
    
    client = language.LanguageServiceClient()
    # part-of-speech tags from list(enums.PartOfSpeech.Tag)
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')
    tags = ['NOUN']
    
    output = []
    for _ in lst:
        #output =[]
        #doc = ' '.join(_)
        document = language.types.Document(content = _, type=enums.Document.Type.PLAIN_TEXT)
        tokens = client.analyze_syntax(document).tokens
        for token in tokens:
            if pos_tag[token.part_of_speech.tag] in tags:
                output.append(token.text.content)
                
    c = Counter(output)
    query = [key for key, val in c.most_common(20)]
    
    return query

In [212]:
def clean_jd_2(dirt):
    '''
    method 2: terms sorted by tf idf weights
    '''
    doc = tokenize(dirt)
    
    cv=CountVectorizer(stop_words=stopwords.words('english'))
    word_count_vector=cv.fit_transform(doc)
    
    #calculate the weights for each term in each document
    tfidf_transformer=TfidfTransformer()
    tf_idf_vector = tfidf_transformer.fit_transform(word_count_vector)
    #the top 20 terms by average tf-idf weight
    weights = np.asarray(tf_idf_vector.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': cv.get_feature_names(), 'weight': weights}).sort_values(by='weight', ascending=False).head(20)
    return weights_df.term.tolist()

## Search

In [213]:
dirt = '../data/job_description/ml-jd-adobe.pdf'

In [216]:
# method 1
#search(clean_jd_1(dirt))

In [215]:
# method 2
print(search(clean_jd_2(dirt)))

[('../data/documents\\da002.pdf', 1.9554483101714493), ('../data/documents\\ds002.pdf', 1.939375031370122), ('../data/documents\\ml001.pdf', 1.5029566680437219), ('../data/documents\\de005.pdf', 1.170008090489953), ('../data/documents\\de001.pdf', 0.0), ('../data/documents\\de004.pdf', 0.0), ('../data/documents\\ds001.pdf', 0.0), ('../data/documents\\ds003.pdf', 0.0), ('../data/documents\\ds004.pdf', 0.0), ('../data/documents\\ml002.pdf', 0.0), ('../data/documents\\da003.pdf', 0.0), ('../data/documents\\da001.pdf', 0.0), ('../data/documents\\ml004.pdf', 0.0), ('../data/documents\\ds005.pdf', -0.17746847325794762), ('../data/documents\\da004.pdf', -0.18782871493816702), ('../data/documents\\ml003.pdf', -0.2026139425178263), ('../data/documents\\de003.pdf', -0.2162306887947264), ('../data/documents\\de002.pdf', -0.2892254673064674), ('../data/documents\\da005.pdf', -0.33071276322252563), ('../data/documents\\ml005.pdf', -2.9816561435328213)]


## approach 2 test

In [None]:
jd = pd.read_json('../data/jd/data_scientist.json').T
jd.head()

In [None]:
from pprint import pprint
pprint(jd.posting[0])

In [None]:
jd.posting[1]

In [None]:
pprint(jd.posting[2])

In [None]:
pprint(jd.posting[13])

In [None]:
from textblob import TextBlob

In [None]:
test = TextBlob("Python is a high-level, general-purpose machine learning language.")

In [None]:
test.tags

In [None]:
test.noun_phrases

In [None]:
test.sentiment