In [2]:
%load_ext watermark
%watermark -a "Ruiyu Hu" -d -v -m

Ruiyu Hu 2019-03-18 

CPython 3.6.7
IPython 7.0.1

compiler   : MSC v.1915 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 58 Stepping 9, GenuineIntel
CPU cores  : 4
interpreter: 64bit


In [3]:
# read the pdf file
import PyPDF2 

# tokenize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob

# create inverse index

import math
import glob
import json
import operator
from collections import Counter,defaultdict



import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer


import warnings
warnings.filterwarnings('ignore')

**Create Tokenize**

In [4]:
#%%file tokenize.py
def get_file_names():
    files = []
    for file in glob.glob("../data/documents/*.pdf"):
        files.append(file)
    return files
#get_file_names()
###########################################################################
def convert(file):
    pdf_content = []

    pdf = PyPDF2.PdfFileReader(open(str(file),"rb"))
    # pdf may be more than one page
    num_pages = pdf.numPages
    count = 0
    text = ''
    while count < num_pages:
        pageObj = pdf.getPage(count)
        count +=1
        text += pageObj.extractText().replace('\n','')
    if text != '':
        text = text
            
    pdf_content.append(text)   
    return pdf_content

'''def convert(file):
    text = []
    with open(str(file), 'rb') as f:
        for line in f.readlines():
            text.append(line.decode("utf-8", "ignore").strip())
    return text'''

###########################################################################     
def tokenize(lst):            
    # create a list of token
    
    tokens = [None] * len(lst)
    for i in range(len(lst)):
        tokens[i] = clean_token(lst[i])
    tokens = [t for tok in tokens for t in tok] 
    return tokens

########################################################################### 
def clean_token(text):
    #porter = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    #snowball = nltk.SnowballStemmer('english')
        
    stopset = set(stopwords.words('english'))
    stopset.update(('less','year'))
    
    noun_lst = []

    for word,tag in (TextBlob(text).tags):
        if tag in ("NN", "NNS", "NNP", "NNPS","JJ"):
            word = word.lower()
            word = lemmatizer.lemmatize(word)
            #word = porter.stem(word)
            #word = snowball.stem(word)
            if word not in stopset and word.isalpha() and len(word)>2:
                noun_lst.append(word)
    return noun_lst

**create index**

In [5]:
def make_idx(tokens, doc_name, idx, length):
    for term in set(tokens):
        idx[term].append([doc_name,tokens.count(term)])
        length[doc_name] = len(set(tokens))

def write(inverted_idx,len_idx):
    inv_idx_file = open("../data/indexes/inverted_idx.json","w")
    json.dump(inverted_idx,inv_idx_file)

    len_idx_file = open("../data/indexes/len_idx.json","w")
    json.dump(len_idx,len_idx_file)
    
def generate_idx():
    resume_files = get_file_names()
    inverted_index = defaultdict(list)
    length_index = defaultdict(list)
    
    for file in resume_files:
        make_idx(tokenize(convert(file)), file, inverted_index, length_index)
        
    write(inverted_index,length_index)
    print ("Indexes generated")
    

In [6]:
generate_idx()

Indexes generated


In [None]:
'''inv_idx_file = open("../data/indexes/inverted_idx.json","r")
inv_indx = json.load(inv_idx_file)
# create dictionary
doc_freq ={}
for key in sorted(inv_indx.keys()):
    doc_freq[key] = sum(Counter(set(doc_id for doc_id, term in inv_indx[key])).values())

dictionary = pd.DataFrame.from_dict(doc_freq,orient='index', columns=['DocFreq'])
dictionary.head()'''

**create retrieval-The BM25 Weighting Scheme**

In [7]:
from math import log

'''
IR Book: 11.4.3
Fomula: 11.33
'''
'''
typical TREC value (Text Retrieval Conference (TREC).)
f1 = 1.2
k2 varies from 0 to 1000
b = 0.75
'''

k1 = 1.2
b = 0.75
k2 = 100
R = 0 # (set it to 0 since no relevancy info is known)



def BM25(doc_len, avg_doc_len, n_doc_w_term, n_total_doc, freq_term_doc, freq_term_query, rel_doc_w_term):
    
    n = n_doc_w_term
    N = n_total_doc
    f = freq_term_doc
    q = freq_term_query
    r = rel_doc_w_term
    
    p1 = ((k2 + 1) * q) / (k2 + q) #Relevance between term and query
    p2 = ((k1 + 1) * f) / (getK(doc_len, avg_doc_len) + f) #Relevance between term and document
    p3 = log((((r + 0.5)/(R-r+0.5)) / ((n - r + 0.5)/(N - n - R + r + 0.5)))+1) # Term Weight
    return p1 * p2 * p3

def getK(doc_len, avg_doc_len):
    return k1 * ((1 - b) + b * (float(doc_len) / float(avg_doc_len)))
    
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [8]:
# get average document length
def get_avg_doc_len(len_idx):
    _length = 0
    for doc in len_idx:
        _length += len_idx[doc]
    return float(_length) / float(len(len_idx))

def search(query):
    inv_idx_file = open("../data/indexes/inverted_idx.json","r")
    inverted_idx = json.load(inv_idx_file)

    len_idx_file = open("../data/indexes/len_idx.json","r")
    len_idx = json.load(len_idx_file)

    scores = defaultdict(list)
    
    query_tokens = query.split()
    for token in query_tokens:
    #for token in query:
        #token = token.lower()
        for tok in clean_token(token):
            if tok in inverted_idx.keys():
                for entry in inverted_idx[tok]:
                    bm25_val = BM25(len_idx[entry[0]],get_avg_doc_len(len_idx),len(inverted_idx[tok]),len(len_idx),entry[1],1,0)
                #scores[entry[0]] = round(10* sigmoid(bm25_val)-5,4)
                    scores[entry[0]] = round(bm25_val,4)
    result = sorted(scores.items(),key=operator.itemgetter(1),reverse=True)
    return result

**w2v approach**

In [9]:
import time
from gensim.models import KeyedVectors
t1 = time.time()
#download link: https://github.com/mmihaltz/word2vec-GoogleNews-vectors
path = 'C:/Users/RayHu/Downloads/google_w2v/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(path, binary=True)
print('-------------------------------------------')
print("Loading word2vec model cost %.3f seconds...\n" % (time.time() - t1))


-------------------------------------------
Loading word2vec model cost 86.375 seconds...



In [10]:
#%%file vectorize.py
import numpy as np
from numpy import linalg as LA

def vectorize(words):
    '''
    transform the doc and query into vectors
    '''
    word_vec = []
    for word in words:
        try:
            vec = w2v_model[word]
            word_vec.append(vec)
        except KeyError:
            # ignore the word if it is not in the w2v vocablary
            pass
    vector = np.mean(word_vec,axis = 0)
    return vector

def cos_sim(vector1, vector2):
    '''
    the fomula to calculte cosine cimilarity 
    '''
    sim = np.dot(vector1, vector2) / (LA.norm(vector1) * LA.norm(vector2))
    
    if np.isnan(np.sum(sim)):
        return 0
    
    return sim

def calc_sim(query):
    '''
    calculate similarity scores between documents and the query
    '''
    query = clean_token(query)
    file_list = get_file_names()
    documents = {}
    
    for i in range(len(file_list)):
        documents[file_list[i]] = tokenize(convert(file_list[i]))
        
    query_vec = vectorize(query)
    results = {}
    
    for name, doc in documents.items():
        doc_vec = vectorize(doc)
        sim_score = cos_sim(query_vec, doc_vec)
        #threshold = 0.5
        if sim_score > 0:
            results[name] = sim_score
            sort_result = sorted(results.items(),key=operator.itemgetter(1),reverse=True)
    return sort_result

In [11]:
def jaccard(query, documents):
    s1 = set(query)
    s2 = set(documents)
    lst1 = s1.intersection(s2)
    lst2 = s1.union(s2)
    jaccard = 1.0 * len(lst1)/len(lst2)
    
    return jaccard

def calc_jac(query):
    query = set(clean_token(query))
    
    file_list = get_file_names()
    documents = {}
    
    for i in range(len(file_list)):
        documents[file_list[i]] = set(tokenize(convert(file_list[i])))
    
    results = {}
    
    for name, doc in documents.items():
        jac = jaccard(query, doc)
        results[name] = round(jac,4)
        sort_result = sorted(results.items(),key=operator.itemgetter(1),reverse=True)
    return sort_result

**create Ranker**

In [16]:
def matching(query):
    #print(query)
    print('-------------------------------------------------------')
    t1 = time.time()
    results_bm25 = search(query)[:3]
    print('bm25 result')
    for result in results_bm25:
        print(result)
    print('The computing cost %.3f seconds\n'% (time.time() - t1))
    print('-------------------------------------------------------')
    t2 = time.time()
    results_cos = calc_sim(query)[:3]
    print('cosine result')
    for result in results_cos:
        print(result)
    print('The computing cost %.3f seconds\n'% (time.time() - t2))   
    print('-------------------------------------------------------')
    t3 = time.time()
    results_jac = calc_jac(query)[:3]
    print('jacarrd result')
    for result in results_jac:
        print(result)
    print('The computing cost %.3f seconds\n'% (time.time() - t3))
    print('-------------------------------------------------------')

**test**

In [26]:
twitter_ds_jd = 'Apply advanced statistical techniques to model user behavior, identify causal impact and attribution, build and benchmark metrics.Write complex data flows using SQL, Spark, Scalding, R and Python scripts.Use data visualization tools (e.g, Tableau, Zeppelin) to share ongoing insights.'
adobe_ml_jd = 'Hands on experience with Java, Python, and/or C++.Work on machine learning models & algorithms, web services, distributed systems, data mining, big data, Hadoop, deep learning, recommendations, and more by developing a Machine Platform at Adobe that would power Adobe Clouds.Apply data mining and machine learning to improve content understanding, computer vision, deep learning, language understanding and content ranking & recommendations.Maintain and optimize machine learning platform, identify new ideas to evolve it, develop new features and benchmark possible solutions.Build machine learning capabilities using technologies such as REST web services, micro-services, Caffe, Tensorflow, Spark, Elastic, AWS, Kafka, Deep Learning, Matlab, R, and more.'

In [35]:
queries=[twitter_ds_jd, adobe_ml_jd]    
for query in queries:
    print(query)
    matching(query)
    continue

Apply advanced statistical techniques to model user behavior, identify causal impact and attribution, build and benchmark metrics.Write complex data flows using SQL, Spark, Scalding, R and Python scripts.Use data visualization tools (e.g, Tableau, Zeppelin) to share ongoing insights.
-------------------------------------------------------
bm25 result
('../data/documents\\ds002.pdf', 1.361)
('../data/documents\\de005.pdf', 1.3028)
('../data/documents\\ds001.pdf', 1.2891)
The computing cost 0.124 seconds

-------------------------------------------------------
cosine result
('../data/documents\\ds002.pdf', 0.80875266)
('../data/documents\\ds001.pdf', 0.80007637)
('../data/documents\\ds005.pdf', 0.797029)
The computing cost 3.852 seconds

-------------------------------------------------------
jacarrd result
('../data/documents\\ds003.pdf', 0.06626506024096386)
('../data/documents\\ds002.pdf', 0.06179775280898876)
('../data/documents\\de001.pdf', 0.058823529411764705)
The computing cost 3

In [36]:
queries=['machine learning','data visualization','insight analysis',
        'big data platform','familiar wth python, sql and r']    
for query in queries:
    print(query)
    matching(query)
    continue

machine learning
-------------------------------------------------------
bm25 result
('../data/documents\\ml002.pdf', 0.8568)
('../data/documents\\ml004.pdf', 0.8517)
('../data/documents\\ml003.pdf', 0.8479)
The computing cost 0.014 seconds

-------------------------------------------------------
cosine result
('../data/documents\\ml005.pdf', 0.6052575)
('../data/documents\\ml003.pdf', 0.577961)
('../data/documents\\ml001.pdf', 0.5635283)
The computing cost 3.776 seconds

-------------------------------------------------------
jacarrd result
('../data/documents\\ml005.pdf', 0.020833333333333332)
('../data/documents\\ml001.pdf', 0.019801980198019802)
('../data/documents\\ml003.pdf', 0.017094017094017096)
The computing cost 3.852 seconds

-------------------------------------------------------
data visualization
-------------------------------------------------------
bm25 result
('../data/documents\\de005.pdf', 1.7485)
('../data/documents\\ds002.pdf', 1.5699)
('../data/documents\\da005.p

## Search

In [60]:
print ("Enter search query")
keywords = input(":: ")
results = matching(keywords)

Enter search query


::  data analysis and data visualization


data analysis and data visualization
-------------------------------------------------------
bm25 result
('../data/documents\\de005.pdf', 1.748544547393187)
('../data/documents\\ds002.pdf', 1.5699147549098849)
('../data/documents\\da005.pdf', 1.0854099291059434)
('../data/documents\\da001.pdf', 1.0571608474905265)
('../data/documents\\ml002.pdf', 1.0245695126827221)
The computing cost 0.011 seconds

-------------------------------------------------------
cosine result
('../data/documents\\ds002.pdf', 0.75765777)
('../data/documents\\da001.pdf', 0.7421836)
('../data/documents\\de004.pdf', 0.74125904)
('../data/documents\\de003.pdf', 0.7226843)
('../data/documents\\ds004.pdf', 0.70230144)
The computing cost 4.812 seconds

-------------------------------------------------------
jacarrd result
('../data/documents\\de003.pdf', 0.022727272727272728)
('../data/documents\\da001.pdf', 0.022058823529411766)
('../data/documents\\ml005.pdf', 0.020618556701030927)
('../data/documents\\ml001.pdf', 0