In [1]:
import os, math
import pandas as pd
import numpy as np

In [2]:
from gensim.models import Word2Vec as w2v
from os import path as osp

root = osp.dirname(os.getcwd())
model = w2v.load(osp.join(root,'Model/stackexchange_model'))

In [3]:
def get_closest(word, n):
    #'''Get n most similar words by words.'''
    #This function can easily be expanded to get similar words to phrases--
    #using sent2vec() method defined in WithWord2Vec notebook. 
    word = word.lower()
    words = [word]
    similar_vals = [1]
    try:
        similar_list = model.wv.most_similar(positive=[word],topn=n)
        
        for tupl in similar_list:
            words.append(tupl[0])
            similar_vals.append(tupl[1])
    except:
        #If word not in vocabulary return same word and 1 similarity-- 
        #see initialisation of words, similarities.
        pass
    
    return words, similar_vals

In [4]:
prc_description = '''We are looking for a data scientist that will help us discover the information hidden in vast amounts of data, and help us make smarter decisions to deliver even better products. Your primary focus will be in applying data mining techniques, doing statistical analysis, and building high quality prediction systems integrated with our products.
Data mining using state-of-the-art methods
Extending company’s data with third party sources of information when needed
Enhancing data collection procedures to include information that is relevant for building analytic systems
Processing, cleansing, and verifying the integrity of data used for analysis
Doing ad-hoc analysis and presenting results in a clear manner
Creating automated anomaly detection systems and constant tracking of its performance'''

In [5]:
cvs = pd.read_csv('prc_data.csv', sep='\t')
cvs = cvs.set_index('Unnamed: 0')

In [6]:
word_value = {}
similar_words_needed = 2
for word in prc_description.split():
    similar_words, similarity = get_closest(word, similar_words_needed)
    for i in range(len(similar_words)):
        word_value[similar_words[i]] = word_value.get(similar_words[i], 0)+similarity[i]
        print(similar_words[i], word_value[similar_words[i]])

expand 1
expanding 0.6116359829902649
broaden 0.6102748513221741
existing 1
modifying 0.41968947649002075
alternate 0.41429176926612854
software 1
hardware 0.6065079569816589
tool 0.5780055522918701
to 1
and 0.4752591848373413
rythm 0.403042197227478
meet 1
meets 0.6222466826438904
satisfy 0.6012479066848755
the 1
a 0.5007174611091614
this 0.46612972021102905
changing 1
change 0.6755122542381287
switching 0.6297242045402527
needs 1
ought 0.6154512763023376
must 0.6089561581611633
of 1
thereof 0.46066388487815857
heightened 0.34523314237594604
our 1
ourselves 0.48375359177589417
my 0.43409448862075806
key 1
crucial 0.5341261625289917
main 0.5009962916374207
demographics 1
demographic 0.6179572343826294
population 0.5880870819091797
develop 1
acquire 0.682175874710083
build 0.6763089299201965
a 1.5007174611091614
the 1.5007174611091614
another 0.4662064015865326
new 1
newer 0.4527139961719513
fresh 0.39954105019569397
software 2
hardware 1.2130159139633179
tool 1.1560111045837402
product

In [7]:
no_of_cv = 50

count = {}
idf = {}
for word in word_value.keys():
    count[word] = 0
    for i in range(no_of_cv):
        try:
            if word in cvs.loc(0)['skill'][i].split() or word in cvs.loc(0)['exp'][i].split():
                count[word] += 1
        except:
            pass
    if (count[word] == 0):
        count[word] = 1
    idf[word] = math.log(no_of_cv/count[word])
#print(count)
#print(idf)idf
#print(idf[word])

In [8]:
score = {}
for i in range(no_of_cv):
    score[i] = 0
    try:
        for word in word_value.keys():
            tf = cvs.loc(0)['skill'][i].split().count(word) + cvs.loc(0)['exp'][i].split().count(word)
            score[i] += word_value[word]*tf*idf[word]
    except:
        pass

In [9]:
sorted_list = []
for i in range(no_of_cv):
    sorted_list.append((score[i], i))
    
sorted_list.sort(reverse = True)

for s, i in sorted_list:
    if list(cvs)[i] != '.DS_Store':
        print(list(cvs)[i], ':', s)

cv122 : 361.08770786621034
cv205 : 192.21739675185094
cv181 : 176.88872728408214
cv144 : 170.68662215533075
cv55 : 80.90065206536158
cv112 : 70.1301231421017
cv4 : 63.685244241067274
cv139 : 55.571801640450786
cv1 : 42.46495517965279
cv27 : 38.04741887243555
cv163 : 37.82316168702888
cv213 : 33.57174893435123
cv226 : 28.781067333598898
cv2 : 28.53277364093906
cv170 : 23.313097913871
cv45 : 23.000787913576687
cv221 : 22.46600598174771
cv156 : 17.34478425358107
cv248 : 16.167399416863333
cv34 : 16.058232074902495
cv223 : 13.047521169148748
cv155 : 11.359791024372974
cv132 : 10.4803850751023
cv171 : 9.384418248823742
cv6 : 9.286024424820507
cv157 : 8.82898340354683
cv35 : 8.40231431753937
cv14 : 7.146187780251234
cv183 : 6.664390732903593
cv234 : 6.601445470649894
cv40 : 5.729960611923361
cv230 : 5.177671800945734
cv31 : 5.150668843106291
cv17 : 4.857141389481439
cv16 : 4.034952985697392
cv161 : 3.7731936717103256
cv38 : 3.597835642328777
cv41 : 3.597835642328777
cv36 : 3.597835642328777
