In [12]:
# Input/output locations for the keyword-synonym run.
# Every file lives in the same directory, so the directory is named once;
# change DATA_DIR to relocate the whole notebook.
DATA_DIR = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk"

# top 5k dice keywords (read one keyword per line)
KEY_WORDS_FILE = "%s/top_5k_keywords.txt" % DATA_DIR
# Maximum number of similar terms kept per keyword; also embedded in the
# name of the synonyms output file below.
TOPN = 30
# Output: per-keyword list of similar terms with their similarity scores
SYNONYMS_QRY_FILE = "%s/top%i_keyword_synonyms.txt" % (DATA_DIR, TOPN)
# Output: mapping of each term to its underscore-joined form
SYNONYMS_INDEX_FILE = "%s/keywords.txt" % DATA_DIR
# Input: phrases used to filter candidate similar terms
PHRASES_FILE = "%s/Phrases.txt" % DATA_DIR
# Input: saved word2vec model
MODEL_FILE = "%s/keyword_model.w2v" % DATA_DIR

In [13]:
#Shared
#just used to load phrases file
def load_stop_words(stop_words_file):
    """Read a one-entry-per-line word list into a set of lower-cased strings.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    whose first non-blank character is ``#`` are ignored.

    :param stop_words_file: path of the text file to read
    :return: set of the lower-cased, stripped entries
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # A blank line strips to "", which has no first character to
            # inspect; skip it instead of indexing into it.
            if not word or word.startswith("#"):
                continue
            stop_words.add(word.lower())
    return stop_words

In [14]:
#functions
def is_valid_search_keyword(kw):
    """Decide whether a raw search keyword should be kept.

    A keyword is rejected when:
      * it contains a parenthesis, or one of the words and / or / not /
        true / TRUE / false / FALSE as a space-delimited token;
      * it is a single character that is not alphabetic;
      * any of its space-separated tokens starts with "-" (a negation).

    A single alphabetic character is accepted.

    :param kw: the keyword string
    :return: True if the keyword is acceptable, False otherwise
    """
    # Pad with spaces so operator words at either end still match " word ".
    padded = " " + kw + " "
    banned = "(,), and , or , not , true , TRUE , false , FALSE ".split(",")
    if any(fragment in padded for fragment in banned):
        return False

    parts = kw.split(" ")

    # A one-character keyword is only kept when it is a letter.
    if len(parts) == 1 and len(parts[0]) == 1:
        return parts[0].isalpha()

    # Drop queries that contain a negated term.
    for part in parts:
        if part.strip().startswith("-"):
            return False
    return True

def map_keyword(kw):
    """Return ``kw`` with every space character replaced by an underscore."""
    return "_".join(kw.split(" "))

def write_most_similar_synonyms(topn, key_words, phrases, model, expand_fname, map_fname):
    """Write the most similar phrases for each keyword, plus a term mapping file.

    For every distinct keyword found in ``model.vocab`` the model is asked for
    ``topn * 10`` nearest terms. Only those that are in ``phrases`` and have a
    similarity above 0.01 are kept, stopping once ``topn`` have been collected.

    Two files are (re)written:
      * ``expand_fname`` - one line per keyword with at least one kept term:
        ``keyword=>term_1|score term_2|score ...`` where each term has been
        passed through ``map_keyword``.
      * ``map_fname`` - one ``term=>mapped_term`` line, in sorted order, for
        every keyword and similar term that appeared in the first file.

    :param topn: maximum number of similar terms kept per keyword
    :param key_words: iterable of keywords (duplicates are collapsed)
    :param phrases: container of terms that are allowed as similar terms
    :param model: object exposing ``vocab`` and ``most_similar(positive=, topn=)``
    :param expand_fname: path of the per-keyword similar-terms file
    :param map_fname: path of the term mapping file
    :return: tuple ``(all_syns, missing, no_sim)`` - all terms written, keywords
             not in the model vocabulary, and keywords with no kept term
    """
    missing, no_sim, all_syns = set(), set(), set()

    with open(expand_fname, "w+") as expansion_out:
        for term in set(key_words):
            if term not in model.vocab:
                missing.add(term)
                continue

            # Over-fetch (10x) because many neighbours are filtered out below.
            accepted = []
            for candidate, score in model.most_similar(positive=term, topn=topn * 10):
                if not (candidate in phrases and score > 0.01):
                    continue
                accepted.append((candidate, score))
                if len(accepted) >= topn:
                    break

            if not accepted:
                no_sim.add(term)
                print("No matching similar terms in word2vec model for term: %s" % term)
                continue

            all_syns.add(term)
            all_syns.update(candidate for candidate, _ in accepted)
            entries = ["%s|%f " % (map_keyword(candidate), score)
                       for candidate, score in accepted]
            expansion_out.write(("%s=>" % term) + "".join(entries) + "\n")

    with open(map_fname, "w+") as mapping_out:
        mapping_out.writelines("%s=>%s\n" % (term, map_keyword(term))
                               for term in sorted(all_syns))

    return all_syns, missing, no_sim

In [15]:
import gensim, time
from gensim.models.word2vec import Word2Vec

# Load the saved word2vec model from MODEL_FILE.
# NOTE(review): gensim's load is pickle-based as far as I know - only point
# MODEL_FILE at a model file from a trusted source.
model = Word2Vec.load(MODEL_FILE)

In [16]:
# The phrases file is read with the stop-word loader: the result is a set of
# lower-cased lines with "#" comment lines left out.
phrases = load_stop_words(PHRASES_FILE)
# Display how many distinct phrases were loaded.
len(phrases)

24785

In [17]:
# Load the search keywords, one per line, in file order. Blank lines and
# keywords rejected by is_valid_search_keyword are dropped.
un_keywords = set()
keywords = []
with open(KEY_WORDS_FILE) as kw_file:
    for raw_line in kw_file:
        candidate = raw_line.strip()
        if not candidate:
            continue
        if is_valid_search_keyword(candidate):
            keywords.append(candidate)
print("%i keywords loaded from %s" % (len(keywords), KEY_WORDS_FILE))

4713 keywords loaded from /Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt


In [18]:
# Generate both output files: per-keyword similar terms go to SYNONYMS_QRY_FILE
# and the term=>underscored-term mapping goes to SYNONYMS_INDEX_FILE.
# Returns the set of all terms written, the keywords absent from the model
# vocabulary, and the keywords for which no similar term passed the filter.
all_syns, missing, no_sim = write_most_similar_synonyms(TOPN, keywords, phrases, model, SYNONYMS_QRY_FILE, SYNONYMS_INDEX_FILE)

No matching similar terms in word2vec model for term: idc technologies


In [19]:
# Summary counts: keywords not in the model vocabulary, keywords with no
# accepted similar term, and total keywords loaded.
# Formatted with % (like the other print calls in this notebook) so the cell
# emits the same "a b c" text under both Python 2 and Python 3.
print("%i %i %i" % (len(missing), len(no_sim), len(keywords)))

# Pair every keyword that produced no synonyms with its position in
# `keywords` (i.e. its line order in the keywords file after filtering).
ranked_missing = []
for i, k in enumerate(keywords):
    if k in missing or k in no_sim:
        ranked_missing.append((i, k))

1552 1 4713


In [20]:
# Show the first 100 unmatched keywords ordered by their index in `keywords`.
# ranked_missing was appended to in increasing index order, so the sort is a
# safeguard rather than a reordering.
sorted(ranked_missing)[0:100]

[(68, 'etl tester'),
 (188, 'websphere admin'),
 (193, 'work from home'),
 (230, 'business analyst healthcare'),
 (246, 'qa manual tester'),
 (263, 'ruby on rails'),
 (264, 'java lead'),
 (270, 'sql server administrator'),
 (277, 'java remote'),
 (282, 'hadoop admin'),
 (311, 'oracle apps technical'),
 (316, 'sap tester'),
 (319, 'selenium tester'),
 (331, 'teradata developer'),
 (347, 'oracle soa developer'),
 (351, 'manual qa tester'),
 (357, 'weblogic administrator'),
 (399, 'dot net developer'),
 (401, 'director of it'),
 (427, 'junior business analyst'),
 (430, 'etl qa'),
 (436, 'mobile tester'),
 (453, 'soa developer'),
 (455, 'qa manual'),
 (458, 'qa selenium'),
 (463, 'sap manager'),
 (467, 'sas developer'),
 (469, 'informatica administrator'),
 (481, 'service now'),
 (494, 'sr. java developer'),
 (496, 'informatica admin'),
 (500, 'qa director'),
 (501, 'peoplesoft administrator'),
 (507, 'windows admin'),
 (517, 'sybase dba'),
 (535, 'rf engineer'),
 (548, 'oracle remote'),
 