In [1]:
import gensim
from gensim.models.doc2vec import Doc2Vec
from gensim.models import Word2Vec
from pymongo import MongoClient
import faiss
import numpy as np

In [76]:
def _d2v_field_text(field):
    """Flatten a ``pros``/``cons`` field into one plain string.

    ``field`` may be a single value or a list/tuple/set of values. ``None``
    (as the field itself or as a member) contributes no text.
    """
    if field is None:
        return ""
    if isinstance(field, (list, tuple, set, frozenset)):
        # Join the members themselves. Calling str() on the container would
        # tokenise its repr, where embedded newlines appear as a literal
        # backslash + "n" and end up glued to the following word.
        return " ".join(str(part) for part in field if part is not None)
    return str(field)


def d2vTagData(corpus, names=True):
    """Yield one gensim ``TaggedDocument`` per usable item of ``corpus``.

    Args:
        corpus: Iterable of items to tag.
            With ``names=True`` each item is a name string; items that are
            not ``str`` are skipped.
            With ``names=False`` each item is a mapping with ``'_id'``,
            ``'pros'`` and ``'cons'`` keys; ``'pros'``/``'cons'`` may each be
            a single value or a list/tuple/set of values.
        names: Selects which of the two input shapes above is expected.

    Yields:
        ``gensim.models.doc2vec.TaggedDocument`` whose words come from
        ``gensim.utils.simple_preprocess`` and whose single tag is the
        original name (``names=True``) or the document's ``'_id'``
        (``names=False``).

    Raises:
        KeyError: In ``names=False`` mode, if a document lacks ``'pros'``,
            ``'cons'`` or ``'_id'``.
    """
    if names:
        for name in corpus:
            if not isinstance(name, str):
                continue
            # The untouched name is the tag; only the tokenised copy has its
            # hyphens turned into spaces and is lower-cased.
            text = name.replace("-", " ").lower()
            tokens = gensim.utils.simple_preprocess(text)
            yield gensim.models.doc2vec.TaggedDocument(tokens, [name])
    else:
        for doc in corpus:
            item = _d2v_field_text(doc['pros']) + " " + _d2v_field_text(doc['cons'])
            # Line breaks become spaces so the words on either side of a
            # break are not merged into one token.
            item = item.replace('\r', ' ').replace('\n', ' ')
            tokens = gensim.utils.simple_preprocess(item)
            yield gensim.models.doc2vec.TaggedDocument(tokens, [doc['_id']])

def d2vTagCustom(corpus):
    """Yield a ``TaggedDocument`` for every firm / job-title document.

    Each document in ``corpus`` must carry an ``'_id'`` mapping with
    ``'firm'`` and ``'job_title'`` entries plus ``'pros'`` and ``'cons'``
    sequences of strings. The tag is ``"<firm>|<job_title>"`` (the job title
    is passed through ``str``); the words are the tokenised pros followed by
    the tokenised cons.
    """
    for entry in corpus:
        ident = entry['_id']
        label = "|".join((ident['firm'], str(ident['job_title'])))
        pros_text = " ".join(entry['pros'])
        cons_text = " ".join(entry['cons'])
        words = gensim.utils.simple_preprocess(pros_text + " " + cons_text)
        yield gensim.models.doc2vec.TaggedDocument(words, [label])

In [31]:
def train_embed(data, vector_size=64, epochs=1000, window=1, min_ct=1, trim_len=1, name='d2v_embed'):
    """Train a Doc2Vec model on ``data``, save it to ``name`` and return it.

    Args:
        data: Re-iterable collection of ``TaggedDocument`` objects. It is
            iterated once to build the vocabulary and again for training, so
            a one-shot generator is not suitable.
        vector_size: Dimensionality of the learned vectors.
        epochs: Number of training passes over ``data``.
        window: Context window size passed to ``Doc2Vec``.
        min_ct: ``min_count`` passed to ``Doc2Vec``; words seen fewer times
            are discarded.
        trim_len: Words shorter than this many characters are discarded.
        name: Path the trained model is saved to via ``model.save``.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    def trimrule(word, count, min_count):
        # Discard words that are too short or too rare; otherwise hand the
        # decision back to gensim explicitly instead of returning None.
        if len(word) < trim_len or count < min_count:
            return gensim.utils.RULE_DISCARD
        return gensim.utils.RULE_DEFAULT

    model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_ct, dbow_words=0)
    model.build_vocab(data, trim_rule=trimrule)
    # corpus_count is filled in by build_vocab, so ``data`` does not need to
    # support len().
    model.train(data, epochs=epochs, total_examples=model.corpus_count)
    model.save(name)
    return model

In [4]:
# Connect to the MongoDB server on localhost and select the 'employaid' database.
client = MongoClient("mongodb://localhost:27017/")
db = client.employaid

### Extracting Sample of 10K from main dataset

In [None]:
# Draw 10,000 random reviews, then drop those whose job_title is stored as a
# number. The filter runs after the sample, so fewer than 10,000 may remain.
sample_stage = {'$sample': {'size': 10000}}
text_title_stage = {'$match': {'job_title': {'$not': {'$type': ['number', 'decimal']}}}}
gd_sub = list(db.glassdoor.aggregate([sample_stage, text_title_stage]))
db.gd_sub.insert_many(gd_sub)

### Aggregating pros and cons by Company Name

In [None]:
# One output document per firm, holding the distinct pros and cons values of
# that firm's reviews in the sampled collection.
pipe = [
    {'$group': {
        '_id': '$firm',
        'pros': {'$addToSet': '$pros'},
        'cons': {'$addToSet': '$cons'},
    }},
]
firm_pros_cons = list(db.gd_sub.aggregate(pipe))
db.firm_pros_cons.insert_many(firm_pros_cons)

### Creating Embeddings for Company Names and their Pros & Cons

In [38]:
# The firm names are the _id values of the firm_pros_cons collection.
co_names = [entry['_id'] for entry in db.firm_pros_cons.find({}, {'_id': 1})]
corpus = list(d2vTagData(co_names, names=True))

# Show the first few tagged documents before training.
print(corpus[:10])
firm_name_model = train_embed(
    corpus,
    vector_size=64,
    epochs=3000,
    name='firm_name',
)



[TaggedDocument(words=['hampshire', 'county', 'council'], tags=['Hampshire-County-Council']), TaggedDocument(words=['met', 'office'], tags=['Met-Office']), TaggedDocument(words=['ibm'], tags=['IBM']), TaggedDocument(words=['boots', 'opticians'], tags=['Boots-Opticians']), TaggedDocument(words=['university', 'of', 'liverpool'], tags=['University-of-Liverpool']), TaggedDocument(words=['brewin', 'dolphin'], tags=['Brewin-Dolphin']), TaggedDocument(words=['student', 'loans', 'company'], tags=['Student-Loans-Company']), TaggedDocument(words=['the', 'department', 'for', 'education', 'uk'], tags=['The-Department-for-Education-UK']), TaggedDocument(words=['east', 'sussex', 'county', 'council'], tags=['East-Sussex-County-Council']), TaggedDocument(words=['nhs', 'professionals'], tags=['NHS-Professionals'])]


In [45]:
# Store each firm-name vector on the matching firm document. Tags in the
# model are the firm names, which are also the documents' _id values.
for key in firm_name_model.dv.index_to_key:
    # `update_one` is the PyMongo call used by every other cell here;
    # `db.firm_pros_cons._one` resolves to a sub-collection and is not callable.
    db.firm_pros_cons.update_one({'_id': key}, {'$set': {'title_embed': firm_name_model.dv[key].tolist()}})

### Extracting Ratings from the dataset and storing them as a vector in the sub-dataset

In [49]:
# Per firm: number of reviews plus the mean of each rating column.
pipe = [{"$group": {
    "_id": "$firm",
    "count": {"$sum": 1},
    "avg_overall": {"$avg": "$overall_rating"},
    "avg_worklife": {"$avg": "$work_life_balance"},
    "avg_culture": {"$avg": "$culture_values"},
    "avg_career": {"$avg": "$career_opp"},
    "avg_comp": {"$avg": "$comp_benefits"},
    "avg_mgmt": {"$avg": "$senior_mgmt"},
}}]
co_ratings = list(db.gd_sub.aggregate(pipe))

# Order of the averages inside the stored vector (after the review count).
avg_fields = ('avg_overall', 'avg_worklife', 'avg_culture',
              'avg_career', 'avg_comp', 'avg_mgmt')

for firm_stats in co_ratings:
    vect = [int(firm_stats['count'])]
    for field in avg_fields:
        value = firm_stats[field]
        # A missing average is stored as -1.
        vect.append(-1 if value is None else float(value))
    db.firm_pros_cons.update_one({'_id': firm_stats['_id']}, {'$set': {'rating_embed': vect}})


### Create Embeddings for Firm Pros & Cons and store them in sub-dataset

In [51]:
# Reload the per-firm documents and tag each one with its _id.
firm_pros_cons = [*db.firm_pros_cons.find()]
corpus = [*d2vTagData(firm_pros_cons, names=False)]

firm_procon_model = train_embed(
    corpus,
    vector_size=128,
    epochs=3000,
    name='firm_pros_cons',
)


In [52]:
# Write each firm's pros/cons vector back onto its document.
procon_vectors = firm_procon_model.dv
for doc_id in procon_vectors.index_to_key:
    embedding = procon_vectors[doc_id].tolist()
    db.firm_pros_cons.update_one({'_id': doc_id}, {'$set': {'procon_embed': embedding}})

### Aggregating Pros and Cons by Job Title

In [None]:
# One output document per job title, holding the distinct pros and cons
# values of the reviews carrying that title.
pipe = [
    {'$group': {
        '_id': '$job_title',
        'pros': {'$addToSet': '$pros'},
        'cons': {'$addToSet': '$cons'},
    }},
]
title_pros_cons = list(db.gd_sub.aggregate(pipe))
db.title_pros_cons.insert_many(title_pros_cons)

### Create Embeddings for Job Title and store in dataset

In [55]:
# The job titles are the _id values of the title_pros_cons collection.
titles = [entry['_id'] for entry in db.title_pros_cons.find({}, {'_id': 1})]
corpus = list(d2vTagData(titles, names=True))

title_model = train_embed(
    corpus,
    vector_size=64,
    epochs=3000,
    name='job_title',
)



In [56]:
# Write each job-title vector back onto the document whose _id is that title.
title_vectors = title_model.dv
for job_title in title_vectors.index_to_key:
    embedding = title_vectors[job_title].tolist()
    db.title_pros_cons.update_one({'_id': job_title}, {'$set': {'title_embed': embedding}})
    

### Create Embeddings for Job Title pros and cons and store in dataset

In [58]:
# Reload the per-title documents and tag each one with its _id.
title_pros_cons = [*db.title_pros_cons.find()]
corpus = [*d2vTagData(title_pros_cons, names=False)]

title_pros_cons_model = train_embed(
    corpus,
    vector_size=128,
    epochs=3000,
    name='title_pros_cons',
)

In [59]:
# Write each title's pros/cons vector back onto its document.
title_procon_vectors = title_pros_cons_model.dv
for doc_id in title_procon_vectors.index_to_key:
    embedding = title_procon_vectors[doc_id].tolist()
    db.title_pros_cons.update_one({'_id': doc_id}, {'$set': {'procon_embed': embedding}})

### Aggregating Pros and Cons by Firm and Job Title and storing them in the sub-dataset

In [None]:
# One output document per (firm, job title) pair: the review count plus the
# distinct pros and cons values for that pair.
pipe = [
    {"$group": {
        "_id": {"firm": "$firm", "job_title": "$job_title"},
        "count": {"$sum": 1},
        "pros": {"$addToSet": "$pros"},
        "cons": {"$addToSet": "$cons"},
    }},
]
firm_job = list(db.gd_sub.aggregate(pipe))
db.firm_job.insert_many(firm_job)

### Create Embeddings for Job Title by Firm pros and cons and store in dataset

In [77]:
# Load the (firm, job title) documents and tag each as "<firm>|<job_title>".
firm_job_data = [*db.firm_job.find()]
corpus = [*d2vTagCustom(firm_job_data)]
firm_job_model = train_embed(
    corpus,
    vector_size=128,
    epochs=3000,
    name='firm_job_model',
)

In [78]:

# Write each firm/job-title vector back onto its firm_job document.
for key in firm_job_model.dv.index_to_key:
    # Tags were built as "<firm>|<job_title>". Split on the FIRST "|" only so
    # a job title that itself contains "|" is kept whole.
    # NOTE(review): assumes firm names never contain "|" — confirm.
    # NOTE(review): job_title was passed through str() when the tag was built,
    # so a non-string title will not match its stored _id here.
    firm, sep, job_title = key.partition('|')
    make_id = {'firm': firm, 'job_title': job_title}
    db.firm_job.update_one({'_id': make_id}, {'$set': {'total_embed': firm_job_model.dv[key].tolist()}})