In [1]:
import pandas as pd
import operator
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

In [2]:
# Read the job-offer table from disk; the trailing expression displays its
# (rows, columns) shape as the cell output.
df = pd.read_csv(filepath_or_buffer='data/job_ofer.csv')
df.shape

(36109, 8)

## Basic model

In [3]:
# Tokenise every title with gensim's simple_preprocess; the result is a
# Series holding one token list per row of `df`.
title_corpus = df['title'].map(simple_preprocess)

In [4]:
def make_model(corpus, **kwargs):
    """Train a Doc2Vec model on a corpus of tokenised documents.

    Every document is tagged with its 0-based position in ``corpus``, so the
    learned document vectors are addressed by position rather than by any
    index labels the corpus container may carry.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenised documents, one token list per document.
    **kwargs
        Forwarded unchanged to the ``Doc2Vec`` constructor. If ``epochs`` is
        given it also controls the number of training passes; otherwise
        training runs for 5 passes.

    Returns
    -------
    Doc2Vec
        The trained model.
    """
    # Materialise the tagged corpus once: it is iterated by build_vocab and
    # again on every training pass.
    corpus_tagged = [TaggedDocument(words, [tag]) for tag, words in enumerate(corpus)]

    # Training used to run a fixed 5 passes even when the caller asked for a
    # different ``epochs``; honour the request and keep 5 as the fallback.
    n_epochs = kwargs.get('epochs', 5)

    model = Doc2Vec(**kwargs)
    model.build_vocab(corpus_tagged)
    model.train(corpus_tagged, total_examples=model.corpus_count, epochs=n_epochs)
    return model

In [5]:
# Doc2Vec's context-window keyword is `window`; `window_size` is not a
# recognised parameter, so the intended setting never reached the model.
title_model = make_model(title_corpus, vector_size=300, window=5, min_count=1)

In [6]:
def print_with_similar(df, model, i):
    """Print the title of document ``i`` followed by its nearest neighbours.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column whose row order matches the order of
        the corpus the model was trained on.
    model : Doc2Vec
        Trained model whose document tags are 0-based corpus positions.
    i : int
        Position (tag) of the query document.

    Returns
    -------
    None
        The selected titles are printed as a Series.
    """
    # Newer gensim exposes document vectors as `dv`; older releases only have
    # `docvecs`. Use whichever the model provides.
    doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
    neighbours = doc_vectors.most_similar(i)
    positions = [i] + [tag for tag, _ in neighbours]
    # Tags are positions, so select rows positionally. Label-based `.loc`
    # only agrees with this when the frame has the default 0..n-1 index.
    print(df['title'].iloc[positions])

In [7]:
# Show the first title together with the titles the basic model ranks closest.
print_with_similar(df, model=title_model, i=0)

0                               Machine Learning Engineer
27698                       Machine Operator - IL06509141
207                    Machine Learning Research Position
317                           Machine Learning Researcher
318                    Machine Learning Senior Researcher
31954                                    Machine Operator
33091                     Machine Learning Cloud Engineer
320          Data Science and Machine Learning Researcher
209                    Machine Learning Research Position
4910     Seasonal 2nd Shift Machine Operator - Poland, ME
31660                                    Machine Operator
Name: title, dtype: object


## Model with phrases

In [8]:
# Detect two-word phrases in the titles (min_count=1, threshold=1) and wrap
# the detector in a Phraser for applying it to sentences.
title_bigram = Phraser(
    Phrases(
        title_corpus,
        min_count=1,
        threshold=1,
    )
)

In [9]:
def prepare_corpus(corpus, bigram):
    """Lazily yield each sentence extended with its phrase-merged form.

    For every token list in ``corpus`` the generator produces
    ``bigram[tokens] + tokens``: the phrase-transformed tokens first,
    followed by the original tokens.
    """
    yield from (bigram[tokens] + tokens for tokens in corpus)

In [10]:
# Materialise the generator: the corpus is iterated more than once downstream.
ext_corpus_title = [*prepare_corpus(title_corpus, title_bigram)]

In [11]:
# Doc2Vec's context-window keyword is `window`; `window_size` is not a
# recognised parameter, so the intended setting never reached the model.
title_ext_model = make_model(ext_corpus_title, vector_size=300, window=5, min_count=1)

In [12]:
# Same query as before, now against the model trained on the phrase-extended corpus.
print_with_similar(df, model=title_ext_model, i=0)

0                                Machine Learning Engineer
7139     Senior Machine Learning Engineer (Relocate to ...
7243     Senior Machine Learning Engineer (Relocate to ...
209                     Machine Learning Research Position
234                                       Researcher, U.S.
7452     Senior Machine Learning Engineer (Relocate to ...
10128                 Supplier Engineer, China Integration
25426                     Senior Machine Learning Engineer
35431                     Copy Editor => Relocate to China
13654    Beijing/Shanghai/Shenzhen UP TO 30k/M => Reloc...
31929            Editorial Internship => Relocate to China
Name: title, dtype: object
