In [1]:
import os
import re
import pandas as pd
from time import time
from collections import defaultdict
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
import matplotlib.pyplot as plt
import spacy

In [2]:
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
#find current working directory
directory = os.getcwd()

#specifying csv filename
#filename = "\\job_title.csv"
filename = "\\titles_final.csv"

#using concat to generate fullpath
file = directory+filename

#load csv file containing job titles
df = pd.read_csv(file)

In [4]:
#a little data refactoring step

df.isnull().sum()
df.dropna()
# df = df.drop_duplicates(keep='first', inplace=False, ignore_index=False) #this could be optional

Unnamed: 0,Titles
0,Senior Product Manager
1,Solutions Engineer
2,Staff Software Engineer
3,Head of Product (Platform)
4,Incubation Lead Success Cloud
...,...
298391,IT Analyst
298392,Concertmaster
298393,Technical Lead
298394,Sales Associate


In [5]:
#continued

filter_lowercase = df['Titles'].str.lower()

#removing punctuations and extra spacing between words

#df["Titles"] = filter_lowercase.str.replace('[^\w\s]', '')
df["Titles"] = filter_lowercase.str.replace('\s+', ' ', regex=True)

In [6]:
df['Titles'].head 

<bound method NDFrame.head of 0                                    senior product manager
1                                        solutions engineer
2                                   staff software engineer
3                                head of product (platform)
4                             incubation lead success cloud
                                ...                        
298391                                           it analyst
298392                                        concertmaster
298393                                       technical lead
298394                                      sales associate
298395    chair at model united nations at the liberal a...
Name: Titles, Length: 298396, dtype: object>

In [7]:
#converting each element into it's own list - doing this because we need individual job post as its own element

def listoflists(inputlist):
    return [[el] for el in inputlist]

nestedlist = listoflists(df['Titles'])

#reinitializing the dataset again

df = pd.DataFrame({'Titles':nestedlist})
df.head

<bound method NDFrame.head of                                                    Titles
0                                [senior product manager]
1                                    [solutions engineer]
2                               [staff software engineer]
3                            [head of product (platform)]
4                         [incubation lead success cloud]
...                                                   ...
298391                                       [it analyst]
298392                                    [concertmaster]
298393                                   [technical lead]
298394                                  [sales associate]
298395  [chair at model united nations at the liberal ...

[298396 rows x 1 columns]>

In [8]:
#can we sort the dataframe?

df.sort_values(by=['Titles'], ascending=True, inplace=True)


In [9]:
#Phrases, Phraser, bigram

from gensim.models.phrases import Phrases, Phraser

In [10]:
phrases = Phrases(df["Titles"], min_count=2, progress_per=10000)

INFO - 21:50:48: collecting all words and their counts
INFO - 21:50:48: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 21:50:48: PROGRESS: at sentence #10000, processed 10000 words and 4223 word types
INFO - 21:50:48: PROGRESS: at sentence #20000, processed 20000 words and 8966 word types
INFO - 21:50:48: PROGRESS: at sentence #30000, processed 30000 words and 14581 word types
INFO - 21:50:48: PROGRESS: at sentence #40000, processed 40000 words and 18938 word types
INFO - 21:50:48: PROGRESS: at sentence #50000, processed 50000 words and 23796 word types
INFO - 21:50:48: PROGRESS: at sentence #60000, processed 60000 words and 29120 word types
INFO - 21:50:48: PROGRESS: at sentence #70000, processed 70000 words and 33976 word types
INFO - 21:50:48: PROGRESS: at sentence #80000, processed 80000 words and 39233 word types
INFO - 21:50:48: PROGRESS: at sentence #90000, processed 90000 words and 43985 word types
INFO - 21:50:48: PROGRESS: at sentence #100000, processed 1

In [11]:
print(phrases)

Phrases<119165 vocab, min_count=2, threshold=10.0, max_vocab_size=40000000>


In [12]:
bigram = Phraser(phrases)

INFO - 21:50:48: exporting phrases from Phrases<119165 vocab, min_count=2, threshold=10.0, max_vocab_size=40000000>
INFO - 21:50:48: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<0 phrases, min_count=2, threshold=10.0> from Phrases<119165 vocab, min_count=2, threshold=10.0, max_vocab_size=40000000> in 0.04s', 'datetime': '2021-11-14T21:50:48.980339', 'gensim': '4.0.1', 'python': '3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [13]:
sentences = bigram[df["Titles"]]

In [14]:
#building the model

import multiprocessing
from gensim.models import Word2Vec
from gensim.models import Doc2Vec

In [15]:
#parameters for the model
model = Word2Vec(min_count=1, 
                 window=20, 
                 vector_size=250, 
                 sample=0, 
                 sg=0, 
                 alpha=0.03, 
                 min_alpha=0.007, 
                 negative=5, 
                 workers=4)

INFO - 21:50:49: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=250, alpha=0.03)', 'datetime': '2021-11-14T21:50:49.020341', 'gensim': '4.0.1', 'python': '3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [16]:
#building vocabulary from dataset

t=time()
model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 21:50:49: collecting all words and their counts
INFO - 21:50:49: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 21:50:49: PROGRESS: at sentence #10000, processed 10000 words, keeping 4223 word types
INFO - 21:50:49: PROGRESS: at sentence #20000, processed 20000 words, keeping 8966 word types
INFO - 21:50:49: PROGRESS: at sentence #30000, processed 30000 words, keeping 14581 word types
INFO - 21:50:49: PROGRESS: at sentence #40000, processed 40000 words, keeping 18938 word types
INFO - 21:50:49: PROGRESS: at sentence #50000, processed 50000 words, keeping 23796 word types
INFO - 21:50:49: PROGRESS: at sentence #60000, processed 60000 words, keeping 29120 word types
INFO - 21:50:49: PROGRESS: at sentence #70000, processed 70000 words, keeping 33976 word types
INFO - 21:50:49: PROGRESS: at sentence #80000, processed 80000 words, keeping 39233 word types
INFO - 21:50:49: PROGRESS: at sentence #90000, processed 90000 words, keeping 43985 word types
INFO - 21

Time to build vocab: 0.05 mins


In [17]:
model.corpus_count

298396

In [18]:
#train the model

t = time()
model.train(sentences, total_examples=model.corpus_count, epochs=100)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 21:50:51: Word2Vec lifecycle event {'msg': 'training model with 4 workers on 119165 vocabulary and 250 features, using sg=0 hs=0 sample=0 negative=5 window=20', 'datetime': '2021-11-14T21:50:51.892344', 'gensim': '4.0.1', 'python': '3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 21:50:53: EPOCH 1 - PROGRESS: at 63.67% examples, 189497 words/s, in_qsize 5, out_qsize 0
INFO - 21:50:53: worker thread finished; awaiting finish of 3 more threads
INFO - 21:50:53: worker thread finished; awaiting finish of 2 more threads
INFO - 21:50:53: worker thread finished; awaiting finish of 1 more threads
INFO - 21:50:53: worker thread finished; awaiting finish of 0 more threads
INFO - 21:50:53: EPOCH - 1 : training on 298396 raw words (298396 effective words) took 1.2s, 251453 effective words/s
INFO - 21:50:54: EPOCH 2 - PROGRESS: at 67.03% examples, 193337 words/s, in_qsize 7, out_qsize 0
I

INFO - 21:51:09: EPOCH 16 - PROGRESS: at 93.84% examples, 278238 words/s, in_qsize 2, out_qsize 1
INFO - 21:51:09: worker thread finished; awaiting finish of 2 more threads
INFO - 21:51:09: worker thread finished; awaiting finish of 1 more threads
INFO - 21:51:09: worker thread finished; awaiting finish of 0 more threads
INFO - 21:51:09: EPOCH - 16 : training on 298396 raw words (298396 effective words) took 1.0s, 290968 effective words/s
INFO - 21:51:11: EPOCH 17 - PROGRESS: at 60.32% examples, 176680 words/s, in_qsize 5, out_qsize 2
INFO - 21:51:11: worker thread finished; awaiting finish of 3 more threads
INFO - 21:51:11: worker thread finished; awaiting finish of 2 more threads
INFO - 21:51:11: worker thread finished; awaiting finish of 1 more threads
INFO - 21:51:11: worker thread finished; awaiting finish of 0 more threads
INFO - 21:51:11: EPOCH - 17 : training on 298396 raw words (298396 effective words) took 1.1s, 260435 effective words/s
INFO - 21:51:12: EPOCH 18 - PROGRESS: a

INFO - 21:51:27: EPOCH 32 - PROGRESS: at 70.38% examples, 209417 words/s, in_qsize 7, out_qsize 0
INFO - 21:51:28: worker thread finished; awaiting finish of 3 more threads
INFO - 21:51:28: worker thread finished; awaiting finish of 2 more threads
INFO - 21:51:28: worker thread finished; awaiting finish of 1 more threads
INFO - 21:51:28: worker thread finished; awaiting finish of 0 more threads
INFO - 21:51:28: EPOCH - 32 : training on 298396 raw words (298396 effective words) took 1.1s, 272627 effective words/s
INFO - 21:51:29: EPOCH 33 - PROGRESS: at 77.08% examples, 228868 words/s, in_qsize 7, out_qsize 0
INFO - 21:51:29: worker thread finished; awaiting finish of 3 more threads
INFO - 21:51:29: worker thread finished; awaiting finish of 2 more threads
INFO - 21:51:29: worker thread finished; awaiting finish of 1 more threads
INFO - 21:51:29: worker thread finished; awaiting finish of 0 more threads
INFO - 21:51:29: EPOCH - 33 : training on 298396 raw words (298396 effective words) 

INFO - 21:51:47: EPOCH 48 - PROGRESS: at 50.27% examples, 148190 words/s, in_qsize 8, out_qsize 0
INFO - 21:51:47: worker thread finished; awaiting finish of 3 more threads
INFO - 21:51:47: worker thread finished; awaiting finish of 2 more threads
INFO - 21:51:47: worker thread finished; awaiting finish of 1 more threads
INFO - 21:51:47: worker thread finished; awaiting finish of 0 more threads
INFO - 21:51:47: EPOCH - 48 : training on 298396 raw words (298396 effective words) took 1.3s, 234628 effective words/s
INFO - 21:51:48: EPOCH 49 - PROGRESS: at 46.92% examples, 135958 words/s, in_qsize 6, out_qsize 1
INFO - 21:51:48: worker thread finished; awaiting finish of 3 more threads
INFO - 21:51:48: worker thread finished; awaiting finish of 2 more threads
INFO - 21:51:48: worker thread finished; awaiting finish of 1 more threads
INFO - 21:51:48: worker thread finished; awaiting finish of 0 more threads
INFO - 21:51:48: EPOCH - 49 : training on 298396 raw words (298396 effective words) 

INFO - 21:52:06: EPOCH 64 - PROGRESS: at 63.67% examples, 185762 words/s, in_qsize 5, out_qsize 2
INFO - 21:52:06: worker thread finished; awaiting finish of 3 more threads
INFO - 21:52:06: worker thread finished; awaiting finish of 2 more threads
INFO - 21:52:06: worker thread finished; awaiting finish of 1 more threads
INFO - 21:52:06: worker thread finished; awaiting finish of 0 more threads
INFO - 21:52:06: EPOCH - 64 : training on 298396 raw words (298396 effective words) took 1.1s, 268519 effective words/s
INFO - 21:52:07: EPOCH 65 - PROGRESS: at 60.32% examples, 178219 words/s, in_qsize 6, out_qsize 1
INFO - 21:52:07: worker thread finished; awaiting finish of 3 more threads
INFO - 21:52:07: worker thread finished; awaiting finish of 2 more threads
INFO - 21:52:07: worker thread finished; awaiting finish of 1 more threads
INFO - 21:52:07: worker thread finished; awaiting finish of 0 more threads
INFO - 21:52:07: EPOCH - 65 : training on 298396 raw words (298396 effective words) 

INFO - 21:52:24: EPOCH 80 - PROGRESS: at 63.67% examples, 184197 words/s, in_qsize 7, out_qsize 0
INFO - 21:52:24: worker thread finished; awaiting finish of 3 more threads
INFO - 21:52:24: worker thread finished; awaiting finish of 2 more threads
INFO - 21:52:24: worker thread finished; awaiting finish of 1 more threads
INFO - 21:52:24: worker thread finished; awaiting finish of 0 more threads
INFO - 21:52:24: EPOCH - 80 : training on 298396 raw words (298396 effective words) took 1.1s, 265141 effective words/s
INFO - 21:52:25: EPOCH 81 - PROGRESS: at 63.67% examples, 183478 words/s, in_qsize 7, out_qsize 0
INFO - 21:52:25: worker thread finished; awaiting finish of 3 more threads
INFO - 21:52:25: worker thread finished; awaiting finish of 2 more threads
INFO - 21:52:25: worker thread finished; awaiting finish of 1 more threads
INFO - 21:52:25: worker thread finished; awaiting finish of 0 more threads
INFO - 21:52:25: EPOCH - 81 : training on 298396 raw words (298396 effective words) 

INFO - 21:52:42: EPOCH 96 - PROGRESS: at 70.38% examples, 202059 words/s, in_qsize 7, out_qsize 0
INFO - 21:52:42: worker thread finished; awaiting finish of 3 more threads
INFO - 21:52:42: worker thread finished; awaiting finish of 2 more threads
INFO - 21:52:42: worker thread finished; awaiting finish of 1 more threads
INFO - 21:52:42: worker thread finished; awaiting finish of 0 more threads
INFO - 21:52:42: EPOCH - 96 : training on 298396 raw words (298396 effective words) took 1.1s, 266942 effective words/s
INFO - 21:52:43: EPOCH 97 - PROGRESS: at 73.73% examples, 215543 words/s, in_qsize 7, out_qsize 0
INFO - 21:52:44: worker thread finished; awaiting finish of 3 more threads
INFO - 21:52:44: worker thread finished; awaiting finish of 2 more threads
INFO - 21:52:44: worker thread finished; awaiting finish of 1 more threads
INFO - 21:52:44: worker thread finished; awaiting finish of 0 more threads
INFO - 21:52:44: EPOCH - 97 : training on 298396 raw words (298396 effective words) 

Time to train the model: 1.93 mins


In [19]:
#well let's see how we did

model.wv.most_similar('software engineer', topn=10)

[('manager of lan architecture', 0.30757850408554077),
 ('senior product marketing manager - access routers', 0.28108468651771545),
 ('smarter social workforce', 0.2759867012500763),
 ('prinipal instructor', 0.2589014172554016),
 ('customer portal engagement manager', 0.2554255723953247),
 ('digital logic design engineer', 0.2500015199184418),
 ('mba intern strategy and operations', 0.24334444105625153),
 ('ntac engineer', 0.24150025844573975),
 ('pre-sales consulting manager - federal', 0.23958902060985565),
 ('technical solutions leader - training and managed services',
  0.23958539962768555)]

In [20]:
model.wv.similarity('software developer', 'software engineer')

0.01651573