# Following this resource
https://www.kdnuggets.com/2019/09/overview-topics-extraction-python-latent-dirichlet-allocation.html

https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb

# Initial Setup

## Imports

In [1]:
# Basics
import pandas as pd
import numpy as np

# Set random seed
np.random.seed(42)

# Timing code execution
from tqdm import tqdm

# Flatten nested objects quickly
from itertools import chain

# Plotting
import plotly.express as px

# Database
from JobsDb import JobsDb

# Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

# Stopwords
from nltk.corpus import stopwords

# Parts of Speach Tagging
from nltk import pos_tag

# Lemmatization
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

# n-grams
from gensim.models import Phrases

# Latent Dirichlet Allocation
from gensim import models


## Loading the Data

In [85]:
db = JobsDb()
df = db.load_table_as_df('jobs')
tqdm.pandas()
db.close()
df = df.iloc[9680:]
data = df.copy()
data = data.reset_index().drop(['id', 'index'], axis=1)
print(df.shape)
data.head()

(9485, 4)


Unnamed: 0,title,url,description
0,Railcar Verifier/Transload Team Member/Data Entry,https://www.careerjet.com/jobad/us5194732b36a6...,\nCompany Overview Come join a Winning Team! ...
1,Data Entry Clerk,https://www.careerjet.com/jobad/us83f88fb60b47...,"\n prepare, compile and sort documents for dat..."
2,Data Scientist,https://www.careerjet.com/jobad/us466d6146a815...,\n \n Data Scientist is responsible for co...
3,Provider Data Specialist,https://www.careerjet.com/jobad/uscb5cda0893f6...,\n \n Title: Provider Data Specialist Loc...
4,Security Data Architect,https://www.careerjet.com/jobad/us00dc3c284dbd...,"\nOur Mission At Dobbs Defense, we deliver mi..."


In [87]:
descriptions = data['description']

In [102]:
def description_tokenizer(description):
    sentences = sent_tokenize(description)
    words = [word_tokenize(sentence) for sentence in sentences]
    return words

In [103]:
%time descriptions.map(description_tokenizer)

CPU times: user 1min 13s, sys: 279 ms, total: 1min 13s
Wall time: 1min 13s


0       [[\nCompany, Overview, Come, join, a, Winning,...
1       [[\n, prepare, ,, compile, and, sort, document...
2       [[\n, \n, Data, Scientist, is, responsible, fo...
3       [[\n, \n, Title, :, Provider, Data, Specialist...
4       [[\nOur, Mission, At, Dobbs, Defense, ,, we, d...
                              ...                        
9480    [[\n, \n, This, opportunity, is, with, a, lead...
9481    [[\nOverview, Overview, :, This, position, is,...
9482    [[\n, \n, Please, note, division, and, functio...
9483    [[\n, \n, About, Mux, Mux, is, video, for, dev...
9484    [[\n, \n, Mercari, is, the, selling, app, .], ...
Name: description, Length: 9485, dtype: object

## Tokenize Data

In [65]:
data['sentences'] = data['description'].progress_map(sent_tokenize)
data.head()

100%|██████████| 9485/9485 [00:13<00:00, 698.03it/s]


Unnamed: 0,title,url,description,sentences
0,Railcar Verifier/Transload Team Member/Data Entry,https://www.careerjet.com/jobad/us5194732b36a6...,\nCompany Overview Come join a Winning Team! ...,[\nCompany Overview Come join a Winning Team!...
1,Data Entry Clerk,https://www.careerjet.com/jobad/us83f88fb60b47...,"\n prepare, compile and sort documents for dat...","[\n prepare, compile and sort documents for da..."
2,Data Scientist,https://www.careerjet.com/jobad/us466d6146a815...,\n \n Data Scientist is responsible for co...,[\n \n Data Scientist is responsible for c...
3,Provider Data Specialist,https://www.careerjet.com/jobad/uscb5cda0893f6...,\n \n Title: Provider Data Specialist Loc...,[\n \n Title: Provider Data Specialist Lo...
4,Security Data Architect,https://www.careerjet.com/jobad/us00dc3c284dbd...,"\nOur Mission At Dobbs Defense, we deliver mi...","[\nOur Mission At Dobbs Defense, we deliver m..."


In [66]:
data['words'] = data['sentences'].progress_map(
    lambda sentences: [word_tokenize(sentence) for sentence in sentences]
)
data.head()

100%|██████████| 9485/9485 [01:08<00:00, 139.28it/s]


Unnamed: 0,title,url,description,sentences,words
0,Railcar Verifier/Transload Team Member/Data Entry,https://www.careerjet.com/jobad/us5194732b36a6...,\nCompany Overview Come join a Winning Team! ...,[\nCompany Overview Come join a Winning Team!...,"[[\nCompany, Overview, Come, join, a, Winning,..."
1,Data Entry Clerk,https://www.careerjet.com/jobad/us83f88fb60b47...,"\n prepare, compile and sort documents for dat...","[\n prepare, compile and sort documents for da...","[[\n, prepare, ,, compile, and, sort, document..."
2,Data Scientist,https://www.careerjet.com/jobad/us466d6146a815...,\n \n Data Scientist is responsible for co...,[\n \n Data Scientist is responsible for c...,"[[\n, \n, Data, Scientist, is, responsible, fo..."
3,Provider Data Specialist,https://www.careerjet.com/jobad/uscb5cda0893f6...,\n \n Title: Provider Data Specialist Loc...,[\n \n Title: Provider Data Specialist Lo...,"[[\n, \n, Title, :, Provider, Data, Specialist..."
4,Security Data Architect,https://www.careerjet.com/jobad/us00dc3c284dbd...,"\nOur Mission At Dobbs Defense, we deliver mi...","[\nOur Mission At Dobbs Defense, we deliver m...","[[\nOur, Mission, At, Dobbs, Defense, ,, we, d..."


## Lemmatization

In [67]:
data['POS_tokens'] = data['words'].progress_map(
    lambda words: [pos_tag(word) for word in words]
)
print(data['POS_tokens'].head(1).tolist()[0][:3])

100%|██████████| 9485/9485 [06:36<00:00, 23.91it/s]

[[('\\nCompany', 'JJ'), ('Overview', 'NNP'), ('Come', 'NNP'), ('join', 'NN'), ('a', 'DT'), ('Winning', 'NNP'), ('Team', 'NN'), ('!', '.')], [('Since', 'IN'), ('1970', 'CD'), (',', ','), ('Plastic', 'NNP'), ('Express', 'NNP'), ('has', 'VBZ'), ('been', 'VBN'), ('leading', 'VBG'), ('the', 'DT'), ('bulk', 'NN'), ('trucking', 'NN'), (',', ','), ('bulk', 'JJ'), ('terminal', 'NN'), (',', ','), ('packaging', 'NN'), (',', ','), ('and', 'CC'), ('warehousing', 'VBG'), ('needs', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('plastics', 'NNS'), ('industry', 'NN'), ('.', '.')], [('Our', 'PRP$'), ('strategic', 'JJ'), ('locations', 'NNS'), (',', ','), ('modern', 'JJ'), ('systems', 'NNS'), (',', ','), ('and', 'CC'), ('dedicated', 'VBD'), ('employees', 'NNS'), ('allow', 'VBP'), ('us', 'PRP'), ('to', 'TO'), ('provide', 'VB'), ('custom', 'NN'), ('tailored', 'JJ'), ('logistical', 'JJ'), ('solutions', 'NNS'), ('to', 'TO'), ('fulfill', 'VB'), ('the', 'DT'), ('most', 'RBS'), ('challenging', 'JJ'), ('needs', 'NNS'), (




In [68]:
# Inspired from https://stackoverflow.com/a/15590384
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

lemmatizer = WordNetLemmatizer()

In [69]:
# Lemmatizing each word with its POS tag, in each sentence
data['lemmatized_words'] = data['POS_tokens'].progress_map(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)
data['lemmatized_words'].head(1).tolist()[0][:3]

100%|██████████| 9485/9485 [00:28<00:00, 336.40it/s]


[['\\nCompany', 'Overview', 'Come', 'join', 'a', 'Winning', 'Team', '!'],
 ['Since',
  '1970',
  ',',
  'Plastic',
  'Express',
  'have',
  'be',
  'lead',
  'the',
  'bulk',
  'trucking',
  ',',
  'bulk',
  'terminal',
  ',',
  'packaging',
  ',',
  'and',
  'warehouse',
  'need',
  'of',
  'the',
  'plastic',
  'industry',
  '.'],
 ['Our',
  'strategic',
  'location',
  ',',
  'modern',
  'system',
  ',',
  'and',
  'dedicate',
  'employee',
  'allow',
  'us',
  'to',
  'provide',
  'custom',
  'tailored',
  'logistical',
  'solution',
  'to',
  'fulfill',
  'the',
  'most',
  'challenging',
  'need',
  'of',
  'our',
  'customer',
  '.']]

In [70]:
## combine sentences and drop stopwords

In [71]:
stopwords_other = []
my_stopwords = stopwords.words('english') + stopwords_other

In [72]:
data['tokens'] = data['lemmatized_words'].progress_map(
    lambda sentences: list(chain.from_iterable(sentences))
)
data['tokens'] = data['tokens'].progress_map(
    lambda tokens: [
        token.lower() for token in tokens if token.isalpha() 
        and token.lower() not in my_stopwords and len(token)>1
    ]
)

100%|██████████| 9485/9485 [00:00<00:00, 24061.53it/s]
100%|██████████| 9485/9485 [00:16<00:00, 558.58it/s]


In [73]:
data['tokens'].head(1).tolist()[0]

['overview',
 'come',
 'join',
 'winning',
 'team',
 'since',
 'plastic',
 'express',
 'lead',
 'bulk',
 'trucking',
 'bulk',
 'terminal',
 'packaging',
 'warehouse',
 'need',
 'plastic',
 'industry',
 'strategic',
 'location',
 'modern',
 'system',
 'dedicate',
 'employee',
 'allow',
 'us',
 'provide',
 'custom',
 'tailored',
 'logistical',
 'solution',
 'fulfill',
 'challenging',
 'need',
 'customer',
 'plastic',
 'express',
 'operate',
 'warehouse',
 'location',
 'rail',
 'terminal',
 'across',
 'us',
 'many',
 'plastic',
 'express',
 'site',
 'also',
 'handle',
 'commodity',
 'include',
 'paper',
 'roll',
 'steel',
 'building',
 'material',
 'dry',
 'bulk',
 'material',
 'plastic',
 'express',
 'operate',
 'roughly',
 'truck',
 'approximately',
 'trailer',
 'perform',
 'full',
 'bulk',
 'truck',
 'distribution',
 'business',
 'plastic',
 'express',
 'headquarter',
 'city',
 'industry',
 'ca',
 'employee',
 'nationwide',
 'goal',
 'always',
 'exceed',
 'customer',
 'expectation',
 '

## Make Bigrams and Trigrams

In [77]:
tokens = data['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])
data['tokens'] = tokens

In [78]:
data['tokens'].head(1).tolist()[0]

['overview',
 'come_join',
 'winning',
 'team',
 'since',
 'plastic_express',
 'lead',
 'bulk',
 'trucking',
 'bulk',
 'terminal',
 'packaging',
 'warehouse',
 'need',
 'plastic',
 'industry',
 'strategic',
 'location',
 'modern',
 'system',
 'dedicate',
 'employee',
 'allow_us',
 'provide',
 'custom',
 'tailored',
 'logistical',
 'solution',
 'fulfill',
 'challenging',
 'need',
 'customer',
 'plastic_express',
 'operate',
 'warehouse',
 'location',
 'rail',
 'terminal',
 'across',
 'us',
 'many',
 'plastic_express',
 'site',
 'also',
 'handle',
 'commodity',
 'include',
 'paper',
 'roll',
 'steel',
 'building',
 'material',
 'dry',
 'bulk',
 'material',
 'plastic_express',
 'operate',
 'roughly',
 'truck',
 'approximately',
 'trailer',
 'perform',
 'full',
 'bulk',
 'truck',
 'distribution',
 'business',
 'plastic_express',
 'headquarter',
 'city',
 'industry',
 'ca',
 'employee',
 'nationwide',
 'goal',
 'always',
 'exceed',
 'customer',
 'expectation',
 'attitude',
 'differentiate_u

# Save Processed Data

In [79]:
data.to_csv('../data/processed_data.csv')

## Make token dictionary and corpus

In [75]:
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

## Fit Model

In [83]:
np.random.seed(123456)
num_topics = 2
alpha = [0.01]*num_topics
eta = [0.01]*len(dictionary_LDA.keys())
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=alpha, \
                                  eta=eta)
lda_model.save(f'../model/lda-{num_topics}topics')

CPU times: user 2min 54s, sys: 4.82 s, total: 2min 59s
Wall time: 1min 32s


## Inspecting Topics

In [51]:
for i,topic in lda_model.show_topics(formatted=True,
                                     num_topics=num_topics, 
                                     num_words=20):
    print(str(i)+": "+ topic)
    print()

0: 0.019*"analysis" + 0.012*"research" + 0.011*"report" + 0.010*"program" + 0.010*"information" + 0.008*"position" + 0.008*"ability" + 0.007*"analyst" + 0.007*"health" + 0.007*"knowledge" + 0.007*"tool" + 0.007*"system" + 0.007*"application" + 0.006*"require" + 0.005*"need" + 0.005*"technical" + 0.005*"reporting" + 0.005*"database" + 0.005*"project" + 0.005*"science"

1: 0.015*"technology" + 0.013*"engineering" + 0.011*"product" + 0.010*"solution" + 0.010*"platform" + 0.010*"build" + 0.009*"design" + 0.009*"software" + 0.008*"customer" + 0.008*"technical" + 0.008*"company" + 0.007*"service" + 0.007*"engineer" + 0.006*"role" + 0.006*"system" + 0.006*"lead" + 0.006*"help" + 0.006*"environment" + 0.005*"need" + 0.005*"knowledge"

2: 0.013*"aws" + 0.011*"technology" + 0.008*"technical" + 0.008*"big" + 0.008*"analysis" + 0.008*"role" + 0.008*"sql" + 0.008*"client" + 0.007*"solution" + 0.007*"ability" + 0.007*"etl" + 0.007*"require" + 0.006*"strong" + 0.006*"position" + 0.006*"analytics" + 0

In [33]:
lda_model

<gensim.models.ldamodel.LdaModel at 0x7fe5f86af6a0>

In [53]:
lda_model[corpus[0]]

[(1, 0.15950848),
 (4, 0.020949814),
 (7, 0.5539973),
 (14, 0.17682917),
 (16, 0.08773857)]

In [54]:
# https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf
# Here a short legend to explain the vis:
# size of bubble: proportional to the proportions of the topics across the
# N total tokens in the corpus
# red bars: estimated number of times a given term was generated by a given topic
# blue bars: overall frequency of each term in the corpus
# -- Relevance of words is computed with a parameter lambda
# -- Lambda optimal value ~0.6 
# (https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf)
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model, 
    corpus=corpus, 
    dictionary=dictionary_LDA
)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)