# Following these resources
https://www.kdnuggets.com/2019/09/overview-topics-extraction-python-latent-dirichlet-allocation.html

https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb

# Initial Setup

## Imports

In [1]:
# Basics
import pandas as pd
import numpy as np

# Set random seed
np.random.seed(42)

# Plotting
import plotly.express as px

# Database
from JobsDb import JobsDb

# Data Processing
from DataProcessor import data_processor

# Build corpus
from gensim import corpora

# Latent Dirichlet Allocation
from gensim import models


## Loading the Data

In [2]:
# Load the 'jobs' table from the project database into a DataFrame.
db = JobsDb()
try:
    df = db.load_table_as_df('jobs')
finally:
    # Always close the database handle, even when the load raises.
    db.close()

# Keep only the rows from position 9680 onwards.
# NOTE(review): the reason for this cut-off is not recorded here — confirm.
df = df.iloc[9680:]

# reset_index() returns a new frame with a zero-based index and the old index
# stored in an 'index' column; that column and 'id' are then dropped.
data = df.reset_index().drop(['id', 'index'], axis=1)

# Shape of the sliced frame (before the columns are dropped), then a preview.
print(df.shape)
data.head()

(9485, 4)


Unnamed: 0,title,url,description
0,Railcar Verifier/Transload Team Member/Data Entry,https://www.careerjet.com/jobad/us5194732b36a6...,\nCompany Overview Come join a Winning Team! ...
1,Data Entry Clerk,https://www.careerjet.com/jobad/us83f88fb60b47...,"\n prepare, compile and sort documents for dat..."
2,Data Scientist,https://www.careerjet.com/jobad/us466d6146a815...,\n \n Data Scientist is responsible for co...
3,Provider Data Specialist,https://www.careerjet.com/jobad/uscb5cda0893f6...,\n \n Title: Provider Data Specialist Loc...
4,Security Data Architect,https://www.careerjet.com/jobad/us00dc3c284dbd...,"\nOur Mission At Dobbs Defense, we deliver mi..."


## Extracting Documents

In [3]:
# Collect every job description as a plain Python list, one entry per posting.
docs = data['description'].tolist()

# Keep the first description around and echo it as a sanity check.
doc = docs[0]
doc

'\\nCompany Overview  Come join a Winning Team! Since 1970, Plastic Express has been leading the bulk trucking, bulk terminal, packaging, and warehousing needs of the plastics industry. Our strategic locations, modern systems, and dedicated employees allow us to provide custom tailored logistical solutions to fulfill the most challenging needs of our customers. Plastic Express operates from 15 warehouse locations and 37 rail terminals across the US. At many of the Plastic Express sites, we also handle some non-plastic commodities, which include; paper rolls, steel, building materials and other dry bulk materials. Plastic Express owns and operates roughly 130 trucks, with approximately 200 trailers performing full bulk truck distribution business. Plastic Express is headquartered in City of Industry, CA and has over 300 employees nationwide. Our goal has always been to exceed our customers’ expectations, and our “Can Do” attitude is what differentiates us from the competition.  Position

## Processing Data

In [None]:
# Run the project's text-processing helper over all raw descriptions.
# NOTE(review): presumably returns one list of tokens per document, since the
# result is meant to feed a gensim Dictionary — confirm against DataProcessor.
processed_data = data_processor(docs)

In [None]:
# Build the token <-> integer-id mapping from the processed documents
# (`processed_data` is the output of data_processor(docs)).
dictionary_LDA = corpora.Dictionary(processed_data)

# Prune tokens that appear in fewer than 3 documents; gensim's defaults for
# the other filter_extremes() thresholds (no_above, keep_n) still apply.
dictionary_LDA.filter_extremes(no_below=3)

# Convert each document's token list into its bag-of-words representation.
corpus = [dictionary_LDA.doc2bow(doc_tokens) for doc_tokens in processed_data]

## Fit Model

In [None]:
# Reseed NumPy's global RNG immediately before training.
np.random.seed(123456)

num_topics = 2

# Constant priors of 0.01: one alpha entry per topic, one eta entry per
# dictionary term.
alpha = [0.01] * num_topics
eta = [0.01] * len(dictionary_LDA)

# Train LDA with four passes over the bag-of-words corpus.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary_LDA,
    num_topics=num_topics,
    alpha=alpha,
    eta=eta,
    passes=4,
)

# Persist the fitted model; the topic count is embedded in the file name.
lda_model.save(f'../model/lda-{num_topics}topics')

## Inspecting Topics

In [None]:
# Print the 20 highest-weighted words of every topic, one topic per paragraph.
topic_summaries = lda_model.show_topics(
    num_topics=num_topics,
    num_words=20,
    formatted=True,
)
for topic_id, topic_terms in topic_summaries:
    # The trailing newline reproduces the blank separator line between topics.
    print(f"{topic_id}: {topic_terms}\n")

In [None]:
# Apply the fitted model to the first document's bag-of-words vector and
# display the resulting topic assignment for that document.
lda_model[corpus[0]]

In [None]:
# https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf
# Here a short legend to explain the vis:
# size of bubble: proportional to the proportions of the topics across the
# N total tokens in the corpus
# red bars: estimated number of times a given term was generated by a given topic
# blue bars: overall frequency of each term in the corpus
# -- Relevance of words is computed with a parameter lambda
# -- Lambda optimal value ~0.6 
# (https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf)
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model, 
    corpus=corpus, 
    dictionary=dictionary_LDA
)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)