# Following this resource
https://www.kdnuggets.com/2019/09/overview-topics-extraction-python-latent-dirichlet-allocation.html

https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb

# Initial Setup

## Imports

In [11]:
# Basics
import pandas as pd
import numpy as np

# Progress
from tqdm import tqdm

# Set random seed
np.random.seed(42)

# Plotting
import plotly.express as px

# Database
import pickle

# Data Processing
from DataProcessor import data_processor, doc_processor

# Build corpus
from gensim import corpora

# Latent Dirichlet Allocation
from gensim import models

# Saving Models
from gensim.test.utils import datapath

## Loading the Data

In [3]:
with open('../data/processed_data.pkl', mode='rb') as file:
    data_records = pickle.load(file)

In [5]:
tokens = [post['description_tokens'] for post in data_records]

In [6]:
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(token) for token in tokens]

## Fit Model

In [20]:
np.random.seed(42)
for num_topics in tqdm(range(3,31)):
    alpha = [0.01]*num_topics
    eta = [0.01]*len(dictionary_LDA.keys())
    lda_model = models.LdaModel(
        corpus, 
        num_topics=num_topics,
        id2word=dictionary_LDA,
        passes=4, 
        alpha=alpha,
        eta=eta
    )
    file_path = datapath(f'/home/schart/Flatiron/DataScience/Phase5/Assessments/JobDash/model/LDA-{num_topics}topics')
    lda_model.save(file_path)

100%|██████████| 28/28 [1:04:49<00:00, 138.90s/it]


In [25]:
num_topics = 20
file_path = datapath(f'/home/schart/Flatiron/DataScience/Phase5/Assessments/JobDash/model/LDA-{num_topics}topics')
lda_model = models.LdaModel.load(file_path)

## Inspecting Topics

In [26]:
for i,topic in lda_model.show_topics(formatted=True,
                                     num_topics=num_topics, 
                                     num_words=20):
    print(str(i)+": "+ topic)
    print()

0: 0.025*"aws" + 0.014*"customer" + 0.014*"knowledge" + 0.011*"good" + 0.011*"big" + 0.010*"google_cloud_platform" + 0.010*"process" + 0.008*"dice" + 0.008*"client" + 0.008*"quality" + 0.007*"project" + 0.007*"must" + 0.007*"position" + 0.007*"engagement" + 0.006*"complex" + 0.006*"change" + 0.006*"application" + 0.006*"cloud" + 0.006*"job" + 0.006*"apply"

1: 0.027*"science" + 0.017*"model" + 0.015*"analytics" + 0.012*"technology" + 0.011*"client" + 0.010*"scientist" + 0.010*"analysis" + 0.009*"technique" + 0.007*"help" + 0.007*"apply" + 0.006*"tool" + 0.006*"lead" + 0.006*"project" + 0.006*"knowledge" + 0.006*"professional" + 0.006*"machine_learning" + 0.006*"algorithm" + 0.006*"analytical" + 0.006*"opportunity" + 0.005*"information"

2: 0.028*"product" + 0.019*"engineering" + 0.019*"build" + 0.019*"customer" + 0.017*"technology" + 0.015*"platform" + 0.014*"engineer" + 0.013*"design" + 0.013*"technical" + 0.009*"service" + 0.009*"big" + 0.009*"analytics" + 0.009*"software" + 0.009*"c

In [27]:
lda_model[corpus[0]]

[(0, 0.01966356),
 (3, 0.13305639),
 (5, 0.25371185),
 (6, 0.055842802),
 (10, 0.5367034)]

In [28]:
# https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf
# Here a short legend to explain the vis:
# size of bubble: proportional to the proportions of the topics across the
# N total tokens in the corpus
# red bars: estimated number of times a given term was generated by a given topic
# blue bars: overall frequency of each term in the corpus
# -- Relevance of words is computed with a parameter lambda
# -- Lambda optimal value ~0.6 
# (https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf)
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model, 
    corpus=corpus, 
    dictionary=dictionary_LDA
)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)