# Following these resources
https://www.kdnuggets.com/2019/09/overview-topics-extraction-python-latent-dirichlet-allocation.html

https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb

# Initial Setup

## Imports

In [59]:
# Standard library
import pickle
from itertools import combinations

# Basics
import pandas as pd
import numpy as np

# Progress
from tqdm import tqdm

# Plotting
import plotly.express as px

# Latent Dirichlet Allocation
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# Saving Models
from gensim.test.utils import datapath

# Set random seed
np.random.seed(42)

## Loading Models

In [26]:
num_topics = 20


def get_model(num_topics):
    """Load the saved LDA model that was trained with ``num_topics`` topics.

    Parameters
    ----------
    num_topics : int
        Topic count used to build the file name
        ``../model/LDA-{num_topics}topics``.

    Returns
    -------
    The object returned by ``LdaModel.load`` for that path.
    """
    file_path = f'../model/LDA-{num_topics}topics'
    return LdaModel.load(file_path)


# Load the model matching the notebook-wide topic count instead of repeating
# the literal, so changing ``num_topics`` above also changes which model the
# later cells inspect.
lda_model = get_model(num_topics)

In [40]:
def get_topics(lda_model):
    """Return the topics of ``lda_model`` as unformatted (id, word list) data.

    Asks the model for all topics (``num_topics=-1``) with 20 words each and
    hands back whatever ``show_topics`` returns.
    """
    return lda_model.show_topics(num_topics=-1, num_words=20, formatted=False)

# Fetch the topic list once and keep the first entry as a worked example for
# the following cells; the bare name on the last line displays it.
topics = get_topics(lda_model)
topic = topics[0]
topic

(0,
 [('aws', 0.02549657),
  ('customer', 0.014029006),
  ('knowledge', 0.01398717),
  ('good', 0.011425586),
  ('big', 0.010933044),
  ('google_cloud_platform', 0.0104406485),
  ('process', 0.009689362),
  ('dice', 0.007920467),
  ('client', 0.0077846665),
  ('quality', 0.0077294945),
  ('project', 0.0072583375),
  ('must', 0.0072508124),
  ('position', 0.0071388446),
  ('engagement', 0.006872587),
  ('complex', 0.006436063),
  ('change', 0.006354013),
  ('application', 0.006306876),
  ('cloud', 0.006173741),
  ('job', 0.0060890415),
  ('apply', 0.005556921)])

## Jaccard Similarity

In [44]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two sets.

    The score is ``len(intersection) / len(union)``, a value between 0 and 1.

    Two empty sets have an empty union, which would make the ratio 0/0.
    They are treated as identical, so 1.0 is returned instead of raising
    ``ZeroDivisionError``.
    """
    union = set1.union(set2)
    if not union:
        # Both inputs are empty: define the similarity rather than divide by 0.
        return 1.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)

# Sanity check: the sets share {2, 3} out of {1, 2, 3, 4}, so the score is 2/4.
test_set1 = {1, 2, 3}
test_set2 = {2, 3, 4}
assert jaccard_similarity(test_set1, test_set2) == 1/2, "Should return 1/2."

In [45]:
def topic_word_set(topic):
    """Collect the words of one topic into a set, dropping their weights.

    ``topic[1]`` is iterated and the first element of every entry is kept.
    """
    words = set()
    for entry in topic[1]:
        words.add(entry[0])
    return words

# Example: the word set of the sample topic, displayed by the bare name below.
word_set = topic_word_set(topic)
word_set    

{'application',
 'apply',
 'aws',
 'big',
 'change',
 'client',
 'cloud',
 'complex',
 'customer',
 'dice',
 'engagement',
 'good',
 'google_cloud_platform',
 'job',
 'knowledge',
 'must',
 'position',
 'process',
 'project',
 'quality'}

In [54]:
def mean_jaccard_similarity(topics):
    """Return the mean Jaccard similarity over all unordered pairs of topics.

    Each topic is reduced to its word set with ``topic_word_set`` and every
    pair of word sets is scored with ``jaccard_similarity``.

    With fewer than two topics there are no pairs to average, so NaN is
    returned directly instead of letting ``np.mean`` warn about an empty list.
    """
    # Build each word set once rather than once per pair it takes part in.
    word_sets = [topic_word_set(topic) for topic in topics]
    similarities = [
        jaccard_similarity(set1, set2)
        for set1, set2 in combinations(word_sets, 2)
    ]
    if not similarities:
        return np.nan
    return np.mean(similarities)

# Average pairwise word overlap between the topics of the loaded model.
mean_jaccard_similarity(topics)

0.12096222161701466

## Get Corpus

In [56]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point this at a file produced by this project's own preprocessing step.
with open('../data/processed_data.pkl', mode='rb') as file:
    data_records = pickle.load(file)

# One token list per record, read from each record's 'description_tokens' key.
tokens = [post['description_tokens'] for post in data_records]
dictionary_LDA = corpora.Dictionary(tokens)
# Drop tokens that occur in fewer than 3 documents (filter_extremes' other
# defaults still apply).
dictionary_LDA.filter_extremes(no_below=3)
# Bag-of-words representation of every document.
corpus = [dictionary_LDA.doc2bow(doc_tokens) for doc_tokens in tokens]

## Coherence Score

In [67]:
def get_coherence(model, texts, dictionary):
    """Compute the ``c_v`` coherence score of ``model``.

    The three arguments are forwarded unchanged to ``CoherenceModel`` and the
    result of its ``get_coherence()`` method is returned.
    """
    scorer_kwargs = dict(
        model=model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    return CoherenceModel(**scorer_kwargs).get_coherence()

# Coherence of the loaded model on the tokenized documents; the bare name
# below displays the score.
coherence = get_coherence(lda_model, tokens, dictionary_LDA)
coherence

0.3601859814829399

In [None]:
def get_metrics(texts, dictionary, topic_counts=range(2, 31)):
    """Collect mean Jaccard similarity and coherence for several saved models.

    For each ``n`` in ``topic_counts`` the model saved for ``n`` topics is
    loaded with ``get_model`` and scored with ``mean_jaccard_similarity`` and
    ``get_coherence``.

    Parameters
    ----------
    texts, dictionary
        Passed through to ``get_coherence``.
    topic_counts : iterable of int, optional
        Topic counts to evaluate. Defaults to 2 through 30 inclusive, the
        range that used to be hard-coded in the loop.

    Returns
    -------
    dict
        Keys ``'n'``, ``'mean_jaccard'`` and ``'coherence'``, each mapping to
        a list with one entry per evaluated topic count, in iteration order.
    """
    metrics_list = {
        'n': [],
        'mean_jaccard': [],
        'coherence': [],
    }
    for n in tqdm(topic_counts):
        model = get_model(n)
        topics = get_topics(model)
        metrics_list['n'].append(n)
        metrics_list['mean_jaccard'].append(mean_jaccard_similarity(topics))
        metrics_list['coherence'].append(get_coherence(model, texts, dictionary))
    return metrics_list

# Compute both metrics for every saved model (2 to 30 topics).
metrics_list = get_metrics(tokens, dictionary_LDA)

 76%|███████▌  | 22/29 [28:59<11:31, 98.78s/it]

## Inspecting Topics

In [4]:
# Print ``num_topics`` topics of the loaded model with 20 words each.  The
# loop variables have their own names so that running this cell does not
# overwrite the notebook-level ``topic`` used by the Jaccard examples.
for topic_id, topic_terms in lda_model.show_topics(formatted=True,
                                                   num_topics=num_topics,
                                                   num_words=20):
    print(f"{topic_id}: {topic_terms}")
    print()

0: 0.025*"aws" + 0.014*"customer" + 0.014*"knowledge" + 0.011*"good" + 0.011*"big" + 0.010*"google_cloud_platform" + 0.010*"process" + 0.008*"dice" + 0.008*"client" + 0.008*"quality" + 0.007*"project" + 0.007*"must" + 0.007*"position" + 0.007*"engagement" + 0.006*"complex" + 0.006*"change" + 0.006*"application" + 0.006*"cloud" + 0.006*"job" + 0.006*"apply"

1: 0.027*"science" + 0.017*"model" + 0.015*"analytics" + 0.012*"technology" + 0.011*"client" + 0.010*"scientist" + 0.010*"analysis" + 0.009*"technique" + 0.007*"help" + 0.007*"apply" + 0.006*"tool" + 0.006*"lead" + 0.006*"project" + 0.006*"knowledge" + 0.006*"professional" + 0.006*"machine_learning" + 0.006*"algorithm" + 0.006*"analytical" + 0.006*"opportunity" + 0.005*"information"

2: 0.028*"product" + 0.019*"engineering" + 0.019*"build" + 0.019*"customer" + 0.017*"technology" + 0.015*"platform" + 0.014*"engineer" + 0.013*"design" + 0.013*"technical" + 0.009*"service" + 0.009*"big" + 0.009*"analytics" + 0.009*"software" + 0.009*"c

In [None]:
# Apply the model to the first bag-of-words document of the corpus and
# display the result (presumably its topic distribution -- confirm against
# the gensim docs).
lda_model[corpus[0]]

In [None]:
# Interactive topic visualization with pyLDAvis.
# Background: https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf
# Short legend to explain the visualization:
# size of bubble: proportional to the proportions of the topics across the
# N total tokens in the corpus
# red bars: estimated number of times a given term was generated by a given topic
# blue bars: overall frequency of each term in the corpus
# -- Relevance of words is computed with a parameter lambda
# -- Lambda optimal value ~0.6
# (https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf)
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models -- confirm against the installed version.
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model, 
    corpus=corpus, 
    dictionary=dictionary_LDA
)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)