# Following these resources
https://www.kdnuggets.com/2019/09/overview-topics-extraction-python-latent-dirichlet-allocation.html

https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb

# Initial Setup

## Imports

In [None]:
# Basics
import pandas as pd
import numpy as np

# Loading and saving data
import pickle

# Progress
from tqdm import tqdm

# Set random seed
np.random.seed(42)

# Plotting
import plotly.express as px

# Latent Dirichlet Allocation
from gensim.models import LdaModel, CoherenceModel

# Build corpus
from gensim import corpora

# Saving Models
from gensim.test.utils import datapath

# Combinations
from itertools import combinations

## Loading Models

In [None]:
num_topics = 20


def get_model(num_topics):
    """Load a previously saved LDA model from disk.

    Args:
        num_topics: Topic count embedded in the saved model's file name; it
            selects which file under ``../model/`` is read.

    Returns:
        The object returned by ``LdaModel.load`` for
        ``../model/LDA-{num_topics}topics``.
    """
    file_path = f'../model/LDA-{num_topics}topics'
    return LdaModel.load(file_path)


# Use the constant defined above instead of repeating the literal, so the
# topic count only has to be changed in one place.
lda_model = get_model(num_topics)

In [None]:
def get_topics(lda_model, num_words=20):
    """Return the top words of every topic in ``lda_model``.

    Args:
        lda_model: Object exposing a gensim-style ``show_topics`` method.
        num_words: How many words to request per topic. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        Whatever ``show_topics`` returns when called with
        ``formatted=False`` and ``num_topics=-1`` (a negative count asks
        gensim for all topics).
    """
    return lda_model.show_topics(
        num_topics=-1,
        num_words=num_words,
        formatted=False,
    )

# Extract the topics of the loaded model and keep the first entry as a sample
# for the cells below.
topics = get_topics(lda_model)
topic = topics[0]
# Bare expression: displayed as the cell output when run in a notebook.
topic

## Jaccard Similarity

In [None]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two sets.

    The similarity is ``len(set1 & set2) / len(set1 | set2)``.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float between 0.0 (disjoint) and 1.0 (equal). Two empty sets are
        treated as identical and yield 1.0.
    """
    union = set1.union(set2)
    if not union:
        # Both inputs are empty, so the ratio would be 0/0. Follow the usual
        # convention that two empty sets are identical rather than raising
        # ZeroDivisionError.
        return 1.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)


# Sanity check: {2, 3} shared out of {1, 2, 3, 4} overall.
test_set1 = {1, 2, 3}
test_set2 = {2, 3, 4}
assert jaccard_similarity(test_set1, test_set2) == 1/2, "Should return 1/2."

In [None]:
def topic_word_set(topic):
    """Collect the words of a single topic into a set.

    Args:
        topic: Indexable whose element at position 1 is an iterable of
            tuples that carry the word in position 0.

    Returns:
        The set of first elements of those tuples.
    """
    return set(entry[0] for entry in topic[1])

# Try the helper on the sample topic selected earlier.
word_set = topic_word_set(topic)
# Bare expression: displayed as the cell output when run in a notebook.
word_set    

In [None]:
def mean_jaccard_similarity(topics):
    """Return the mean Jaccard similarity over all unordered topic pairs.

    Args:
        topics: Iterable of topics, each in the form accepted by
            ``topic_word_set``.

    Returns:
        The mean of ``jaccard_similarity`` over every pair of distinct
        topics, or NaN when there are fewer than two topics (no pairs).
    """
    # Build each topic's word set once instead of once per pair.
    word_sets = [topic_word_set(topic) for topic in topics]
    similarities = [
        jaccard_similarity(set1, set2)
        for set1, set2 in combinations(word_sets, 2)
    ]
    if not similarities:
        # No pairs to average; np.mean([]) would give NaN with a
        # RuntimeWarning, so return NaN explicitly.
        return np.nan
    return np.mean(similarities)

# Mean pairwise similarity for the topics of the loaded model; displayed as
# the cell output when run in a notebook.
mean_jaccard_similarity(topics)

## Get Corpus

In [None]:
# Load the preprocessed records from disk.
with open('../data/processed_data.pkl', 'rb') as file:
    data_records = pickle.load(file)

# One token list per record.
tokens = [record['description_tokens'] for record in data_records]

# Build the vocabulary, then prune it with no_below=3 (the remaining
# filter_extremes arguments keep their defaults).
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)

# Bag-of-words representation of every document.
corpus = list(map(dictionary_LDA.doc2bow, tokens))

## Coherence Score

In [None]:
def get_coherence(model, texts, dictionary):
    """Compute the 'c_v' coherence score of a topic model.

    Args:
        model: Topic model passed to ``CoherenceModel`` as ``model``.
        texts: Tokenized documents passed as ``texts``.
        dictionary: Dictionary passed as ``dictionary``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence``.
    """
    settings = {
        'model': model,
        'texts': texts,
        'dictionary': dictionary,
        'coherence': 'c_v',
    }
    return CoherenceModel(**settings).get_coherence()

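The following function takes several minutes to run, so we have pickled its output. The code is shown for reference only; the next cell loads the saved results instead of recomputing them.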

```python
def get_metrics(texts, dictionary, topic_counts=range(2, 31)):
    """Collect similarity and coherence metrics for a range of saved models.

    Args:
        texts: Tokenized documents forwarded to ``get_coherence``.
        dictionary: Dictionary forwarded to ``get_coherence``.
        topic_counts: Topic counts whose saved models are evaluated.
            Defaults to 2 through 30, the range that was previously
            hard-coded.

    Returns:
        A dict of three equally long lists keyed by ``'n'``,
        ``'mean_jaccard'`` and ``'coherence'``, with one entry per
        evaluated topic count.
    """
    metrics = {
        'n': [],
        'mean_jaccard': [],
        'coherence': [],
    }
    for n in tqdm(topic_counts):
        model = get_model(n)
        topics = get_topics(model)
        metrics['n'].append(n)
        metrics['mean_jaccard'].append(mean_jaccard_similarity(topics))
        metrics['coherence'].append(get_coherence(model, texts, dictionary))
    return metrics

# Compute the metrics once and cache them on disk so the slow computation
# does not have to be repeated.
metrics_list = get_metrics(tokens, dictionary_LDA)
with open('../model/metrics.pkl', mode='wb') as file:
    pickle.dump(metrics_list, file)
```

In [None]:
# Load the cached metrics instead of recomputing them.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
with open('../model/metrics.pkl', mode='rb') as file:
    metrics_list = pickle.load(file)

## Plotting Metrics

In [None]:
# Turn the loaded metrics into a DataFrame for plotting.
metrics_df = pd.DataFrame(metrics_list)
# Preview the first rows (displayed as the cell output in a notebook).
metrics_df.head()

In [None]:
# Plot both metrics against the number of topics. The labels mapping renames
# the x axis and the 'variable' legend heading.
px.line(
    data_frame=metrics_df,
    x='n',
    y=['mean_jaccard', 'coherence'],
    title='latent dirichlet allocation topic number selection'.title(),
    labels={'n': 'Number of Topics', 'variable': 'Metric'},
)

Based on the plot above, we select 24 topics for our model. It might be advisable to investigate models with more than 30 topics as both coherence and mean Jaccard similarity are still improving slowly.