# Latent Dirichlet Allocation Model on 1-gram Tokens

In [None]:
import sys
sys.path.append(str(Path.cwd().parent)) # needs to add project root to sys.path to import from src module
import pandas as pd
from pathlib import Path
import gensim.corpora as corpora
from gensim import models
from gensim.models import CoherenceModel
from pprint import pprint
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pyLDAvis.gensim
from src.modeling.topic_modeling import lda_compute_coherence_values

### Load dataset

In [1]:
import ast  # cell-local import so this cell stays self-contained

path_to_data = Path('../data')

# SECURITY NOTE(review): the original used `eval` as the converter, which
# executes whatever expression a CSV cell contains. `ast.literal_eval` only
# accepts Python literals, so it parses the same stringified token lists
# (presumably lists of str -- confirm against the preprocessing step) without
# being able to run code embedded in the file.
token_converters = {
    'tokenized': ast.literal_eval,
    'tokenized_no_single': ast.literal_eval,
}
df = pd.read_csv(
    path_to_data / 'processed' / 'tokenized1gram_data.csv',
    converters=token_converters,
)

### Create gensim dict and corpus

In [2]:
# One token sequence per row, taken from the 'tokenized_no_single' column.
doc_list = df['tokenized_no_single'].tolist()

# Vocabulary built from every document.
dictionary = corpora.Dictionary(doc_list)

# Bag-of-words representation of each document, in the same order as doc_list.
corpus = list(map(dictionary.doc2bow, doc_list))

# Fit TF-IDF on the bag-of-words corpus, then apply it to that same corpus.
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

### Fit a set of models with a varying number of topics

In [3]:
# Run the project helper over topic counts described by start/limit/step.
# NOTE(review): presumably it fits one LDA model per value in
# range(start, limit, step) and returns the models with their coherence
# scores in matching order -- confirm in src/modeling/topic_modeling.py.
model_list, coherence_values = lda_compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus_tfidf,
    texts=doc_list,
    start=2,
    limit=20,
    step=1,
)

### Evaluate models

In [None]:
# Plot the coherence score against the number of topics.
# These bounds repeat the start/limit/step values passed to
# lda_compute_coherence_values when the models were fitted; keep them in sync
# so that x lines up with coherence_values.
start, limit, step = 2, 20, 1
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels argument must be a sequence of strings. The original passed
# ("coherence_values"), which is just a parenthesised string, so matplotlib
# iterated over its characters and labelled the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Report the coherence score for each topic count, rounded to 4 decimals.
for num_topics, score in zip(x, coherence_values):
    rounded_score = round(score, 4)
    print("Num Topics =", num_topics, " has Coherence Value of", rounded_score)

### Visualize model with highest coherence

In [None]:
# Pick the model whose coherence score is the largest (first one on ties,
# as np.argmax returns the first maximal index).
best_index = np.argmax(coherence_values)
best_model = model_list[best_index]

# sort_topics=False is passed through unchanged from the original call.
lda_display = pyLDAvis.gensim.prepare(
    best_model,
    corpus_tfidf,
    dictionary,
    sort_topics=False,
)
# Must stay the last expression of the cell so the notebook renders it.
pyLDAvis.display(lda_display)