# Topic Modeling (Gensim)

In [None]:
import pandas as pd
import ast  # Used to convert strings to lists
from gensim import corpora
from gensim.models import LdaModel
import numpy as np
import matplotlib.pyplot as plt

# Load the preprocessed corpus from disk.
df = pd.read_csv("normalized_data.csv")

# Each 'normalization' cell is expected to hold the string form of a Python
# list literal; parse every cell back into a real object, one per document.
texts = df['normalization'].map(ast.literal_eval).tolist()

In [None]:
# Build the vocabulary (token <-> integer id mapping) from the documents.
texts_dictionary = corpora.Dictionary(texts)
# Convert every document to its bag-of-words (bow) representation using
# that dictionary.
texts_corpus = [texts_dictionary.doc2bow(text) for text in texts]

# Training parameters.
# More passes/iterations generally give a more stable model, at the cost
# of a longer training time.
num_topics = 20
chunksize = 3000
passes = 10
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_seed = 432

# Seed the global NumPy RNG as before, for any other code that relies on it.
np.random.seed(random_seed)

# Train the LDA model. The seed is also handed to the model explicitly via
# `random_state`, so reproducibility does not depend on the state of the
# global NumPy RNG at the moment this cell runs.
topics_model = LdaModel(
    texts_corpus,
    id2word=texts_dictionary,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_seed,
)

In [None]:
# Get the topic distribution for each document
corpus_lda = topics_model[texts_corpus]  # Use the trained LDA model
# One list of (topic_id, probability) pairs per document.
thetas = [topics_model[c] for c in texts_corpus]

# A topic counts towards a document only when its probability exceeds this.
min_topic_probability = 0.06

# Number of topics per document above the threshold.
hist_data = [
    sum(1 for _topic_id, prob in theta if prob > min_topic_probability)
    for theta in thetas
]

# One bin centred on every integer count 0..num_topics. The 0 bin matters:
# a document may have no topic above the threshold, and np.histogram
# silently drops values that fall outside the bin range.
topic_counts = np.arange(0, num_topics + 1)
counts, bins = np.histogram(hist_data, bins=np.arange(0, num_topics + 2) - 0.5)

# Use rainbow colors for the bars
colors = plt.cm.rainbow(np.linspace(0, 1, len(counts)))

# Draw all bars in one call; each bar gets its own colour from `colors`.
plt.bar(topic_counts, counts, color=colors, edgecolor='black', width=0.8)

plt.xlabel('Number of Topics per Tweet')
plt.ylabel('Number of Tweets')
plt.title('Distribution of Topic Counts per Tweet')
plt.xticks(topic_counts)
plt.show()

In [None]:
# Show the most significant words of every topic. A negative num_topics
# requests all topics, so the output stays complete if the model is
# retrained with more topics than print_topics' default of 20.
topics_model.print_topics(num_topics=-1)

In [None]:
from gensim.models import CoherenceModel

# log_perplexity() returns the per-word likelihood bound (a negative number;
# closer to zero is better), not the perplexity itself.
per_word_bound = topics_model.log_perplexity(texts_corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
# Perplexity is 2 ** (-bound), the same estimate gensim logs; lower is better.
print('\nPerplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score (c_v measure; needs the tokenised texts).
coherence_model_lda = CoherenceModel(model=topics_model, texts=texts, dictionary=texts_dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
import pyLDAvis

# Newer pyLDAvis releases ship the gensim adapter as `gensim_models`;
# older ones call it `gensim`. Support both so the cell runs on either.
try:
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    import pyLDAvis.gensim as gensim_vis

# Visualize topics
pyLDAvis.enable_notebook()  # render the visualisation inline in the notebook
vis = gensim_vis.prepare(topics_model, texts_corpus, texts_dictionary)
vis