## References

In [None]:
# https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html
# https://medium.com/square-corner-blog/topic-modeling-optimizing-for-human-interpretability-48a81f6ce0ed

## Notebook Setup

In [None]:
# Import libraries
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
from gensim import corpora, models
from gensim.models import HdpModel, LsiModel, CoherenceModel

In [None]:
# Configure root logging: timestamped records at INFO level and above
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

## Load Training Corpora and Dictionaries

In [None]:
# Load the vectorized corpus and dictionary of every pooling strategy
def _load_vectorized_corpus(pooling):
    """
    Load the dictionary and MmCorpus saved for one pooling strategy.

    pooling: file-name suffix of the strategy, e.g. "no_pooling"

    Returns a (dictionary, corpus) tuple.
    Raises FileNotFoundError if either input file is missing, so the notebook
    stops here instead of failing in a later cell on an undefined variable.
    """
    dictionary_path = "../outputs/tourism_{}.dict".format(pooling)
    corpus_path = "../outputs/tourism_{}.mm".format(pooling)

    # Both files are read below, so both must exist (not only the .dict)
    missing = [path for path in (dictionary_path, corpus_path) if not os.path.exists(path)]
    if missing:
        raise FileNotFoundError(
            "Please run preprocessing script first! Missing: " + ", ".join(missing)
        )

    dictionary = corpora.Dictionary.load(dictionary_path)
    corpus = corpora.MmCorpus(corpus_path)
    print("Vectorized {} corpus loaded!".format(pooling.replace("_", " ")))
    return dictionary, corpus

# Load no pooling corpus
dictionary_no_pooling, corpus_no_pooling = _load_vectorized_corpus("no_pooling")

# Load user pooling corpus
dictionary_user_pooling, corpus_user_pooling = _load_vectorized_corpus("user_pooling")

# Load hashtag pooling corpus
dictionary_hashtag_pooling, corpus_hashtag_pooling = _load_vectorized_corpus("hashtag_pooling")

## Load Documents

In [None]:
# Unpickle the tokenized documents of every pooling strategy.
# NOTE(review): pickle.load can execute arbitrary code; these files are presumably
# written by the project's own preprocessing step - do not point this at untrusted files.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)

# Load no pooling documents
tokenized_documents_no_pooling = _read_pickle("../outputs/tokenized_documents_no_pooling.p")

# Load user pooling documents
tokenized_documents_user_pooling = _read_pickle("../outputs/tokenized_documents_user_pooling.p")

# Load hashtag pooling documents
tokenized_documents_hashtag_pooling = _read_pickle("../outputs/tokenized_documents_hashtag_pooling.p")

## Load LDA Models (Trained in 02 Notebook)

In [None]:
# Restore the persisted LDA models (one per pooling strategy)
load_lda_model = models.LdaModel.load

lda_model_no_pooling = load_lda_model("../outputs/lda_model_no_pooling.model")  # 6 topics
lda_model_user_pooling = load_lda_model("../outputs/lda_model_user_pooling.model")  # 7 topics
lda_model_hashtag_pooling = load_lda_model("../outputs/lda_model_hashtag_pooling.model")  # 7 topics

In [None]:
# Pull the unformatted topics, i.e. (topic id, [(word, probability), ...]) pairs,
# out of each LDA model in one pass
lda_topics_no_pooling, lda_topics_user_pooling, lda_topics_hashtag_pooling = [
    lda_model.show_topics(formatted=False)
    for lda_model in (lda_model_no_pooling, lda_model_user_pooling, lda_model_hashtag_pooling)
]

## Train HDP Models

In [None]:
# Train HDP models with different pooling methods (similar to LDA in 02 notebook).
# HDP training is stochastic: without a fixed seed every run of this cell yields
# different topics (and therefore different coherence scores further down).
HDP_RANDOM_STATE = 42

hdp_model_no_pooling = HdpModel(corpus_no_pooling, dictionary_no_pooling, random_state=HDP_RANDOM_STATE)
hdp_model_user_pooling = HdpModel(corpus_user_pooling, dictionary_user_pooling, random_state=HDP_RANDOM_STATE)
hdp_model_hashtag_pooling = HdpModel(corpus_hashtag_pooling, dictionary_hashtag_pooling, random_state=HDP_RANDOM_STATE)

In [None]:
# Extract the unformatted topics (topic id plus its (word, probability) pairs) of each HDP model
def _raw_hdp_topics(hdp_model):
    """Return ``hdp_model``'s topics as unformatted (topic id, word list) pairs."""
    return hdp_model.show_topics(formatted=False)

hdp_topics_no_pooling = _raw_hdp_topics(hdp_model_no_pooling)
hdp_topics_user_pooling = _raw_hdp_topics(hdp_model_user_pooling)
hdp_topics_hashtag_pooling = _raw_hdp_topics(hdp_model_hashtag_pooling)

## Train LSI Models

In [None]:
# Fit one LSI model per pooling strategy, reusing the topic counts noted for
# the corresponding LDA models (6 / 7 / 7)
lsi_model_no_pooling = LsiModel(
    corpus=corpus_no_pooling,
    id2word=dictionary_no_pooling,
    num_topics=6,
)
lsi_model_user_pooling = LsiModel(
    corpus=corpus_user_pooling,
    id2word=dictionary_user_pooling,
    num_topics=7,
)
lsi_model_hashtag_pooling = LsiModel(
    corpus=corpus_hashtag_pooling,
    id2word=dictionary_hashtag_pooling,
    num_topics=7,
)

In [None]:
# Extract the unformatted topics (topic id plus its (word, value) pairs) of each LSI model
unformatted = {"formatted": False}

lsi_topics_no_pooling = lsi_model_no_pooling.show_topics(**unformatted)
lsi_topics_user_pooling = lsi_model_user_pooling.show_topics(**unformatted)
lsi_topics_hashtag_pooling = lsi_model_hashtag_pooling.show_topics(**unformatted)

## Calculate Coherence Values

In [None]:
# Extract top topic words.
# NOTE: every *_topics_* name is rebound to its reduced form below, so this cell
# must not be re-run on its own - re-run the topic extraction cells first.
def _words_only(topics):
    """Map [(topic id, [(word, value), ...]), ...] to [[word, ...], ...]."""
    return [[word for word, _value in entries] for _topic_id, entries in topics]

lsi_topics_no_pooling = _words_only(lsi_topics_no_pooling)
lsi_topics_user_pooling = _words_only(lsi_topics_user_pooling)
lsi_topics_hashtag_pooling = _words_only(lsi_topics_hashtag_pooling)

hdp_topics_no_pooling = _words_only(hdp_topics_no_pooling)
hdp_topics_user_pooling = _words_only(hdp_topics_user_pooling)
hdp_topics_hashtag_pooling = _words_only(hdp_topics_hashtag_pooling)

lda_topics_no_pooling = _words_only(lda_topics_no_pooling)
lda_topics_user_pooling = _words_only(lda_topics_user_pooling)
lda_topics_hashtag_pooling = _words_only(lda_topics_hashtag_pooling)

In [None]:
# Calculate coherences: one score per (model family, pooling strategy) pair
def _coherence_score(topics, texts, dictionary):
    """
    Return the coherence of ``topics`` measured on ``texts``.

    topics: list of topics, each a list of words
    texts: tokenized documents the topics are scored against
    dictionary: gensim dictionary belonging to the same pooling strategy
    """
    scorer = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, window_size=10)
    return scorer.get_coherence()

lsi_no_pooling_coherence = _coherence_score(lsi_topics_no_pooling, tokenized_documents_no_pooling, dictionary_no_pooling)
lsi_user_pooling_coherence = _coherence_score(lsi_topics_user_pooling, tokenized_documents_user_pooling, dictionary_user_pooling)
lsi_hashtag_pooling_coherence = _coherence_score(lsi_topics_hashtag_pooling, tokenized_documents_hashtag_pooling, dictionary_hashtag_pooling)

hdp_no_pooling_coherence = _coherence_score(hdp_topics_no_pooling, tokenized_documents_no_pooling, dictionary_no_pooling)
hdp_user_pooling_coherence = _coherence_score(hdp_topics_user_pooling, tokenized_documents_user_pooling, dictionary_user_pooling)
hdp_hashtag_pooling_coherence = _coherence_score(hdp_topics_hashtag_pooling, tokenized_documents_hashtag_pooling, dictionary_hashtag_pooling)

lda_no_pooling_coherence = _coherence_score(lda_topics_no_pooling, tokenized_documents_no_pooling, dictionary_no_pooling)
lda_user_pooling_coherence = _coherence_score(lda_topics_user_pooling, tokenized_documents_user_pooling, dictionary_user_pooling)
lda_hashtag_pooling_coherence = _coherence_score(lda_topics_hashtag_pooling, tokenized_documents_hashtag_pooling, dictionary_hashtag_pooling)

In [None]:
# Define function to plot a graph to compare coherence scores
def evaluate_bar_graph(coherences, indices, colors=None):
    """
    Function to plot a bar graph of coherence values with pyplot.

    coherences: list of coherence values, one per bar
    indices: Indices to be used to mark bars. Length of this and coherences should be equal.
    colors: optional bar colour(s) forwarded to ``plt.bar``. When omitted, the
        original nine-entry palette is used: three red, three green and three
        blue bars (one colour per model family in the LSI/HDP/LDA comparison).

    Raises AssertionError if coherences and indices differ in length.
    The figure is neither saved nor shown here; that is left to the caller.
    """
    assert len(coherences) == len(indices)
    if colors is None:
        # Backward-compatible default: three bars per model family
        colors = ["red"] * 3 + ["green"] * 3 + ["blue"] * 3
    positions = np.arange(len(coherences))
    plt.bar(positions, coherences, width=0.2, tick_label=indices, align="center", color=colors)
    plt.xlabel("Models")
    plt.ylabel("Coherence Value")

In [None]:
# Compare coherence scores.
# Label suffixes follow the pooling order: 1 = no pooling, 2 = user pooling, 3 = hashtag pooling
model_labels = ["LSI1", "LSI2", "LSI3",
                "HDP1", "HDP2", "HDP3",
                "LDA1", "LDA2", "LDA3"]
coherence_scores = [
    lsi_no_pooling_coherence, lsi_user_pooling_coherence, lsi_hashtag_pooling_coherence,
    hdp_no_pooling_coherence, hdp_user_pooling_coherence, hdp_hashtag_pooling_coherence,
    lda_no_pooling_coherence, lda_user_pooling_coherence, lda_hashtag_pooling_coherence,
]
evaluate_bar_graph(coherence_scores, model_labels)
plt.savefig("topic_model_comparison")
plt.show()

The HDP models have the best coherence scores, but on inspection their topics are far too granular. In fact, LDA achieves similar coherence scores when the number of topics is increased; the number of topics is deliberately restricted to a lower value to ensure human interpretability. In addition, the HDP models are very unstable on retraining. Among the LSI and LDA models, the hashtag pooling LDA model has by far the highest coherence score, which is in line with the results of human inspection.