## References

In [1]:
# https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html
# https://medium.com/square-corner-blog/topic-modeling-optimizing-for-human-interpretability-48a81f6ce0ed

## Prepare Notebook

In [2]:
# import packages
import logging
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from gensim import corpora, models
from gensim.models import HdpModel, LsiModel, CoherenceModel



In [3]:
# route INFO-level log records (timestamp : level : message) to the notebook output
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

## Load Training Corpora and Dictionaries

In [4]:
# Each section below restores one pooling variant: the gensim Dictionary (.dict)
# and the Matrix Market corpus (.mm). Only the presence of the .dict file is
# checked; when it is missing a hint is printed and the two names for that
# variant are left undefined.

# load no pooling corpus
if not os.path.exists("tourism_no_pooling.dict"):
    print("Please run preprocessing script first!")
else:
    dictionary_no_pooling = corpora.Dictionary.load('tourism_no_pooling.dict')
    corpus_no_pooling = corpora.MmCorpus('tourism_no_pooling.mm')
    print("Vectorized no pooling corpus loaded!")

# load user pooling corpus
if not os.path.exists("tourism_user_pooling.dict"):
    print("Please run preprocessing script first!")
else:
    dictionary_user_pooling = corpora.Dictionary.load('tourism_user_pooling.dict')
    corpus_user_pooling = corpora.MmCorpus('tourism_user_pooling.mm')
    print("Vectorized user pooling corpus loaded!")

# load hashtag pooling corpus
if not os.path.exists("tourism_hashtag_pooling.dict"):
    print("Please run preprocessing script first!")
else:
    dictionary_hashtag_pooling = corpora.Dictionary.load('tourism_hashtag_pooling.dict')
    corpus_hashtag_pooling = corpora.MmCorpus('tourism_hashtag_pooling.mm')
    print("Vectorized hashtag pooling corpus loaded!")

2018-10-05 05:42:33,319 : INFO : loading Dictionary object from tourism_no_pooling.dict
2018-10-05 05:42:33,328 : INFO : loaded tourism_no_pooling.dict
2018-10-05 05:42:33,333 : INFO : loaded corpus index from tourism_no_pooling.mm.index
2018-10-05 05:42:33,335 : INFO : initializing cython corpus reader from tourism_no_pooling.mm
2018-10-05 05:42:33,339 : INFO : accepted corpus with 7633 documents, 4790 features, 37016 non-zero entries
2018-10-05 05:42:33,342 : INFO : loading Dictionary object from tourism_user_pooling.dict
2018-10-05 05:42:33,348 : INFO : loaded tourism_user_pooling.dict
2018-10-05 05:42:33,352 : INFO : loaded corpus index from tourism_user_pooling.mm.index
2018-10-05 05:42:33,353 : INFO : initializing cython corpus reader from tourism_user_pooling.mm
2018-10-05 05:42:33,357 : INFO : accepted corpus with 4424 documents, 3990 features, 30119 non-zero entries
2018-10-05 05:42:33,360 : INFO : loading Dictionary object from tourism_hashtag_pooling.dict
2018-10-05 05:42:33

Vectorized no pooling corpus loaded!
Vectorized user pooling corpus loaded!
Vectorized hashtag pooling corpus loaded!


## Load Documents

In [5]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object.

    NOTE: ``pickle.load`` can execute arbitrary code, so only use this on
    files you produced yourself.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# load no pooling documents
tokenized_documents_no_pooling = _read_pickle('tokenized_documents_no_pooling.p')

# load user pooling documents
tokenized_documents_user_pooling = _read_pickle('tokenized_documents_user_pooling.p')

# load hashtag pooling documents
tokenized_documents_hashtag_pooling = _read_pickle('tokenized_documents_hashtag_pooling.p')

## Load LDA Models (Trained in TopicModeling#2 Script)

In [6]:
# restore the previously saved LDA models, one per pooling method
load_lda_model = models.LdaModel.load

lda_model_no_pooling = load_lda_model('lda_model_no_pooling.model')  # 6 topics
lda_model_user_pooling = load_lda_model('lda_model_user_pooling.model')  # 7 topics
lda_model_hashtag_pooling = load_lda_model('lda_model_hashtag_pooling.model')  # 7 topics

2018-10-05 05:42:37,124 : INFO : loading LdaModel object from lda_model_no_pooling.model
2018-10-05 05:42:37,130 : INFO : loading expElogbeta from lda_model_no_pooling.model.expElogbeta.npy with mmap=None
2018-10-05 05:42:37,135 : INFO : setting ignored attribute dispatcher to None
2018-10-05 05:42:37,137 : INFO : setting ignored attribute id2word to None
2018-10-05 05:42:37,139 : INFO : setting ignored attribute state to None
2018-10-05 05:42:37,141 : INFO : loaded lda_model_no_pooling.model
2018-10-05 05:42:37,144 : INFO : loading LdaState object from lda_model_no_pooling.model.state
2018-10-05 05:42:37,150 : INFO : loaded lda_model_no_pooling.model.state
2018-10-05 05:42:37,158 : INFO : loading LdaModel object from lda_model_user_pooling.model
2018-10-05 05:42:37,161 : INFO : loading expElogbeta from lda_model_user_pooling.model.expElogbeta.npy with mmap=None
2018-10-05 05:42:37,165 : INFO : setting ignored attribute dispatcher to None
2018-10-05 05:42:37,166 : INFO : setting ignore

In [7]:
# extract topics with word probabilities
# (formatted=False returns structured (topic id, [(word, probability), ...]) pairs)
lda_topics_no_pooling, lda_topics_user_pooling, lda_topics_hashtag_pooling = (
    lda_model.show_topics(formatted=False)
    for lda_model in (lda_model_no_pooling, lda_model_user_pooling, lda_model_hashtag_pooling)
)

## Train HDP Models

In [8]:
# train HDP models with different pooling methods (similar to LDA in TopicModeling#2 Script)
# HDP training is randomized; without a seed every retraining yields different
# topics (and different coherence scores). Fixing random_state makes a fresh
# Restart & Run All reproduce the same models.
HDP_RANDOM_STATE = 42

hdp_model_no_pooling = HdpModel(
    corpus_no_pooling, dictionary_no_pooling, random_state=HDP_RANDOM_STATE
)
hdp_model_user_pooling = HdpModel(
    corpus_user_pooling, dictionary_user_pooling, random_state=HDP_RANDOM_STATE
)
hdp_model_hashtag_pooling = HdpModel(
    corpus_hashtag_pooling, dictionary_hashtag_pooling, random_state=HDP_RANDOM_STATE
)

2018-10-05 05:43:04,192 : INFO : (0, '0.002*#nervo + 0.002*#w + 0.001*turn + 0.001*familiabarcelona + 0.001*expected + 0.001*ignore + 0.001*travel + 0.001*oh + 0.001*#amigas + 0.001*transformation')
2018-10-05 05:43:04,198 : INFO : (1, '0.002*#dogsofinstagram + 0.002*#blackandwhitephotography + 0.001*#cbf + 0.001*#instabcn + 0.001*#stage + 0.001*#castanyada + 0.001*cuberta + 0.001*audio + 0.001*detail + 0.001*#lapedrera#lapedrera#gaudibarcelona#modernisme')
2018-10-05 05:43:04,205 : INFO : (2, '0.002*lifetime + 0.002*aged + 0.002*#esquancorroquehiveigclar + 0.002*mass + 0.002*someone + 0.001*self + 0.001*#contigomismo + 0.001*stuff + 0.001*bisbe + 0.001*humble')
2018-10-05 05:43:04,213 : INFO : (3, '0.002*#sculpture + 0.002*#landscape + 0.002*#señorthiemedoesspain + 0.002*wonder + 0.002*endangered + 0.001*miguel + 0.001*style + 0.001*unicorn + 0.001*#pride + 0.001*seat')
2018-10-05 05:43:04,220 : INFO : (4, '0.002*#loveisintheair + 0.002*#urbanphoto + 0.002*emotion + 0.001*right + 0.00

2018-10-05 05:43:19,883 : INFO : (12, '0.002*learned + 0.002*today + 0.002*#girl + 0.002*ph + 0.002*brewing + 0.002*#shooting + 0.002*sport + 0.002*malt + 0.002*lime + 0.002*lightforms')
2018-10-05 05:43:19,891 : INFO : (13, '0.002*authentic + 0.002*#g + 0.002*familia + 0.002*cruïlla + 0.002*sagrada + 0.002*bigger + 0.002*#run + 0.002*barri + 0.002*monster + 0.002*#sketchbook')
2018-10-05 05:43:19,898 : INFO : (14, "0.003*bonjour + 0.002*we've + 0.002*chip + 0.002*rooftop + 0.002*pay + 0.002*#friendshipgoals + 0.002*sagrada + 0.002*nit + 0.002*#painting + 0.002*movimiento")
2018-10-05 05:43:19,904 : INFO : (15, '0.003*drop + 0.002*#gato + 0.002*#top + 0.002*sunset + 0.002*#ootd + 0.002*proof + 0.002*shot + 0.002*heading + 0.002*word + 0.002*bio')
2018-10-05 05:43:19,914 : INFO : (16, '0.002*effective + 0.002*universal + 0.002*#catalunya + 0.002*punto + 0.002*quack + 0.002*need + 0.002*sunlight + 0.002*lee + 0.002*boca + 0.001*afraid')
2018-10-05 05:43:19,927 : INFO : (17, '0.003*depart

In [9]:
# extract topics with word probabilities
# NOTE(review): HdpModel.show_topics appears to default to 20 topics x 20 words,
# whereas the LDA/LSI show_topics calls default to 10 words per topic — confirm
# that the coherence comparison is like-for-like.
hdp_topics_no_pooling, hdp_topics_user_pooling, hdp_topics_hashtag_pooling = (
    hdp_model.show_topics(formatted=False)
    for hdp_model in (hdp_model_no_pooling, hdp_model_user_pooling, hdp_model_hashtag_pooling)
)

## Train LSI Models

In [10]:
# train LSI models with similar configurations as LDA models
# (6 / 7 / 7 topics for the no / user / hashtag pooling corpora)
lsi_model_no_pooling = LsiModel(
    corpus=corpus_no_pooling,
    id2word=dictionary_no_pooling,
    num_topics=6,
)
lsi_model_user_pooling = LsiModel(
    corpus=corpus_user_pooling,
    id2word=dictionary_user_pooling,
    num_topics=7,
)
lsi_model_hashtag_pooling = LsiModel(
    corpus=corpus_hashtag_pooling,
    id2word=dictionary_hashtag_pooling,
    num_topics=7,
)

2018-10-05 05:43:48,008 : INFO : using serial LSI version on this node
2018-10-05 05:43:48,013 : INFO : updating model with new documents
2018-10-05 05:43:48,063 : INFO : preparing a new chunk of documents
2018-10-05 05:43:48,095 : INFO : using 100 extra samples and 2 power iterations
2018-10-05 05:43:48,097 : INFO : 1st phase: constructing (4790, 106) action matrix
2018-10-05 05:43:48,154 : INFO : orthonormalizing (4790, 106) action matrix
2018-10-05 05:43:48,311 : INFO : 2nd phase: running dense svd on (106, 7633) matrix
2018-10-05 05:43:48,412 : INFO : computing the final decomposition
2018-10-05 05:43:48,413 : INFO : keeping 6 factors (discarding 64.750% of energy spectrum)
2018-10-05 05:43:48,418 : INFO : processed documents up to #7633
2018-10-05 05:43:48,421 : INFO : topic #0(48.355): 0.783*"sagrada" + 0.533*"familia" + 0.213*"basílica" + 0.189*"família" + 0.103*"#sagradafamilia" + 0.033*"#gaudi" + 0.032*"beautiful" + 0.027*"gaudi" + 0.024*"inside" + 0.022*"basilica"
2018-10-05 

In [11]:
# extract topics with word probabilities
# (formatted=False returns structured (topic id, [(word, weight), ...]) pairs)
lsi_topics_no_pooling, lsi_topics_user_pooling, lsi_topics_hashtag_pooling = (
    lsi_model.show_topics(formatted=False)
    for lsi_model in (lsi_model_no_pooling, lsi_model_user_pooling, lsi_model_hashtag_pooling)
)

## Calculate Coherence Values

In [None]:
# extract top topic words
def _topic_words(model):
    """Return one list of top words per topic shown by ``model.show_topics``.

    The word lists are derived from the model itself rather than by rebinding
    each ``*_topics_*`` variable in terms of its own previous value. The
    self-referential form only works once: on a second run the variables
    already hold plain word lists and the ``(topic id, [(word, prob), ...])``
    unpacking fails. Reading from the model keeps this cell re-runnable and
    produces the same lists on the first run.
    """
    return [
        [word for word, _prob in topic]
        for _topic_id, topic in model.show_topics(formatted=False)
    ]


lsi_topics_no_pooling = _topic_words(lsi_model_no_pooling)
lsi_topics_user_pooling = _topic_words(lsi_model_user_pooling)
lsi_topics_hashtag_pooling = _topic_words(lsi_model_hashtag_pooling)

hdp_topics_no_pooling = _topic_words(hdp_model_no_pooling)
hdp_topics_user_pooling = _topic_words(hdp_model_user_pooling)
hdp_topics_hashtag_pooling = _topic_words(hdp_model_hashtag_pooling)

lda_topics_no_pooling = _topic_words(lda_model_no_pooling)
lda_topics_user_pooling = _topic_words(lda_model_user_pooling)
lda_topics_hashtag_pooling = _topic_words(lda_model_hashtag_pooling)

In [None]:
# calculate coherences
def _coherence_of(topics, texts, dictionary):
    """Score ``topics`` with a CoherenceModel built on ``texts`` and ``dictionary``.

    Every model is scored with the same settings (sliding window of 10 tokens,
    library defaults otherwise) so that the resulting values are comparable.
    """
    scorer = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, window_size=10)
    return scorer.get_coherence()


lsi_no_pooling_coherence = _coherence_of(
    lsi_topics_no_pooling, tokenized_documents_no_pooling, dictionary_no_pooling)
lsi_user_pooling_coherence = _coherence_of(
    lsi_topics_user_pooling, tokenized_documents_user_pooling, dictionary_user_pooling)
lsi_hashtag_pooling_coherence = _coherence_of(
    lsi_topics_hashtag_pooling, tokenized_documents_hashtag_pooling, dictionary_hashtag_pooling)

hdp_no_pooling_coherence = _coherence_of(
    hdp_topics_no_pooling, tokenized_documents_no_pooling, dictionary_no_pooling)
hdp_user_pooling_coherence = _coherence_of(
    hdp_topics_user_pooling, tokenized_documents_user_pooling, dictionary_user_pooling)
hdp_hashtag_pooling_coherence = _coherence_of(
    hdp_topics_hashtag_pooling, tokenized_documents_hashtag_pooling, dictionary_hashtag_pooling)

lda_no_pooling_coherence = _coherence_of(
    lda_topics_no_pooling, tokenized_documents_no_pooling, dictionary_no_pooling)
lda_user_pooling_coherence = _coherence_of(
    lda_topics_user_pooling, tokenized_documents_user_pooling, dictionary_user_pooling)
lda_hashtag_pooling_coherence = _coherence_of(
    lda_topics_hashtag_pooling, tokenized_documents_hashtag_pooling, dictionary_hashtag_pooling)

2018-10-05 05:43:49,601 : INFO : using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows
2018-10-05 05:43:54,200 : INFO : 1 batches submitted to accumulate stats from 64 documents (146 virtual)
2018-10-05 05:43:54,203 : INFO : 2 batches submitted to accumulate stats from 128 documents (236 virtual)
2018-10-05 05:43:54,207 : INFO : 3 batches submitted to accumulate stats from 192 documents (330 virtual)
2018-10-05 05:43:54,212 : INFO : 4 batches submitted to accumulate stats from 256 documents (474 virtual)
2018-10-05 05:43:54,216 : INFO : 5 batches submitted to accumulate stats from 320 documents (604 virtual)
2018-10-05 05:43:54,219 : INFO : 6 batches submitted to accumulate stats from 384 documents (752 virtual)
2018-10-05 05:43:54,222 : INFO : 7 batches submitted to accumulate stats from 448 documents (821 virtual)
2018-10-05 05:43:54,227 : INFO : 8 batches submitted to accumulate stats from 512 documents (952 virtual)
2018-

2018-10-05 05:43:59,978 : INFO : 20 batches submitted to accumulate stats from 1280 documents (26019 virtual)
2018-10-05 05:43:59,984 : INFO : 21 batches submitted to accumulate stats from 1344 documents (26126 virtual)
2018-10-05 05:43:59,998 : INFO : 22 batches submitted to accumulate stats from 1408 documents (26272 virtual)
2018-10-05 05:44:00,004 : INFO : 23 batches submitted to accumulate stats from 1472 documents (26335 virtual)
2018-10-05 05:44:00,012 : INFO : 24 batches submitted to accumulate stats from 1536 documents (26382 virtual)
2018-10-05 05:44:00,021 : INFO : 25 batches submitted to accumulate stats from 1600 documents (26443 virtual)
2018-10-05 05:44:00,044 : INFO : 26 batches submitted to accumulate stats from 1664 documents (26528 virtual)
2018-10-05 05:44:00,055 : INFO : 27 batches submitted to accumulate stats from 1728 documents (26622 virtual)
2018-10-05 05:44:00,060 : INFO : 28 batches submitted to accumulate stats from 1792 documents (26701 virtual)
2018-10-05

2018-10-05 05:44:10,742 : INFO : 21 batches submitted to accumulate stats from 1344 documents (2660 virtual)
2018-10-05 05:44:10,746 : INFO : 22 batches submitted to accumulate stats from 1408 documents (2738 virtual)
2018-10-05 05:44:10,750 : INFO : 23 batches submitted to accumulate stats from 1472 documents (2758 virtual)
2018-10-05 05:44:10,754 : INFO : 24 batches submitted to accumulate stats from 1536 documents (2889 virtual)
2018-10-05 05:44:10,758 : INFO : 25 batches submitted to accumulate stats from 1600 documents (2911 virtual)
2018-10-05 05:44:10,762 : INFO : 26 batches submitted to accumulate stats from 1664 documents (2932 virtual)
2018-10-05 05:44:10,767 : INFO : 27 batches submitted to accumulate stats from 1728 documents (3041 virtual)
2018-10-05 05:44:10,771 : INFO : 28 batches submitted to accumulate stats from 1792 documents (3171 virtual)
2018-10-05 05:44:10,775 : INFO : 29 batches submitted to accumulate stats from 1856 documents (3258 virtual)
2018-10-05 05:44:10

2018-10-05 05:44:17,851 : INFO : 41 batches submitted to accumulate stats from 2624 documents (33271 virtual)
2018-10-05 05:44:17,853 : INFO : 42 batches submitted to accumulate stats from 2688 documents (33432 virtual)
2018-10-05 05:44:17,866 : INFO : 43 batches submitted to accumulate stats from 2752 documents (33458 virtual)
2018-10-05 05:44:17,924 : INFO : 3 accumulators retrieved from output queue
2018-10-05 05:44:17,980 : INFO : accumulated word occurrence stats for 35228 virtual documents
2018-10-05 05:44:19,331 : INFO : using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows
2018-10-05 05:44:24,159 : INFO : 1 batches submitted to accumulate stats from 64 documents (2029 virtual)
2018-10-05 05:44:24,161 : INFO : 2 batches submitted to accumulate stats from 128 documents (2609 virtual)
2018-10-05 05:44:24,166 : INFO : 3 batches submitted to accumulate stats from 192 documents (3712 virtual)
2018-10-05 05:44:24,170 : INFO 

In [None]:
# define function to plot a graph to compare coherence scores
def evaluate_bar_graph(coherences, indices, colors=None):
    """
    Plot coherence values as a bar graph using the pyplot interface.

    coherences: list of coherence values, one bar per value
    indices: Indices to be used to mark bars. Length of this and coherences should be equal.
    colors: optional list of bar colors handed to ``plt.bar``. When omitted, the
        notebook's original palette is used: three red, three green and three
        blue bars (three model families with three pooling variants each).
        Pass an explicit list when plotting a different number of bars.

    Raises AssertionError if ``coherences`` and ``indices`` differ in length.
    The function only draws; saving/showing the figure is left to the caller.
    """
    assert len(coherences) == len(indices)
    if colors is None:
        # default palette groups the bars in threes
        colors = ['red'] * 3 + ['green'] * 3 + ['blue'] * 3
    x = np.arange(len(coherences))
    plt.bar(x, coherences, width=0.2, tick_label=indices, align='center', color=colors)
    plt.xlabel('Models')
    plt.ylabel('Coherence Value')

In [None]:
# compare coherence scores
# bars are ordered LSI, HDP, LDA; within each family: no, user, hashtag pooling
model_labels = ['LSI1', 'LSI2', 'LSI3', 'HDP1', 'HDP2', 'HDP3', 'LDA1', 'LDA2', 'LDA3']
coherence_scores = [
    lsi_no_pooling_coherence,
    lsi_user_pooling_coherence,
    lsi_hashtag_pooling_coherence,
    hdp_no_pooling_coherence,
    hdp_user_pooling_coherence,
    hdp_hashtag_pooling_coherence,
    lda_no_pooling_coherence,
    lda_user_pooling_coherence,
    lda_hashtag_pooling_coherence,
]
evaluate_bar_graph(coherence_scores, model_labels)
plt.savefig("topic_model_comparison")
plt.show()

#### The HDP models have the best coherence scores, but on inspection their topics are far too granular. In fact, LDA achieves similar coherence scores if the number of topics is increased; to ensure human interpretability, however, the number of topics is restricted to a lower number. In addition, the HDP models are very unstable on retraining. Among the LSI and LDA models, the hashtag pooling LDA model has by far the highest coherence score, which is in line with the result of human inspection.