## References

In [None]:
# gensim HdpModel API documentation:
# https://radimrehurek.com/gensim/models/hdpmodel.html
# gensim news-classification example notebook:
# https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html

## Prepare Notebook

In [6]:
# import packages
import logging
import os
import pickle

import pandas as pd
from gensim import corpora
from gensim.models import HdpModel
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

In [4]:
# log events: emit timestamped messages at INFO level and above
# (the gensim load/training progress shown in the cell outputs arrives via logging)
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

## Load Training Corpora and Dictionaries

In [7]:
def load_pooled_corpus(pooling):
    """Load the saved dictionary and vectorized corpus for one pooling scheme.

    Args:
        pooling: scheme name used in the file names, e.g. ``"no"`` resolves to
            ``tourism_no_pooling.dict`` and ``tourism_no_pooling.mm``.

    Returns:
        ``(dictionary, corpus)`` when both files exist, otherwise
        ``(None, None)`` after printing a hint to run preprocessing.
    """
    dict_path = "tourism_{}_pooling.dict".format(pooling)
    corpus_path = "tourism_{}_pooling.mm".format(pooling)

    # Both artifacts are needed: checking only the .dict file would let the
    # MmCorpus call fail on a missing .mm file.
    if not (os.path.exists(dict_path) and os.path.exists(corpus_path)):
        print("Please run preprocessing script first!")
        return None, None

    dictionary = corpora.Dictionary.load(dict_path)
    corpus = corpora.MmCorpus(corpus_path)
    print("Vectorized {} pooling corpus loaded!".format(pooling))
    return dictionary, corpus


# The names are always bound (to None when the files are missing) so a stale
# value from an earlier kernel run can never be picked up by later cells.

# load no pooling corpus
dictionary_no_pooling, corpus_no_pooling = load_pooled_corpus("no")

# load user pooling corpus
dictionary_user_pooling, corpus_user_pooling = load_pooled_corpus("user")

# load hashtag pooling corpus
dictionary_hashtag_pooling, corpus_hashtag_pooling = load_pooled_corpus("hashtag")

2018-09-23 15:07:55,653 : INFO : loading Dictionary object from tourism_no_pooling.dict
2018-09-23 15:07:55,660 : INFO : loaded tourism_no_pooling.dict
2018-09-23 15:07:55,665 : INFO : loaded corpus index from tourism_no_pooling.mm.index
2018-09-23 15:07:55,666 : INFO : initializing cython corpus reader from tourism_no_pooling.mm
2018-09-23 15:07:55,670 : INFO : accepted corpus with 7633 documents, 5035 features, 41550 non-zero entries
2018-09-23 15:07:55,673 : INFO : loading Dictionary object from tourism_user_pooling.dict
2018-09-23 15:07:55,678 : INFO : loaded tourism_user_pooling.dict
2018-09-23 15:07:55,682 : INFO : loaded corpus index from tourism_user_pooling.mm.index
2018-09-23 15:07:55,683 : INFO : initializing cython corpus reader from tourism_user_pooling.mm
2018-09-23 15:07:55,688 : INFO : accepted corpus with 4424 documents, 4185 features, 33668 non-zero entries
2018-09-23 15:07:55,692 : INFO : loading Dictionary object from tourism_hashtag_pooling.dict
2018-09-23 15:07:55

Vectorized no pooling corpus loaded!
Vectorized user pooling corpus loaded!
Vectorized hashtag pooling corpus loaded!


In [8]:
# Train an HDP topic model on the un-pooled corpus. HDP inference is
# stochastic, so a fixed random_state keeps the topics and the coherence
# score reproducible across kernel restarts.
model = HdpModel(corpus_no_pooling, dictionary_no_pooling, random_state=42)

# u_mass coherence only needs the bag-of-words corpus (no tokenized texts).
cm = CoherenceModel(model=model, corpus=corpus_no_pooling, coherence='u_mass')
coherence = cm.get_coherence()  # get coherence value
coherence  # show the score as the cell output

2018-09-23 15:12:36,665 : INFO : (0, "0.002*#hairporn + 0.002*school + 0.002*lager + 0.002*i'm + 0.001*collectivesound + 0.001*hahahaha + 0.001*becomes + 0.001*trying + 0.001*zoo + 0.001*#haircut")
2018-09-23 15:12:36,672 : INFO : (1, '0.002*#parcdellaberint + 0.001*woke + 0.001*language + 0.001*esta + 0.001*current + 0.001*song + 0.001*#catalanreferendum + 0.001*maestro + 0.001*bro + 0.001*major')
2018-09-23 15:12:36,682 : INFO : (2, '0.001*colores + 0.001*templo + 0.001*bucket + 0.001*cerca + 0.001*verdi + 0.001*#basilica + 0.001*son + 0.001*#vsco + 0.001*devotion + 0.001*//')
2018-09-23 15:12:36,693 : INFO : (3, '0.002*#graphicdesign + 0.001*foot + 0.001*clínic + 0.001*big + 0.001*vía + 0.001*true + 0.001*worth + 0.001*#abirradero + 0.001*#traveltheworld + 0.001*#opiumbarcelona')
2018-09-23 15:12:36,705 : INFO : (4, '0.002*#gracia + 0.002*torrades + 0.001*checking + 0.001*celebration + 0.001*catching + 0.001*#davidbowie + 0.001*#instalike + 0.001*cranny + 0.001*emerson_soc + 0.001*l

In [10]:
# Train an HDP topic model on the hashtag-pooled corpus. HDP inference is
# stochastic, so a fixed random_state keeps the topics and the coherence
# score reproducible across kernel restarts.
model = HdpModel(corpus_hashtag_pooling, dictionary_hashtag_pooling, random_state=42)

# u_mass coherence only needs the bag-of-words corpus (no tokenized texts).
cm = CoherenceModel(model=model, corpus=corpus_hashtag_pooling, coherence='u_mass')
coherence = cm.get_coherence()  # get coherence value
coherence  # show the score as the cell output

TypeError: __init__() got an unexpected keyword argument 'passes'

In [None]:
# Inspect the topics of the most recently trained HDP model. The trained model
# is bound to `model`; no `hdpmodel` name is defined anywhere in this notebook.
model.show_topics()

In [None]:
# Train an HDP topic model on the user-pooled corpus. HDP inference is
# stochastic, so a fixed random_state keeps the topics and the coherence
# score reproducible across kernel restarts.
model = HdpModel(corpus_user_pooling, dictionary_user_pooling, random_state=42)

# Coherence must be measured on the corpus the model was trained on
# (user pooling), not on the hashtag-pooled corpus.
cm = CoherenceModel(model=model, corpus=corpus_user_pooling, coherence='u_mass')
coherence = cm.get_coherence()  # get coherence value
coherence  # show the score as the cell output

## Load Test Documents

In [None]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Test documents, one pickle per pooling scheme.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# our own preprocessing step.
district_pooling_docs = _read_pickle('tokenized_documents_district_pooling.p')
month_pooling_docs = _read_pickle('tokenized_documents_month_pooling.p')
district_per_month_pooling_docs = _read_pickle('tokenized_documents_district_per_month_pooling.p')

In [None]:
# define function to run hdp on test documents
def run_hdp_on_test_doc(docs, id2word=None, model=None):
    """Infer a topic vector for every tokenized document in ``docs``.

    Args:
        docs: iterable of tokenized documents (each one a sequence of tokens).
        id2word: object providing ``doc2bow(tokens)``, e.g. one of the loaded
            gensim dictionaries. When omitted, the notebook-level name
            ``dictionary`` is used.
        model: object that maps a bag-of-words to a topic vector via
            ``model[bow]``, e.g. a trained ``HdpModel``. When omitted, the
            notebook-level name ``hdp_model`` is used.

    Returns:
        list: one entry per input document, in input order; each entry is
        whatever ``model[bow]`` returns for that document.

    Raises:
        NameError: if an argument is omitted and the matching notebook-level
            fallback name has not been defined.
    """
    # Prefer explicit arguments; the global fallbacks only exist so that
    # calls written as run_hdp_on_test_doc(docs) keep their old meaning.
    if id2word is None:
        id2word = dictionary
    if model is None:
        model = hdp_model

    return [model[id2word.doc2bow(tokens)] for tokens in docs]

In [None]:
# Collect the top words of up to 20 topics from the most recently trained HDP
# model. The trained model is bound to `model`; no `hdp` name is defined
# anywhere in this notebook.
topic_info = model.print_topics(num_topics=20, num_words=10)

In [None]:
# Train a 10-topic LSI model for comparison with HDP.
# NOTE(review): the notebook defines no bare `corpus` / `dictionary`; the
# un-pooled corpus and its dictionary are used here as the baseline -- swap in
# the user- or hashtag-pooled pair if that is what was intended.
lsimodel = LsiModel(corpus=corpus_no_pooling, num_topics=10, id2word=dictionary_no_pooling)

In [None]:
# Display topics from the LSI model as the cell output.
lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics