In [None]:
from time import time
from os import path
from gensim import corpora, models
from gensim.models.phrases import Phrases
from html2corpus import HTML2Corpus
from corputil import FileCorpus, MultiFileCorpus
from corputil.modifiers import to_words_sl, sentence_to_words
from pprint import pprint
from collections import defaultdict

In [None]:
# Input corpus for this run; the path is relative to the working directory.
file = path.join('data', 'Corpus_KW45.txt')

# Wall-clock reference point; the training cell computes `elapsed` against it.
start = time()

# Two readers over the same file that differ only in the modifier they apply:
# sentence_to_words -> feature_corpus, to_words_sl -> train_corpus.
# NOTE(review): presumably sentence-level vs. document-level token lists —
# confirm against corputil.modifiers.
feature_corpus, train_corpus = (
    FileCorpus(file, modifier=modifier)
    for modifier in (sentence_to_words, to_words_sl)
)

In [None]:
# Seed for the LDA random number generator so that repeated runs start from
# the same state instead of producing different topics every time.
# NOTE(review): with several workers the result may still vary slightly
# between runs — confirm against the gensim documentation.
LDA_SEED = 42

# Fit the phrase model on feature_corpus, then pass every training document
# through it.
bigram = Phrases(feature_corpus)
corpus = [bigram[doc] for doc in train_corpus]

# Build the vocabulary from the transformed documents; filter_extremes() is
# called with the library defaults.
dictionary = corpora.Dictionary(corpus)
dictionary.filter_extremes()
dictionary.compactify()

# Bag-of-words representation of every document, as consumed by the LDA model.
mmCorpus = [dictionary.doc2bow(doc) for doc in corpus]
lda = models.LdaMulticore(mmCorpus, id2word=dictionary, chunksize=500, batch=True,
                          num_topics=40, workers=2, passes=10, iterations=500,
                          random_state=LDA_SEED)

# `start` was taken when the corpus readers were created, so this is the time
# elapsed since that point, not only the duration of this cell.
elapsed = time() - start
print(elapsed)

In [None]:
# Parse the formatted topic strings of the model into (weight, term) pairs.
# num_topics=-1 requests every topic: the default of print_topics() returns
# only a subset (20 in gensim), which would leave half of the 40 trained
# topics out of the totals computed below.
clean_topics = []
for topic_id, topics in lda.print_topics(num_topics=-1):
    topic_list = []
    # Entries look like `weight*term` and are joined with ' + '. Splitting on
    # the full separator and only on the first '*' keeps terms that themselves
    # contain '+' or '*' intact.
    for entry in topics.split(' + '):
        weight, sep, term = entry.strip().partition('*')
        if not sep:
            # Not a weight*term entry (e.g. an empty topic string); skip it.
            continue
        # Newer gensim versions wrap the term in double quotes; drop them so
        # the aggregated keys are the bare terms.
        topic_list.append((weight, term.strip().strip('"')))
    clean_topics.append(topic_list)

# Sum the weight of every term over all topics and rank the terms by total.
flat = [pair for inner in clean_topics for pair in inner]
trends = defaultdict(float)
for p, trend in flat:
    trends[trend] += float(p)
top_trends = sorted(trends.items(), key=lambda item: item[1], reverse=True)
pprint(top_trends)