In [22]:
from time import time
from os import path
from gensim import corpora, models
from gensim.models.phrases import Phrases
from corputil import FileCorpus, MultiFileCorpus
from corputil.modifiers import to_words_sl, sentence_to_words
from pprint import pprint
from collections import defaultdict

In [23]:
# Corpus slice to analyse; it selects the input file data/Corpus_<label>.txt.
label = 'KW45'

# LDA hyper-parameters, read as globals by train_lda().
num_topics = 15
# Earlier setting, kept for reference: chunksize=500, iterations=500, passes=10
chunksize = 200
iterations = 500
passes = 20

file = path.join('data', 'Corpus_{}.txt'.format(label))

# Corpus over that file; to_words_sl is handed in as the modifier
# (presumably it tokenises each entry into words - confirm in corputil).
train = FileCorpus(file, modifier=to_words_sl)

def train_lda(train, random_state=None):
    """Fit an LDA topic model on a corpus of documents.

    Every document from ``train`` is run through the ``bigram`` and then the
    ``trigram`` phrase model, a gensim dictionary is built from the result and
    pruned with ``filter_extremes()`` (gensim's default thresholds), the
    documents are converted to bag-of-words vectors, and an ``LdaMulticore``
    model is trained on them with two workers in batch mode.

    NOTE: besides its arguments, this function reads the notebook-level
    globals ``bigram``, ``trigram``, ``num_topics``, ``chunksize``, ``passes``
    and ``iterations``; all of them must be defined before it is called.

    Parameters
    ----------
    train : iterable
        Iterable of documents; each one is indexed into ``bigram[...]``
        (presumably a list of word tokens - confirm against the corpus reader).
    random_state : optional
        When not ``None`` it is forwarded to ``LdaMulticore`` as
        ``random_state`` so a run can be seeded. With the default ``None``
        the keyword is not passed at all, so the call is exactly what it was
        before this parameter existed. Multi-worker training may still not be
        bit-for-bit repeatable - TODO confirm for the gensim version in use.

    Returns
    -------
    tuple
        ``(dictionary, mmCorpus, lda)``: the pruned dictionary, the list of
        bag-of-words documents, and the trained model.
    """
    # Materialise the phrase-merged documents once; they are iterated twice
    # below (dictionary construction and bag-of-words conversion).
    corpus = [trigram[bigram[doc]] for doc in train]

    dictionary = corpora.Dictionary(corpus)
    dictionary.filter_extremes()
    dictionary.compactify()

    mmCorpus = [dictionary.doc2bow(doc) for doc in corpus]

    # Only pass a seed when the caller asked for one, so the default call
    # stays identical to the unseeded original.
    seed_kwargs = {}
    if random_state is not None:
        seed_kwargs['random_state'] = random_state

    lda = models.LdaMulticore(mmCorpus, id2word=dictionary, chunksize=chunksize, batch=True,
                              num_topics=num_topics, workers=2, passes=passes, iterations=iterations,
                              **seed_kwargs)
    return (dictionary, mmCorpus, lda)

In [24]:
# Load the previously saved phrase models (bigram first, then trigram);
# train_lda() looks both of them up as globals.
bigram, trigram = (
    Phrases.load(path.join('models', model_file))
    for model_file in ('Bigram.phrase', 'Trigram.phrase')
)

# Build the dictionary and bag-of-words corpus, then train the LDA model.
dictionary, corpus, model = train_lda(train)

In [25]:
# Pretty-print each of the num_topics topics as returned by print_topics().
for topic in model.print_topics(num_topics):
    pprint(topic)

# Report the model's bound() score on the training corpus
# (presumably the variational lower bound - confirm in the gensim docs).
print(model.bound(corpus))

(0,
 '0.002*sieg + 0.002*spiel + 0.002*pegida + 0.002*polizei + 0.002*platz + '
 '0.001*saison + 0.001*team + 0.001*anhänger + 0.001*tor + 0.001*forscher')
(1,
 '0.005*syrien + 0.004*polizei + 0.003*is + 0.003*kunden + 0.002*irak + '
 '0.002*eu + 0.002*abgeordneten + 0.002*entwurf + 0.002*sterbehilfe + '
 '0.002*luftangriffe')
(2,
 '0.018*flüchtlinge + 0.005*flüchtlingen + 0.004*berlin + 0.004*zahl + '
 '0.003*grenze + 0.003*türkei + 0.003*eu + 0.003*europa + 0.003*sprecher + '
 '0.003*österreich')
(3,
 '0.007*absturz + 0.005*ägypten + 0.005*bord + 0.005*bombe + 0.004*sinai + '
 '0.004*russland + 0.004*flüge + 0.004*is + 0.003*maschine + '
 '0.003*großbritannien')
(4,
 '0.007*unternehmen + 0.003*google + 0.002*frauen + 0.002*us + 0.002*bond + '
 '0.002*kunden + 0.002*internet + 0.002*film + 0.002*gerät + 0.002*projekt')
(5,
 '0.006*dfb + 0.005*niersbach + 0.003*staatsanwaltschaft + '
 '0.002*ermittlungen + 0.002*zwanziger + 0.002*unternehmen + 0.002*bachmann '
 '+ 0.002*schmidt + 0.002