In [1]:
from os import path
import pandas as pd
from pprint import pprint
from corputil import ListCorpus
from corputil.utils import load_stopwords
import gensim.matutils as matutils
from gensim.models import LdaMulticore
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary, MmCorpus

# Stop-word list read from data/german.txt; it is handed to `doc_token`
# wherever documents are tokenized.
stopwords_file = path.join('data', 'german.txt')
stopwords = load_stopwords(stopwords_file)

In [2]:
# --- LDA hyper-parameters ---
num_topics = 30
chunksize = 300
iterations = 500
passes = 20

# Data-set labels; each one names an input CSV and the matching output files.
# Currently disabled labels:
#   '2015KW46', '2015KW47', '2015KW48', '2015KW49', '2015KW50', '2015KW51'
labels = ['2015KW44', '2015KW45']

# --- Input / output locations ---
_news_dir = path.join('data', 'CurrentNews')
_lda_dir = path.join('models', 'lda')

files = [path.join(_news_dir, label + '.csv') for label in labels]
output_model = [path.join(_lda_dir, label + '.lda') for label in labels]
output_mm = [path.join(_lda_dir, label + '.mm') for label in labels]
output_dict = path.join(_lda_dir, 'KW.dict')

In [3]:
# One DataFrame per input file; the CSVs are read as pipe-separated UTF-8.
dfs = []
for csv_path in files:
    dfs.append(pd.read_csv(csv_path, sep='|', encoding='utf-8'))

In [4]:
# Wrap the 'text' column of every frame in a ListCorpus.
corpora = []
for df in dfs:
    texts = list(df['text'])
    corpora.append(ListCorpus(texts))

In [5]:
def create_dict():
    """Build one gensim ``Dictionary`` shared by every corpus in ``corpora``.

    Reads the module-level ``corpora`` and ``stopwords``: each corpus is
    tokenized via ``corpus.doc_token(stopwords)`` and all resulting documents
    feed a single vocabulary, so word ids are the same for every corpus.

    Returns:
        Dictionary: the vocabulary after ``filter_extremes()`` was applied
        with gensim's default thresholds.
    """
    # Stream the documents: Dictionary consumes them in a single pass, so
    # there is no need to hold every token list of every corpus in memory.
    token_docs = (doc for corpus in corpora for doc in corpus.doc_token(stopwords))
    dictionary = Dictionary(token_docs)
    # filter_extremes() already shrinks the id gaps it creates, so a separate
    # compactify() call afterwards would be a no-op.
    dictionary.filter_extremes()
    return dictionary

def train_lda(corpus, random_state=None):
    """Train an LDA model on one tokenized corpus.

    Relies on the module-level ``dictionary`` (for bag-of-words conversion and
    as ``id2word``) and on the module-level ``num_topics``, ``chunksize``,
    ``passes`` and ``iterations`` settings.

    Args:
        corpus: iterable of tokenized documents, each one accepted by
            ``dictionary.doc2bow``.
        random_state: optional seed forwarded to ``LdaMulticore`` to make
            training more repeatable. When ``None`` (the default) the argument
            is not passed at all, so existing calls behave exactly as before.

    Returns:
        tuple: ``(bow_corpus, lda)`` - the list of bag-of-words vectors that
        was trained on, and the trained ``LdaMulticore`` model.
    """
    # Plain in-memory list of bag-of-words vectors (not an MmCorpus object).
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
    # Forward the seed only when one was given; this keeps the default call
    # identical to the previous one.
    seed_kwargs = {} if random_state is None else {'random_state': random_state}
    lda = LdaMulticore(bow_corpus, id2word=dictionary, chunksize=chunksize, batch=True,
                       num_topics=num_topics, workers=2, passes=passes, iterations=iterations,
                       **seed_kwargs)
    return (bow_corpus, lda)

In [6]:
# Vocabulary shared by all corpora.
dictionary = create_dict()

# Train one model per corpus; train_lda returns (bag-of-words docs, model).
trained = [train_lda(corpus.doc_token(stopwords=stopwords)) for corpus in corpora]
docs = [pair[0] for pair in trained]
models = [pair[1] for pair in trained]

# Persistence is currently disabled:
# for i, (bow_docs, lda) in enumerate(trained):
#     lda.save(output_model[i])
#     MmCorpus.serialize(output_mm[i], bow_docs)
# dictionary.save(output_dict)

In [8]:
# Short handles for the first two trained models and their bag-of-words documents.
model1, model2 = models[0], models[1]
docs1, docs2 = docs[0], docs[1]

In [None]:
# Display topics of the first model (20 is passed as the first positional
# argument); the equivalent call for the second model is left disabled.
model1.show_topics(20)
# model2.show_topics(20)

In [None]:
# Index (within docs1) of the document under inspection.
t = 544
doc_bow = docs1[t]
# Cosine similarity between the two models' outputs for that same document.
# NOTE(review): the models were trained independently, so equal topic ids need
# not denote the same topic - confirm this comparison is intended.
matutils.cossim(model1[doc_bow], model2[doc_bow])

In [None]:
# Topics that model1 assigns to document t of docs1.
model1.get_document_topics(docs1[t])

In [13]:
# Inspect topic 23 of model1; the printed result is a list of (word, weight) pairs.
model1.show_topic(23)

[('syrien', 0.022034317225854538),
 ('us', 0.018981312022546534),
 ('usa', 0.011871303370045149),
 ('iran', 0.01047379234250705),
 ('is', 0.010131056074599073),
 ('assad', 0.0097936443123707879),
 ('russland', 0.0091287457245521446),
 ('wien', 0.0089044267173654899),
 ('außenminister', 0.0082893655099760787),
 ('irak', 0.0066244534386278101)]

In [None]:
# Topics that model2 assigns to document t of docs1, i.e. a document taken
# from the first corpus scored by the model trained on the second corpus.
model2.get_document_topics(docs1[t])

In [12]:
# Inspect topic 23 of model2; the printed result is a list of (word, weight) pairs.
model2.show_topic(23)

[('is', 0.02691812266633041),
 ('syrien', 0.026510604041244543),
 ('russland', 0.011456098885933576),
 ('irak', 0.011024665609019109),
 ('russische', 0.0092358127275147112),
 ('syrischen', 0.0090960182346914772),
 ('luftangriffe', 0.0087610671884399503),
 ('al', 0.0085283674809775844),
 ('assad', 0.0079705363394327912),
 ('staat', 0.0077530258824603186)]

In [14]:
# Cosine similarity between the two (word, weight) lists returned by
# show_topic(23); only the words contained in those lists take part, so this
# is not a comparison of the full topic-word distributions.
matutils.cossim(model1.show_topic(23), model2.show_topic(23))

0.60957306038720294