In [None]:
from os import path
import pandas as pd
from pprint import pprint
from corputil import ListCorpus
from corputil.utils import load_stopwords
import gensim.matutils as matutils
from gensim.models import LdaMulticore
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary

# Stop-word list loaded from data/german.txt; it is handed to every
# tokenisation call (sentences_token / doc_token) further down.
stopwords = load_stopwords(path.join('data', 'german.txt'))

In [None]:
# LDA hyper-parameters; train_lda() forwards all of them to LdaMulticore.
num_topics = 30
chunksize = 200
iterations = 500
passes = 20

# One label per data set: each label names one input CSV and one saved model.
labels = [
    '2015KW44', '2015KW45', '2015KW46', '2015KW47',
    '2015KW48', '2015KW49', '2015KW50', '2015KW51',
]

# Input files and output artefacts, index-aligned with `labels`.
files = [path.join('data', 'CurrentNews', '{}.csv'.format(label)) for label in labels]
output_model = [path.join('models', 'lda', '{}.lda'.format(label)) for label in labels]

# Artefacts shared by all models.
output_dict = path.join('models', 'lda', 'Words.dict')
output_bigram = path.join('models', 'lda', 'Bigram.phrase')

In [None]:
# Read every input CSV (pipe-separated, UTF-8) into its own DataFrame,
# keeping the order of `files`.
dfs = [
    pd.read_csv(csv_file, encoding='utf-8', sep='|')
    for csv_file in files
]

In [None]:
# Wrap the 'text' column of each DataFrame in a ListCorpus (one corpus per
# input file, same order as `dfs`).
corpora = [ListCorpus(list(frame['text'])) for frame in dfs]

In [None]:
def create_phrase():
    """Fit a gensim ``Phrases`` model on the sentences of all corpora.

    Reads the module-level ``corpora`` and ``stopwords``; every corpus
    contributes its stop-word-filtered, tokenised sentences.

    Returns:
        The fitted ``Phrases`` model.
    """
    sentences = []
    for corpus in corpora:
        sentences.extend(corpus.sentences_token(stopwords=stopwords))
    return Phrases(sentences)


def create_dict():
    """Build the vocabulary shared by all LDA models.

    Reads the module-level ``corpora``, ``stopwords`` and ``bigram``. Every
    document of every corpus is tokenised, passed through the phrase model
    and added to a gensim ``Dictionary``.

    Returns:
        The ``Dictionary`` after ``filter_extremes()`` (called with gensim's
        default thresholds) and ``compactify()``.
    """
    # Pass `stopwords` by keyword, exactly like the other tokenisation calls
    # in this notebook, so the dictionary is built from the same token stream
    # that train_lda() later converts to bag-of-words.
    phrased_docs = (
        bigram[doc]
        for corpus in corpora
        for doc in corpus.doc_token(stopwords=stopwords)
    )
    # Dictionary consumes the iterable once, so the documents are streamed
    # instead of being held in an intermediate list.
    dictionary = Dictionary(phrased_docs)
    dictionary.filter_extremes()
    dictionary.compactify()
    return dictionary


def train_lda(corpus):
    """Train one LDA model on a single corpus.

    Reads the module-level ``dictionary``, ``bigram`` and the hyper-parameters
    ``num_topics``, ``chunksize``, ``passes`` and ``iterations``.

    Args:
        corpus: Iterable of tokenised documents. It is iterated exactly once.

    Returns:
        A ``(bow, lda)`` tuple: the list of bag-of-words vectors the model was
        trained on, and the trained ``LdaMulticore`` model.
    """
    bow = []
    for tokens in corpus:
        # Apply the phrase model first, then map the tokens to dictionary ids.
        bow.append(dictionary.doc2bow(bigram[tokens]))

    lda = LdaMulticore(
        bow,
        num_topics=num_topics,
        id2word=dictionary,
        workers=2,
        batch=True,
        chunksize=chunksize,
        passes=passes,
        iterations=iterations,
    )
    return bow, lda

In [None]:
# Fit the phrase model and the vocabulary once, over all corpora; every LDA
# model below shares them.
bigram = create_phrase()
dictionary = create_dict()

# Persist the shared artefacts *before* the long-running training loop. A bad
# output path then fails immediately instead of after all models are trained,
# and any model file written by the loop always has its matching dictionary
# and phrase model on disk, even if a later iteration crashes.
bigram.save(output_bigram)
dictionary.save(output_dict)

models = []  # trained LDA models, index-aligned with `labels`
docs = []    # bag-of-words corpus each model was trained on, same order

for i, corpus in enumerate(corpora):
    # train_lda returns (list of bag-of-words vectors, trained model).
    mmCorpus, model = train_lda(corpus.doc_token(stopwords=stopwords))
    models.append(model)
    docs.append(mmCorpus)
    # Save each model as soon as it is trained.
    model.save(output_model[i])

## TESTING

In [None]:
from sklearn.manifold import TSNE

# Index of the model / corpus pair to inspect.
i = 0
model = models[i]
doc = docs[i]
# model[doc] gives one sparse topic vector per document; expand each to a
# dense vector of length num_topics so t-SNE can consume it.
bow = [matutils.sparse2full(d, model.num_topics) for d in model[doc]]
# t-SNE is stochastic: fix the seed so re-running the cell on the same model
# reproduces the same 2-D layout.
reduced = TSNE(n_components=2, perplexity=40, random_state=0, verbose=2).fit_transform(bow)
# Columns 0 and 1 hold the two embedding coordinates.
df = pd.DataFrame(reduced)

In [None]:
from sklearn.manifold import TSNE

model = models[i]
# One dense vector per topic. The original comprehension bound `t` but used
# the undefined name `topic`, and sized the vectors by num_topics although a
# topic is a distribution over the vocabulary. get_topic_terms() returns
# (word_id, probability) pairs, which is the sparse format sparse2full()
# expects; asking for num_terms entries yields the full distribution.
bow = [
    matutils.sparse2full(model.get_topic_terms(t, topn=model.num_terms), model.num_terms)
    for t in range(model.num_topics)
]
bow

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the two coordinate columns of `df`, drawn without axes.
fig, ax = plt.subplots()
ax.set_title('Topic Model Visualization')
ax.axis('off')
ax.scatter(df[0], df[1], marker='x')
plt.show()