In [None]:
from os import path
from pprint import pprint
import pandas as pd
from corputil import ListCorpus
from corputil.utils import load_stopwords
from gensim.models import LdaMulticore
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary

# Stop-word list read from data/german.txt; it is passed to the corpus tokenisers below
# (sentences_token / doc_token) via their stopwords argument.
stopwords = load_stopwords(path.join('data', 'german.txt'))

In [None]:
# --- LDA hyper-parameters (consumed by train_lda) ---
num_topics = 15
chunksize = 200
iterations = 500
passes = 20

# --- One tag per weekly data file: 2015 weeks 44..53 followed by the first week of 2016 ---
labels = ['2015KW{}'.format(week) for week in range(44, 54)] + ['2016KW01']

# --- Input CSVs and output artefacts, one model file per label ---
files = [path.join('data', 'CurrentNews', 'Sentiment_{}.csv'.format(tag)) for tag in labels]
output_model = [path.join('models', 'lda', '{}.lda'.format(tag)) for tag in labels]
output_dict = path.join('models', 'lda', 'Words.dict')
output_bigram = path.join('models', 'lda', 'Bigram.phrase')

In [None]:
# One DataFrame per weekly CSV; the files are pipe-separated and UTF-8 encoded.
dfs = [
    pd.read_csv(csv_path, sep='|', encoding='utf-8')
    for csv_path in files
]

In [None]:
# Wrap the 'text' column of every weekly frame in a ListCorpus (one corpus per week).
corpora = [
    ListCorpus(list(frame.loc[:, 'text']))
    for frame in dfs
]

In [None]:
def create_phrase():
    """Fit a gensim ``Phrases`` model on the tokenised sentences of all corpora.

    Reads the module-level ``corpora`` and ``stopwords``; every sentence of every
    corpus is collected into a single list before the model is built.

    Returns:
        The fitted ``Phrases`` instance.
    """
    all_sentences = []
    for week_corpus in corpora:
        all_sentences.extend(week_corpus.sentences_token(stopwords=stopwords))
    return Phrases(all_sentences)


def create_dict():
    """Build the shared gensim ``Dictionary`` over all corpora.

    Each tokenised document is first passed through the module-level ``bigram``
    model, so ``bigram`` must exist before this function is called. The
    dictionary is then pruned with ``filter_extremes()`` (library defaults) and
    compacted.

    Returns:
        The pruned ``Dictionary``.
    """
    phrased_docs = []
    for week_corpus in corpora:
        for tokens in week_corpus.doc_token(stopwords):
            phrased_docs.append(bigram[tokens])

    vocabulary = Dictionary(phrased_docs)
    vocabulary.filter_extremes()
    vocabulary.compactify()
    return vocabulary


def train_lda(corpus):
    """Train an LDA model on one corpus of tokenised documents.

    Every document is run through the module-level ``bigram`` model and turned
    into a bag-of-words vector with the module-level ``dictionary``. Training
    settings come from the notebook's configuration (``num_topics``,
    ``chunksize``, ``passes``, ``iterations``); two worker processes are used.

    Args:
        corpus: Iterable of tokenised documents.

    Returns:
        Tuple ``(bow, lda)``: the list of bag-of-words vectors and the trained
        ``LdaMulticore`` model.
    """
    bow = []
    for tokens in corpus:
        bow.append(dictionary.doc2bow(bigram[tokens]))

    lda_settings = dict(
        id2word=dictionary,
        chunksize=chunksize,
        num_topics=num_topics,
        workers=2,
        passes=passes,
        iterations=iterations,
    )
    lda = LdaMulticore(bow, **lda_settings)
    return bow, lda

In [None]:
# Shared preprocessing artefacts: the phrase model first, because create_dict() reads `bigram`.
bigram = create_phrase()
dictionary = create_dict()

# Per-week results, index-aligned with `labels` / `dfs` / `corpora`.
models, docs = [], []

for week_corpus, model_path in zip(corpora, output_model):
    week_bow, week_model = train_lda(week_corpus.doc_token(stopwords=stopwords))
    models.append(week_model)
    docs.append(week_bow)
    # Persist each model right after it has been trained.
    week_model.save(model_path)

bigram.save(output_bigram)
dictionary.save(output_dict)

## Topic Chains

In [None]:
import numpy as np
from gensim.matutils import sparse2full, cossim

In [None]:
def hellinger(vec1, vec2):
    """Hellinger distance between two sparse vectors.

    Both vectors are expanded to dense arrays whose length is the size of the
    module-level ``dictionary``.

    Args:
        vec1: Sparse vector accepted by ``sparse2full``.
        vec2: Sparse vector accepted by ``sparse2full``.

    Returns:
        ``sqrt(0.5 * sum((sqrt(p) - sqrt(q)) ** 2))`` as a numpy float.
    """
    vocab_size = len(dictionary)
    root_p = np.sqrt(sparse2full(vec1, vocab_size))
    root_q = np.sqrt(sparse2full(vec2, vocab_size))
    root_diff = root_p - root_q
    return np.sqrt(0.5 * (root_diff**2).sum())

def permutations(coll, window):
    """Return every run of ``window`` consecutive items of ``coll``.

    Despite the name, this produces sliding windows, not permutations.

    Args:
        coll: Indexable, sized collection.
        window: Number of consecutive items per run.

    Returns:
        List of lists, one per start position; empty when ``coll`` has fewer
        than ``window`` items.
    """
    start_count = len(coll) - (window - 1)
    return [
        [coll[start + offset] for offset in range(window)]
        for start in range(start_count)
    ]

In [None]:
# Compare every topic of each model with every topic of the next model and
# print the cosine similarities that exceed 0.5.
for earlier, later in permutations(models, 2):
    pair_scores = (
        cossim(earlier.show_topic(topic_a), later.show_topic(topic_b))
        for topic_a in range(earlier.num_topics)
        for topic_b in range(later.num_topics)
    )
    for score in pair_scores:
        if score > 0.5:
            print(score)

## Finalize Pipeline

In [None]:
from pprint import pprint
import json

In [None]:
def order_data(d):
    """Return the party scores of ``d`` as a list in a fixed party order.

    Args:
        d: Mapping holding the keys 'Linke', 'SPD', 'Gruene', 'FDP', 'CDU'
            and 'NPD'; any other keys are ignored.

    Returns:
        ``[Linke, SPD, Gruene, FDP, CDU, NPD]`` values.

    Raises:
        KeyError: If one of the six keys is missing.
    """
    party_order = ('Linke', 'SPD', 'Gruene', 'FDP', 'CDU', 'NPD')
    return [d[party] for party in party_order]

def get_sentiment(df):
    """Average the party sentiment scores per site.

    Args:
        df: DataFrame with a 'site' column and the numeric columns
            'Linke', 'SPD', 'Gruene', 'FDP', 'CDU' and 'NPD'. Other columns
            (for example free text) are ignored.

    Returns:
        Dict mapping every site to its mean scores as a list ordered
        ``[Linke, SPD, Gruene, FDP, CDU, NPD]``, plus the key 'All' holding
        the unweighted mean of the per-site means in the same order.
    """
    parties = ['Linke', 'SPD', 'Gruene', 'FDP', 'CDU', 'NPD']
    # Select the party columns *before* aggregating: calling mean() on the whole
    # group would also try to average non-numeric columns, which raises a
    # TypeError on recent pandas versions.
    group = df.groupby('site')[parties].mean()
    temp = {
        site: [scores[party] for party in parties]
        for site, scores in group.to_dict('index').items()
    }
    temp['All'] = list(group.mean())
    return temp

def get_topic_sentiment(i, df):
    """Average the party sentiment scores per site for the articles of one topic.

    Args:
        i: Topic id; only rows whose 'topic' column equals ``i`` are used.
        df: DataFrame with the columns 'topic', 'site' and the numeric columns
            'Linke', 'SPD', 'Gruene', 'FDP', 'CDU' and 'NPD'. Other columns
            (for example free text) are ignored.

    Returns:
        Dict mapping every site that has articles in the topic to its mean
        scores as a list ordered ``[Linke, SPD, Gruene, FDP, CDU, NPD]``, plus
        the key 'All' holding the unweighted mean of the per-site means. When
        no row matches, only 'All' is present and its values are NaN.
    """
    parties = ['Linke', 'SPD', 'Gruene', 'FDP', 'CDU', 'NPD']
    topic = df[df['topic'] == i]
    # Select the party columns *before* aggregating: calling mean() on the whole
    # group would also try to average non-numeric columns, which raises a
    # TypeError on recent pandas versions.
    group = topic.groupby('site')[parties].mean()
    temp = {
        site: [scores[party] for party in parties]
        for site, scores in group.to_dict('index').items()
    }
    temp['All'] = list(group.mean())
    return temp

def topic_words(model):
    """List the words of every topic of ``model``.

    Args:
        model: Object whose ``show_topics(-1, formatted=False)`` yields
            ``(topic_id, [(word, prob), ...])`` pairs.

    Returns:
        One list of words per topic, in the order the model reports them; the
        probabilities are discarded.
    """
    return [
        [term for term, _weight in weighted_terms]
        for _topic_id, weighted_terms in model.show_topics(-1, formatted=False)
    ]

# Hard assignment: every document gets only its single most probable topic.
# TODO: this is a crude simplification and should be revisited.
def topic_allocation(corpus):
    """Pick the most probable topic for every document.

    Args:
        corpus: Iterable of topic vectors, each an iterable of
            ``(topic_id, probability)`` pairs.

    Returns:
        List with one topic id per document; ``-1`` for a document whose
        vector is empty. On ties the first topic with the highest probability
        wins.
    """
    # The running best probability must be tracked together with the id;
    # without it every entry beats the initial -1 and the *last* topic would
    # be returned instead of the most probable one.
    return [
        max(vec, key=lambda pair: pair[1], default=(-1, -1))[0]
        for vec in corpus
    ]

def get_topics(df, model, doc):
    """Describe every non-empty topic of ``model`` for one week of articles.

    Side effect: adds (or overwrites) the column 'topic' on ``df`` with the
    topic id assigned to each article by ``topic_allocation``.

    Args:
        df: DataFrame of the week's articles; assumed to be row-aligned with
            ``doc`` - TODO confirm.
        model: Trained topic model; ``model[doc]`` is used to obtain the
            per-document topic vectors.
        doc: Bag-of-words corpus belonging to ``df``.

    Returns:
        List of dicts with the keys 'id', 'words', 'articles' (number of rows
        assigned to the topic) and 'sentiment'; topics without any article are
        left out.
    """
    df['topic'] = topic_allocation(model[doc])

    summaries = []
    for topic_id, words in enumerate(topic_words(model)):
        article_count = int((df['topic'] == topic_id).sum())
        summary = {
            'id': topic_id,
            'words': words,
            'articles': article_count,
            'sentiment': get_topic_sentiment(topic_id, df),
        }
        if article_count > 0:
            summaries.append(summary)
    return summaries

In [None]:
# Write one JSON summary per week to data/Web/<label>.json.
for week_index, (week_model, week_bow) in enumerate(zip(models, docs)):
    week_label = labels[week_index]
    week_frame = dfs[week_index]
    summary = {
        'tag': week_label,
        'sentiment': get_sentiment(week_frame),
        'topics': get_topics(week_frame, week_model, week_bow),
    }
    target = path.join('data', 'Web', '{}.json'.format(week_label))
    with open(target, 'w', encoding='utf-8') as out_file:
        json.dump(summary, out_file, indent=4)

## Visualization

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide plot defaults: large square figures and a bigger title font.
matplotlib.rcParams.update({
    'figure.figsize': (15.0, 15.0),
    'axes.titlesize': 24,
})

from sklearn.manifold import TSNE
from sklearn.externals import joblib
from gensim.matutils import sparse2full

# Pre-trained category classifier and its vectorizer.
# joblib.load unpickles the files, so only load artefacts from a trusted source.
vectorizer = joblib.load(path.join('models', 'classifier', 'RAW_Vectorizer.pkl'))
classifier = joblib.load(path.join('models', 'classifier', 'RAW_Classifier.pkl'))

# Plot colour for every category tag the classifier can predict.
colors = dict(
    Politics='red',
    Economy='blue',
    Science='green',
    Car='teal',
    Education='lime',
    Culture='purple',
    Society='orange',
    Travel='magenta',
    Sport='brown',
    Technology='cyan',
)

In [None]:
model = models[0]
doc = docs[0]

tfidf = vectorizer.transform(dfs[0]['text'])
bow = [sparse2full(d, model.num_topics) for d in model[doc]]
tags = classifier.predict(tfidf)
labels = [colors[tag] for tag in tags]
reduced = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(bow)
df = pd.DataFrame(reduced)

In [None]:
plt.scatter(df[0], df[1], c=labels, marker='x')