In [7]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# logging.root.level = logging.INFO

from os import path
from random import shuffle
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from gensim.models.word2vec import LineSentence, Word2Vec

# Stopword list read from data/german.txt; it is passed to every
# tokenisation call in this notebook.
stopwords = load_stopwords(
    path.join('data', 'german.txt')
)

In [8]:
# Text files under data/Politics, split into three groups. Each group is
# later turned into its own corpus.
links = [
    path.join('data', 'Politics', name)
    for name in (
        'Linke.txt',
        'Linke_PR.txt',
        'Linke_Fraktion.txt',
        'SPD_EU.txt',
        'SPD_Fraktion.txt',
    )
]

liberal = [
    path.join('data', 'Politics', name)
    for name in (
        'Grüne.txt',
        'Grüne_Fraktion.txt',
        'FDP.txt',
        'FDP_Fraktion.txt',
        'CDU.txt',
        'CDU_EU.txt',
        'CDU_Fraktion.txt',
    )
]

rechts = [
    path.join('data', 'Politics', name)
    for name in (
        'NPD_MV.txt',
        'NPD_Sachsen.txt',
        # 'NPD_Jung.txt' is currently left out of this group.
    )
]

# Every file of the three groups as one flat list, in group order.
files = links + liberal + rechts

# Tokenise all party texts (stopwords removed) and use them to build the
# vocabulary of the base model. The per-group models are copies of `base`,
# so they all share this vocabulary.
base_corpus = list(FileCorpus(files).sentences_token(stopwords=stopwords))

# hs=1 / negative=0 are spelled out because the models are later evaluated
# with Word2Vec.score(), which gensim implements only for hierarchical
# softmax. Older gensim releases used these values as defaults; newer
# releases default to negative sampling, which makes score() raise.
base = Word2Vec(workers=4, iter=6, size=200, window=3, hs=1, negative=0)
base.build_vocab(base_corpus)

In [9]:
from copy import deepcopy

# One corpus per group of party files, in the order links, liberal, rechts.
corpora = [FileCorpus(group) for group in (links, liberal, rechts)]

# Each model starts out as an independent copy of `base`, i.e. with the
# vocabulary already built but no training applied.
models = [deepcopy(base) for _ in corpora]

for model, corpus in zip(models, corpora):
    # Materialise the tokenised sentences so they can be shuffled and counted.
    sentences = list(corpus.sentences_token(stopwords=stopwords))
    shuffle(sentences)
    model.train(sentences, total_examples=len(sentences))
    print('Loaded corpus with {} sentences.'.format(len(sentences)))

Loaded corpus with 330507 sentences.
Loaded corpus with 594621 sentences.
Loaded corpus with 50114 sentences.


In [10]:
# Labels '2015-44' .. '2015-53' followed by '2016-01' .. '2016-04'
# (presumably year and calendar week); each one names a CSV file in
# data/CurrentNews.
labels = ['2015-{}'.format(week) for week in range(44, 54)]
labels += ['2016-{:02d}'.format(week) for week in range(1, 5)]

# NOTE: this rebinds `files`, which earlier held the party text file paths.
files = [path.join('data', 'CurrentNews', '{}.csv'.format(label)) for label in labels]

In [11]:
import pandas as pd
import numpy as np

def calc_probability(df, mods):
    """Estimate, per document, how strongly each model in ``mods`` explains it.

    The ``text`` column of ``df`` is tokenised into documents of sentences
    (stopwords removed) and every sentence is scored by every model through
    its ``score`` method. For each sentence the scores are exponentiated
    relative to the best-scoring model and normalised so that they sum to 1
    across models; these per-sentence values are then averaged per document.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column.
    mods : sequence
        Models exposing ``score(sentences, total_sentences)``; the scores
        are treated as log-likelihoods.

    Returns
    -------
    pandas.DataFrame
        One column per model (labelled by its position in ``mods``), indexed
        by ``doc``: the position of the document in the tokenised corpus
        (assumed to follow the row order of ``df`` -- TODO confirm against
        ListCorpus). Documents that yield no sentences have no row.
    """
    texts = list(df.loc[:, 'text'])
    docs = list(ListCorpus(texts).doc_sentences_token(stopwords=stopwords))

    # Flatten to one sentence list while remembering each sentence's document.
    sentences, doc_ids = [], []
    for doc_id, doc in enumerate(docs):
        for sentence in doc:
            sentences.append(sentence)
            doc_ids.append(doc_id)

    # Rows: models, columns: sentences.
    log_scores = np.array([model.score(sentences, len(sentences)) for model in mods])

    # Subtract the per-sentence maximum before exponentiating so the best
    # model maps to exp(0) = 1 and the normalisation stays numerically stable.
    weights = np.exp(log_scores - log_scores.max(axis=0))
    shares = weights / weights.sum(axis=0)

    per_sentence = pd.DataFrame(shares.transpose()).assign(doc=doc_ids)
    return per_sentence.groupby("doc").mean()

# Read every '|'-separated news CSV and stack them into one frame with a
# fresh 0..n-1 index.
KW = pd.concat(
    [pd.read_csv(csv_path, sep='|', encoding='utf-8') for csv_path in files],
    ignore_index=True,
)

# Per-document model probabilities, then attached column-wise (aligned on
# the index) to the articles they were computed from.
prob = calc_probability(KW, models)
KW = pd.concat([KW, prob], axis=1)

In [12]:
# Average the per-model probability columns for each news site. The columns
# are selected explicitly so the result does not depend on pandas silently
# dropping non-numeric columns (such as the article text) during the
# aggregation, which newer pandas versions no longer do.
KW.groupby('site')[list(prob.columns)].mean()

Unnamed: 0_level_0,0,1,2
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Berliner Zeitung,0.214467,0.116501,0.669032
Bild,0.323299,0.393159,0.283542
Der Postillon,0.305183,0.310191,0.384626
Deutsche Stimme,0.274436,0.229529,0.496035
FAZ,0.339688,0.379515,0.280797
Focus,0.354729,0.367865,0.277406
Frankfurter Rundschau,0.356871,0.367465,0.275665
Golem,0.406917,0.365555,0.227528
Handelsblatt,0.374497,0.399183,0.22632
Heise,0.382901,0.37962,0.237479
