# Calculate Political Opinion Models

In [58]:
import logging
# Request timestamped log lines for the whole notebook.
# NOTE(review): the captured cell outputs use the plain 'LEVEL:logger:message'
# layout, so this format string may not be taking effect (basicConfig does
# nothing when the root logger already has handlers) — confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# Lower the root logger threshold directly so gensim's INFO progress messages
# are shown.
logging.root.level = logging.INFO

from os import path
from random import shuffle
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from gensim.models.word2vec import LineSentence, Word2Vec

# Stopword list, loaded once and passed as `stopwords=` to every tokenizing
# corpus call in this notebook.
# presumably German stopwords, judging by the file name — confirm against the data file
stopwords = load_stopwords(path.join('data', 'german.txt'))

## Training the Base Model

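Build the base model. It is still "empty": only its vocabulary is initialised from the combined party corpus and no training has happened yet. It is later used as the common starting point for training the per-party classification models.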

In [59]:
# Source text files per party, all located under data/Politics.
# Names that are commented out are sources currently excluded from the corpus.
spd = [path.join('data', 'Politics', name) for name in [
    'SPD_EU.txt',
    'SPD_Fraktion.txt',
    # 'SPD_Vorwärts_Inland.txt',
    # 'SPD_Vorwärts_International.txt',
    'SPD_Vorwärts_Parteileben.txt',
]]

linke = [path.join('data', 'Politics', name) for name in [
    'Linke.txt',
    'Linke_PR.txt',
    # 'Linke_Fraktion.txt',
]]

gruene = [path.join('data', 'Politics', name) for name in [
    'Grüne.txt',
    # 'Grüne_Fraktion.txt',
]]

fdp = [path.join('data', 'Politics', name) for name in [
    'FDP.txt',
    'FDP_Fraktion.txt',
]]

cdu = [path.join('data', 'Politics', name) for name in [
    'CDU.txt',
    # 'CDU_Fraktion.txt',
]]

npd = [path.join('data', 'Politics', name) for name in [
    # 'NPD_MV.txt',
    'NPD_Sachsen.txt',
    # 'NPD_Jung.txt',
]]

# Pool the files of all parties: the base vocabulary is built from the combined
# corpus so that every per-party model copied from `base` shares it.
files = spd + linke + gruene + fdp + cdu + npd

# Materialise the tokenised sentences so the corpus can be traversed as a list.
base_corpus = list(FileCorpus(files).sentences_token(stopwords=stopwords))

# The classification step calls `Word2Vec.score()`, which gensim only implements
# for hierarchical softmax (hs=1, negative=0). Those values used to come from
# version-dependent constructor defaults; pin them (plus sg/sample) to exactly
# what the recorded training logs report ("sg=1 hs=1 sample=0 and negative=0")
# so a different gensim release cannot silently change the model type.
base = Word2Vec(workers=4, iter=6, size=200, window=3,
                sg=1, hs=1, negative=0, sample=0)
# Vocabulary only; no training happens on the base model itself.
base.build_vocab(base_corpus)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 79560 words, keeping 18381 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 163159 words, keeping 27959 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 237379 words, keeping 36911 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 309087 words, keeping 43507 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 386437 words, keeping 50594 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 468583 words, keeping 56982 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 548054 words, keeping 62117 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 628164 words, keeping 66

Save the base model to disk. Don't finalize the model, because it still has to be trained with new data later! (The save call in the cell below is currently commented out.)

In [60]:
# base.save(path.join('models', 'base.w2v'))

## Training

In [61]:
from copy import deepcopy

# One corpus per party. The order (Linke, SPD, Gruene, FDP, CDU, NPD) matters:
# the classification cell maps model positions 0..5 to these party names.
corpora = [FileCorpus(party_files) for party_files in (linke, spd, gruene, fdp, cdu, npd)]

# Every party model starts from its own independent copy of the base model.
models = [deepcopy(base) for _ in corpora]

for model, corpus in zip(models, corpora):
    sentences = list(corpus.sentences_token(stopwords=stopwords))
    # Randomise sentence order before training on this party's text.
    shuffle(sentences)
    model.train(sentences, total_examples=len(sentences))
    print('Loaded corpus with {} sentences.'.format(len(sentences)))

INFO:gensim.models.word2vec:training model with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 12.74% examples, 336436 words/s
INFO:gensim.models.word2vec:PROGRESS: at 25.69% examples, 339647 words/s
INFO:gensim.models.word2vec:PROGRESS: at 38.64% examples, 340739 words/s
INFO:gensim.models.word2vec:PROGRESS: at 51.53% examples, 341171 words/s
INFO:gensim.models.word2vec:PROGRESS: at 64.48% examples, 341469 words/s
INFO:gensim.models.word2vec:PROGRESS: at 76.95% examples, 339376 words/s
INFO:gensim.models.word2vec:PROGRESS: at 89.88% examples, 339783 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 2816490 raw words took 7.8s, 339896 trained words/s
INFO:gensim.models.word2vec:training model with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS:

Loaded corpus with 69350 sentences.
Loaded corpus with 148844 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 70.74% examples, 361868 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 545634 raw words took 1.4s, 360116 trained words/s



Loaded corpus with 11592 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 2.78% examples, 329816 words/s
INFO:gensim.models.word2vec:PROGRESS: at 5.52% examples, 328916 words/s
INFO:gensim.models.word2vec:PROGRESS: at 8.27% examples, 328794 words/s
INFO:gensim.models.word2vec:PROGRESS: at 11.03% examples, 328584 words/s
INFO:gensim.models.word2vec:PROGRESS: at 13.71% examples, 326981 words/s
INFO:gensim.models.word2vec:PROGRESS: at 16.45% examples, 327192 words/s
INFO:gensim.models.word2vec:PROGRESS: at 19.20% examples, 327200 words/s
INFO:gensim.models.word2vec:PROGRESS: at 21.96% examples, 327429 words/s
INFO:gensim.models.word2vec:PROGRESS: at 24.72% examples, 327784 words/s
INFO:gensim.models.word2vec:PROGRESS: at 27.48% examples, 327769 words/s
INFO:gensim.models.word2vec:PROGRESS: at 30.15% examples, 326993 words/s
INFO:gensim.models.word2vec:PROGRESS: at 32.92% examples, 327265


Loaded corpus with 360090 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 119214 raw words took 0.3s, 329550 trained words/s



Loaded corpus with 2790 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 20.66% examples, 374955 words/s
INFO:gensim.models.word2vec:PROGRESS: at 41.48% examples, 375115 words/s
INFO:gensim.models.word2vec:PROGRESS: at 61.97% examples, 374137 words/s
INFO:gensim.models.word2vec:PROGRESS: at 82.51% examples, 373727 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 1986900 raw words took 4.9s, 374128 trained words/s



Loaded corpus with 30338 sentences.


## Classification

In [62]:
# Labels of the weekly news dumps: 2015KW44 .. 2015KW53 followed by 2016KW01
# (presumably calendar weeks — confirm).
labels = ['2015KW{}'.format(week) for week in range(44, 54)] + ['2016KW01']

# Input CSV per label. Note that this rebinds `files`, which earlier held the
# party text files.
files = [path.join('data', 'CurrentNews', '{}.csv'.format(label)) for label in labels]
# Matching output CSV per label, written by the scoring loop.
output = [path.join('data', 'CurrentNews', 'Sentiment_{}.csv'.format(label)) for label in labels]

In [66]:
import pandas as pd
import numpy as np


def calc_score(doc, mod):
    """Load the Word2Vec model stored at `mod` and score `doc` with it.

    The model is loaded from disk on every call. `len(doc)` is passed as the
    second argument of `score`. This helper is not called anywhere in the
    visible notebook.
    """
    return Word2Vec.load(mod).score(doc, len(doc))

def calc_probability(df, mods):
    """Return, per document, the mean normalised score of each model.

    Parameters
    ----------
    df : DataFrame with a 'text' column holding the documents.
    mods : sequence of models exposing `score(sentences, total)`.

    Returns
    -------
    DataFrame indexed by document position ("doc"), with one column per model
    (0 .. len(mods) - 1). A document that yields no sentences has no row.
    """
    tokenized_docs = list(
        ListCorpus(list(df.loc[:, 'text'])).doc_sentences_token(stopwords=stopwords)
    )

    # Flatten to one sentence list, remembering the owning document of each sentence.
    sentences = []
    doc_ids = []
    for doc_id, doc in enumerate(tokenized_docs):
        for sentence in doc:
            sentences.append(sentence)
            doc_ids.append(doc_id)

    # One row per model; `score` is expected to return one value per sentence.
    log_likelihood = np.array([model.score(sentences, len(sentences)) for model in mods])

    # Subtract the column-wise maximum before exponentiating (keeps exp() from
    # overflowing), then normalise each column so the models' values sum to 1.
    likelihood = np.exp(log_likelihood - log_likelihood.max(axis=0))
    sentence_prob = pd.DataFrame((likelihood / likelihood.sum(axis=0)).transpose())

    # Average the sentence-level probabilities within each document.
    sentence_prob["doc"] = doc_ids
    return sentence_prob.groupby("doc").mean()

def process(data):
    """Score `data` against the notebook-level `models` list.

    Thin wrapper that returns the result of `calc_probability(data, models)`.
    """
    return calc_probability(data, models)

# KW = pd.read_csv(path.join('data', 'CurrentNews', '2015KW45.csv'), sep='|', encoding='utf-8')
# prob = calc_probability(KW, models)
# # prob = prob.div(prob.sum(axis=1), axis=0)
# # prob = prob.sub(.16, axis=0)
# KW = pd.concat([KW, prob], axis=1)

# Score every weekly news file and write it back out with one extra column per
# party model.
for file, out in zip(files, output):
    data = pd.read_csv(file, encoding='utf-8', sep='|')
    sentiment = process(data)
    # Columns 0..5 follow the order of `models`; give them party names.
    csv = pd.concat([data, sentiment], axis=1).rename(
        columns={0: 'Linke', 1: 'SPD', 2: 'Gruene', 3: 'FDP', 4: 'CDU', 5: 'NPD'}
    )
    csv.to_csv(out, sep='|', encoding='utf-8', index=False)

INFO:gensim.models.word2vec:scoring sentences with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.8s, 93625 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.8s, 89873 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 57420 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.8s, 90210 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 57420 vocabu

In [64]:
# `KW` used to exist only as leftover kernel state from the commented-out
# snippet in the classification cell, so this cell raised NameError after
# "Restart Kernel -> Run All". Rebuild it here with that same recipe.
kw_raw = pd.read_csv(path.join('data', 'CurrentNews', '2015KW45.csv'), sep='|', encoding='utf-8')
# Append the per-document model probabilities (columns 0..5, in `models` order).
KW = pd.concat([kw_raw, calc_probability(kw_raw, models)], axis=1)

# Mean probability per news site.
KW.groupby('site').mean()

Unnamed: 0_level_0,0,1,2,3,4,5
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Deutsche Stimme,0.13649,0.180363,0.057929,0.043724,0.119205,0.462288
FAZ,0.184752,0.150003,0.148114,0.175101,0.193762,0.148267
Focus,0.169547,0.130964,0.122636,0.168041,0.208372,0.20044
Frankfurter Rundschau,0.129218,0.140776,0.118108,0.161955,0.229791,0.220151
Golem,0.071534,0.240318,0.14846,0.184591,0.25252,0.102577
Handelsblatt,0.1584,0.170878,0.120105,0.186125,0.220401,0.144091
Heise,0.074142,0.231459,0.123365,0.18315,0.282255,0.105629
Huffington Post,0.165205,0.135719,0.142669,0.194036,0.202438,0.159933
Junge Freiheit,0.13685,0.166938,0.095435,0.162837,0.171191,0.266749
Junge Welt,0.252063,0.130522,0.135909,0.136008,0.18678,0.158717


In [65]:
# Per-site median of the `KW` columns (companion to the per-site mean above).
# Requires `KW` to already exist in the kernel.
KW.groupby('site').median()

Unnamed: 0_level_0,0,1,2,3,4,5
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Deutsche Stimme,0.13649,0.180363,0.057929,0.043724,0.119205,0.462288
FAZ,0.16834,0.14258,0.147157,0.177375,0.193305,0.125277
Focus,0.156201,0.134349,0.115172,0.144879,0.212745,0.154171
Frankfurter Rundschau,0.124845,0.142616,0.102011,0.161611,0.226843,0.154091
Golem,0.074822,0.22426,0.135213,0.158004,0.249627,0.086722
Handelsblatt,0.15248,0.154918,0.115809,0.157317,0.229697,0.143778
Heise,0.080255,0.235726,0.119437,0.194606,0.289699,0.106492
Huffington Post,0.14411,0.122674,0.126317,0.179515,0.19533,0.130732
Junge Freiheit,0.122741,0.167613,0.087083,0.127178,0.146592,0.240273
Junge Welt,0.207659,0.117422,0.126253,0.129251,0.17957,0.141422
