# Calculate Political Opinion Models

In [18]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.level = logging.INFO

from os import path
from random import shuffle
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from gensim.models.word2vec import LineSentence, Word2Vec

stopwords = load_stopwords(path.join('data', 'german.txt'))

## Training the Base Model

Calculate the base model (empty), that is later used as a base for training the classification models.

In [19]:
spd = [
    path.join('data', 'Politics', 'SPD_EU.txt'),
    path.join('data', 'Politics', 'SPD_Fraktion.txt'),
#     path.join('data', 'Politics', 'SPD_Vorwärts_Inland.txt')
#     path.join('data', 'Politics', 'SPD_Vorwärts_International.txt'),
    path.join('data', 'Politics', 'SPD_Vorwärts_Parteileben.txt')
]

linke = [
    path.join('data', 'Politics', 'Linke.txt'),
    path.join('data', 'Politics', 'Linke_PR.txt')
#     path.join('data', 'Politics', 'Linke_Fraktion.txt')
]

gruene = [
    path.join('data', 'Politics', 'Grüne.txt'),
    path.join('data', 'Politics', 'Grüne_Fraktion.txt')
]

fdp = [
    path.join('data', 'Politics', 'FDP.txt'),
    path.join('data', 'Politics', 'FDP_Fraktion.txt')
]

cdu = [
    path.join('data', 'Politics', 'CDU.txt')
#     path.join('data', 'Politics', 'CDU_Fraktion.txt')
]

npd = [
    path.join('data', 'Politics', 'NPD_MV.txt')
#     path.join('data', 'Politics', 'NPD_Sachsen.txt'),
#     path.join('data', 'Politics', 'NPD_Jung.txt')
]

files = [file for fp in [spd, linke, gruene, fdp, cdu, npd] for file in fp]

base_corpus = list(FileCorpus(files).sentences_token(stopwords=stopwords))
base = Word2Vec(workers=4, iter=6, size=200, window=3)
base.build_vocab(base_corpus)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 79711 words, keeping 18385 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 163674 words, keeping 27963 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 238232 words, keeping 36915 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 310327 words, keeping 43511 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 388556 words, keeping 50598 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 471836 words, keeping 56986 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 552429 words, keeping 62121 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 633607 words, keeping 66

Save model to disk. Don't finalize the model because we need to train it with new data later!

In [20]:
# base.save(path.join('models', 'base.w2v'))

## Training

In [21]:
from copy import deepcopy

corpora = [
    FileCorpus(linke),
    FileCorpus(spd),
    FileCorpus(gruene), 
    FileCorpus(fdp), 
    FileCorpus(cdu), 
    FileCorpus(npd)
]
models = [deepcopy(base) for i in range(len(corpora))]

for i in range(len(corpora)):
    sentences = list(corpora[i].sentences_token(stopwords=stopwords))
    shuffle(sentences)
    models[i].train(sentences, total_examples=len(sentences))
    print('Loaded corpus with {} sentences.'.format(len(sentences)))

INFO:gensim.models.word2vec:training model with 4 workers on 63177 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 12.71% examples, 340859 words/s
INFO:gensim.models.word2vec:PROGRESS: at 25.55% examples, 342220 words/s
INFO:gensim.models.word2vec:PROGRESS: at 38.48% examples, 343428 words/s
INFO:gensim.models.word2vec:PROGRESS: at 51.21% examples, 343091 words/s
INFO:gensim.models.word2vec:PROGRESS: at 64.05% examples, 343404 words/s
INFO:gensim.models.word2vec:PROGRESS: at 76.74% examples, 342731 words/s
INFO:gensim.models.word2vec:PROGRESS: at 87.82% examples, 336115 words/s
INFO:gensim.models.word2vec:PROGRESS: at 98.41% examples, 329741 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 2834532 raw words took 8.2s, 328910 trained words/s
INFO:gensim.models.word2vec:training model with 4 workers on 63177 vocabulary and 200 features, usin

Loaded corpus with 69350 sentences.
Loaded corpus with 148844 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 63177 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 5.69% examples, 333883 words/s
INFO:gensim.models.word2vec:PROGRESS: at 11.91% examples, 348919 words/s
INFO:gensim.models.word2vec:PROGRESS: at 18.01% examples, 351855 words/s
INFO:gensim.models.word2vec:PROGRESS: at 24.22% examples, 354427 words/s
INFO:gensim.models.word2vec:PROGRESS: at 29.90% examples, 350101 words/s
INFO:gensim.models.word2vec:PROGRESS: at 35.47% examples, 346063 words/s
INFO:gensim.models.word2vec:PROGRESS: at 40.91% examples, 342230 words/s
INFO:gensim.models.word2vec:PROGRESS: at 46.80% examples, 342399 words/s
INFO:gensim.models.word2vec:PROGRESS: at 53.06% examples, 345225 words/s
INFO:gensim.models.word2vec:PROGRESS: at 59.48% examples, 348286 words/s
INFO:gensim.models.word2vec:PROGRESS: at 65.51% examples, 348690 words/s
INFO:gensim.models.word2vec:PROGRESS: at 71.54% examples, 3490


Loaded corpus with 130818 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 63177 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 2.83% examples, 342102 words/s
INFO:gensim.models.word2vec:PROGRESS: at 5.70% examples, 343205 words/s
INFO:gensim.models.word2vec:PROGRESS: at 8.54% examples, 343123 words/s
INFO:gensim.models.word2vec:PROGRESS: at 11.40% examples, 343498 words/s
INFO:gensim.models.word2vec:PROGRESS: at 14.25% examples, 343328 words/s
INFO:gensim.models.word2vec:PROGRESS: at 17.14% examples, 344295 words/s
INFO:gensim.models.word2vec:PROGRESS: at 20.06% examples, 345281 words/s
INFO:gensim.models.word2vec:PROGRESS: at 22.99% examples, 346110 words/s
INFO:gensim.models.word2vec:PROGRESS: at 25.87% examples, 346233 words/s
INFO:gensim.models.word2vec:PROGRESS: at 28.74% examples, 346412 words/s
INFO:gensim.models.word2vec:PROGRESS: at 31.60% examples, 346179 words/s
INFO:gensim.models.word2vec:PROGRESS: at 34.47% examples, 346135


Loaded corpus with 360090 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 63177 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 119472 raw words took 0.3s, 357087 trained words/s



Loaded corpus with 2790 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 63177 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 41.21% examples, 373132 words/s
INFO:gensim.models.word2vec:PROGRESS: at 82.42% examples, 373194 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 990702 raw words took 2.4s, 374707 trained words/s



Loaded corpus with 19776 sentences.


## Classification

In [22]:
labels = ['2015KW44', '2015KW45', '2015KW46', '2015KW47', '2015KW48', '2015KW49', '2015KW50', '2015KW51', 
          '2015KW52', '2015KW53', '2016KW01']
files = [path.join('data', 'CurrentNews', '{}.csv').format(label) for label in labels]
output = [path.join('data', 'CurrentNews', 'Sentiment_{}.csv').format(label) for label in labels]

In [27]:
import pandas as pd
import numpy as np


def calc_score(doc, mod):
    model = Word2Vec.load(mod)
    score = model.score(doc, len(doc))
    return score

def calc_probability(df, mods):
    docs = list(ListCorpus(list(df.loc[:, 'text'])).doc_sentences_token(stopwords=stopwords))
    sentlist = [s for d in docs for s in d]
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    lhd = np.exp(llhd - llhd.max(axis=0))
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

def process(data):
    sentiment = calc_probability(data, models)
    return sentiment

# KW = pd.read_csv(path.join('data', 'CurrentNews', '2015KW45.csv'), sep='|', encoding='utf-8')
# prob = calc_probability(KW, models)
# # prob = prob.div(prob.sum(axis=1), axis=0)
# # prob = prob.sub(.16, axis=0)
# KW = pd.concat([KW, prob], axis=1)

for file, out in zip(files, output):
    data = pd.read_csv(file, sep='|', encoding='utf-8')
    sentiment = process(data)
    csv = pd.concat([data, sentiment], axis=1)
    csv.rename(columns={ 0: 'Linke', 1: 'SPD', 2: 'Gruene', 3: 'FDP', 4: 'CDU', 5: 'NPD'  }, inplace=True)
    csv.to_csv(out, index=False, encoding='utf-8', sep='|')

INFO:gensim.models.word2vec:scoring sentences with 4 workers on 63177 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.7s, 99499 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 63177 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.7s, 97284 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 63177 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.7s, 97350 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 63177 vocabu

In [17]:
KW.groupby('site').mean()

Unnamed: 0_level_0,0,1,2,3,4,5
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Deutsche Stimme,0.143481,0.152666,0.055975,0.074863,0.127574,0.445442
FAZ,0.190342,0.141653,0.139456,0.165006,0.220801,0.142741
Focus,0.185337,0.137742,0.127596,0.13899,0.221386,0.18895
Frankfurter Rundschau,0.154533,0.141748,0.119125,0.163003,0.242871,0.178719
Golem,0.074173,0.218491,0.121137,0.195631,0.267314,0.123253
Handelsblatt,0.170205,0.154098,0.134361,0.169581,0.230604,0.141151
Heise,0.087215,0.239453,0.144817,0.164097,0.253515,0.110903
Huffington Post,0.1941,0.115522,0.150165,0.18488,0.207407,0.147926
Junge Freiheit,0.17156,0.175352,0.089988,0.153248,0.182936,0.226916
Junge Welt,0.275806,0.114122,0.125237,0.116283,0.215179,0.153374


In [26]:
KW.groupby('site').mean()

Unnamed: 0_level_0,0,1,2,3,4,5
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Deutsche Stimme,0.134316,0.133785,0.095304,0.076921,0.132382,0.427291
FAZ,0.193461,0.132037,0.162553,0.156245,0.209193,0.146512
Focus,0.180777,0.129777,0.139876,0.152575,0.212519,0.184477
Frankfurter Rundschau,0.163146,0.135052,0.124147,0.149812,0.236434,0.191409
Golem,0.066001,0.22706,0.154085,0.136155,0.280229,0.13647
Handelsblatt,0.154881,0.147685,0.164981,0.173967,0.206587,0.151899
Heise,0.085268,0.204673,0.157906,0.189583,0.257963,0.104608
Huffington Post,0.185607,0.112101,0.164072,0.181179,0.211596,0.145445
Junge Freiheit,0.156194,0.160524,0.101973,0.153493,0.195265,0.232551
Junge Welt,0.25623,0.108804,0.155928,0.125893,0.198824,0.154321
