# Calculate Political Opinion Models

In [92]:
import logging
# Configure the root logger so that gensim's INFO-level progress messages show
# up in the notebook output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# Set the level through setLevel() rather than by assigning `root.level`:
# the setter also clears the logging module's isEnabledFor() cache
# (Python 3.7+), so the new level is honoured by loggers that were already
# used. It is kept as a separate call because basicConfig() does nothing when
# the root logger already has handlers.
logging.getLogger().setLevel(logging.INFO)

from os import path
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from gensim.models.word2vec import LineSentence, Word2Vec

# Stop-word list read from data/german.txt (presumably German, going by the
# file name). It is passed as `stopwords=` to the corpus tokenisation calls
# further down in this notebook.
stopwords = load_stopwords(path.join('data', 'german.txt'))

## Training the Base Model

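Build the base model. It is "empty" in the sense that only its vocabulary is built from all party corpora and no training happens yet; it is later copied and trained once per party to obtain the classification models.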

In [93]:
from corputil import FileCorpus
from gensim.models.word2vec import Word2Vec

def _politics(*file_names):
    """Return the paths of the given files inside the data/Politics directory."""
    return [path.join('data', 'Politics', file_name) for file_name in file_names]


# One list of source files per party. Files listed as "disabled" are currently
# excluded from the corpus.
spd = _politics(
    'SPD_EU.txt',
    'SPD_Fraktion.txt',
    # disabled: 'SPD_Vorwärts_Inland.txt', 'SPD_Vorwärts_International.txt'
    'SPD_Vorwärts_Parteileben.txt',
)

# disabled: 'Linke.txt', 'Linke_Fraktion.txt'
linke = _politics('Linke_PR.txt')

gruene = _politics('Grüne.txt', 'Grüne_Fraktion.txt')

fdp = _politics('FDP.txt', 'FDP_Fraktion.txt')

cdu = _politics('CDU.txt', 'CDU_Fraktion.txt')

# disabled: 'NPD_Jung.txt'
npd = _politics('NPD_MV.txt', 'NPD_Sachsen.txt')

# All party files in one flat list, party by party.
files = spd + linke + gruene + fdp + cdu + npd

# Tokenised sentences of every party file, materialised as a list.
base_corpus = list(FileCorpus(files).sentences_token(stopwords=stopwords))

# Only the vocabulary is built here; no training happens in this cell.
base = Word2Vec(size=200, window=10, workers=4, iter=4)
base.build_vocab(base_corpus)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 82452 words, keeping 18390 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 168831 words, keeping 27968 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 244315 words, keeping 36920 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 316825 words, keeping 43516 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 395523 words, keeping 50603 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 479198 words, keeping 56991 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 560202 words, keeping 62126 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 641777 words, keeping 66

Save the base model to disk (the call below is currently commented out). Do not finalize the model, because it still has to be trained on new data later!

In [94]:
# base.save(path.join('models', 'base.w2v'))

## Training

In [95]:
from copy import deepcopy

# One corpus per party; the order here fixes the order of `models`.
corpora = [FileCorpus(party_files) for party_files in (spd, linke, gruene, fdp, cdu, npd)]

# Every party gets its own independent copy of the base model.
models = [deepcopy(base) for _ in corpora]

# Train each copy on the sentences of its party's corpus only.
for model, corpus in zip(models, corpora):
    sentences = list(corpus.sentences_token(stopwords=stopwords))
    model.train(sentences, total_examples=len(sentences))
    print('Loaded corpus with {} sentences.'.format(len(sentences)))

INFO:gensim.models.word2vec:training model with 4 workers on 68366 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 5.58% examples, 253557 words/s
INFO:gensim.models.word2vec:PROGRESS: at 11.30% examples, 256422 words/s
INFO:gensim.models.word2vec:PROGRESS: at 16.80% examples, 256083 words/s
INFO:gensim.models.word2vec:PROGRESS: at 22.46% examples, 250950 words/s
INFO:gensim.models.word2vec:PROGRESS: at 28.27% examples, 248763 words/s
INFO:gensim.models.word2vec:PROGRESS: at 33.91% examples, 247947 words/s
INFO:gensim.models.word2vec:PROGRESS: at 38.51% examples, 243325 words/s
INFO:gensim.models.word2vec:PROGRESS: at 42.53% examples, 236374 words/s
INFO:gensim.models.word2vec:PROGRESS: at 48.24% examples, 235301 words/s
INFO:gensim.models.word2vec:PROGRESS: at 52.94% examples, 231796 words/s
INFO:gensim.models.word2vec:PROGRESS: at 57.43% examples, 228001 words/s
INFO:gensim.models.word2vec:PROGRESS: at 62.13% examples, 2273

Loaded corpus with 148844 sentences.
Loaded corpus with 24315 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 68366 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 4.83% examples, 183982 words/s
INFO:gensim.models.word2vec:PROGRESS: at 9.92% examples, 192715 words/s
INFO:gensim.models.word2vec:PROGRESS: at 15.84% examples, 206607 words/s
INFO:gensim.models.word2vec:PROGRESS: at 21.44% examples, 211151 words/s
INFO:gensim.models.word2vec:PROGRESS: at 27.79% examples, 218849 words/s
INFO:gensim.models.word2vec:PROGRESS: at 34.11% examples, 223197 words/s
INFO:gensim.models.word2vec:PROGRESS: at 39.52% examples, 221748 words/s
INFO:gensim.models.word2vec:PROGRESS: at 44.87% examples, 220961 words/s
INFO:gensim.models.word2vec:PROGRESS: at 50.81% examples, 222592 words/s
INFO:gensim.models.word2vec:PROGRESS: at 56.89% examples, 223652 words/s
INFO:gensim.models.word2vec:PROGRESS: at 63.18% examples, 226027 words/s
INFO:gensim.models.word2vec:PROGRESS: at 68.38% examples, 22460


Loaded corpus with 130818 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 68366 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 3.09% examples, 224782 words/s
INFO:gensim.models.word2vec:PROGRESS: at 5.40% examples, 201940 words/s
INFO:gensim.models.word2vec:PROGRESS: at 8.18% examples, 201231 words/s
INFO:gensim.models.word2vec:PROGRESS: at 10.71% examples, 201111 words/s
INFO:gensim.models.word2vec:PROGRESS: at 13.48% examples, 202751 words/s
INFO:gensim.models.word2vec:PROGRESS: at 16.41% examples, 208472 words/s
INFO:gensim.models.word2vec:PROGRESS: at 19.15% examples, 211718 words/s
INFO:gensim.models.word2vec:PROGRESS: at 21.67% examples, 214655 words/s
INFO:gensim.models.word2vec:PROGRESS: at 24.42% examples, 218790 words/s
INFO:gensim.models.word2vec:PROGRESS: at 27.78% examples, 223034 words/s
INFO:gensim.models.word2vec:PROGRESS: at 30.58% examples, 221938 words/s
INFO:gensim.models.word2vec:PROGRESS: at 33.92% examples, 223819


Loaded corpus with 360090 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 68366 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 12.82% examples, 249325 words/s
INFO:gensim.models.word2vec:PROGRESS: at 25.72% examples, 251647 words/s
INFO:gensim.models.word2vec:PROGRESS: at 38.77% examples, 252570 words/s
INFO:gensim.models.word2vec:PROGRESS: at 51.60% examples, 252584 words/s
INFO:gensim.models.word2vec:PROGRESS: at 64.68% examples, 253080 words/s
INFO:gensim.models.word2vec:PROGRESS: at 77.47% examples, 252931 words/s
INFO:gensim.models.word2vec:PROGRESS: at 90.56% examples, 253264 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 2066628 raw words took 7.8s, 252943 trained words/s



Loaded corpus with 67057 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 68366 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 13.67% examples, 228695 words/s
INFO:gensim.models.word2vec:PROGRESS: at 24.34% examples, 222867 words/s
INFO:gensim.models.word2vec:PROGRESS: at 38.36% examples, 227120 words/s
INFO:gensim.models.word2vec:PROGRESS: at 49.29% examples, 226032 words/s
INFO:gensim.models.word2vec:PROGRESS: at 63.31% examples, 227678 words/s
INFO:gensim.models.word2vec:PROGRESS: at 74.13% examples, 226404 words/s
INFO:gensim.models.word2vec:PROGRESS: at 88.05% examples, 227432 words/s
INFO:gensim.models.word2vec:PROGRESS: at 98.82% examples, 226509 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 1993380 raw words took 8.1s, 226404 trained words/s



Loaded corpus with 50114 sentences.


## Classification

In [96]:
import pandas as pd
import numpy as np


def calc_score(doc, mod):
    """Score ``doc`` with the Word2Vec model stored at ``mod``.

    ``mod`` is handed to ``Word2Vec.load``; ``doc`` is passed to the loaded
    model's ``score`` method together with ``len(doc)`` as second argument.
    The result of ``score`` is returned unchanged.
    """
    return Word2Vec.load(mod).score(doc, len(doc))

def calc_probability(df, mods):
    """Average, per document, how strongly each model in ``mods`` matches it.

    The ``text`` column of ``df`` is tokenised into sentences per document,
    every sentence is scored by every model, the scores are normalised across
    the models for each sentence, and the normalised values are averaged over
    the sentences of each document.

    Returns a DataFrame with one column per model (in the order of ``mods``)
    and an index named ``doc`` holding the position of the document in ``df``.
    Documents that yield no sentences have no row in the result.
    """
    texts = list(df.loc[:, 'text'])
    docs = list(ListCorpus(texts).doc_sentences_token(stopwords=stopwords))

    # Flatten to one sentence list and remember which document each one
    # belongs to.
    sentlist = []
    doc_ids = []
    for doc_id, doc in enumerate(docs):
        for sentence in doc:
            sentlist.append(sentence)
            doc_ids.append(doc_id)

    # One row per model, one column per sentence.
    # NOTE(review): score() presumably returns log-likelihoods - confirm
    # against the gensim version in use.
    llhd = np.array([m.score(sentlist, len(sentlist)) for m in mods])
    # Shift by the per-sentence maximum before exponentiating so that the
    # largest term of every sentence is exp(0).
    lhd = np.exp(llhd - llhd.max(axis=0))
    shares = lhd / lhd.sum(axis=0)

    prob = pd.DataFrame(shares.transpose())
    prob["doc"] = doc_ids
    return prob.groupby("doc").mean()

# Load the news articles from a '|'-separated CSV file.
news_file = path.join('data', 'CurrentNews', 'All.csv')
KW = pd.read_csv(news_file, sep='|', encoding='utf-8')

# Per-article model probabilities, rescaled so that every row sums to 1.
prob = calc_probability(KW, models).pipe(lambda p: p.div(p.sum(axis=1), axis=0))

# Append the probability columns to the article table (aligned on the index).
KW = pd.concat([KW, prob], axis=1)

INFO:gensim.models.word2vec:scoring sentences with 4 workers on 68366 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 6320000.00% sentences, 63166 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 12350000.00% sentences, 61713 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 19480000.00% sentences, 64906 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 26580000.00% sentences, 66416 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 33480000.00% sentences, 66928 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 39530000.00% sentences, 65853 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 46040000.00% sentences, 65734 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 52890000.00% sentences, 66072 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 60380000.00% sentences, 67038 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 67530000.00% sentences, 67480 sentences/s
INFO:gensim.models.word2vec:reac

In [97]:
# Mean probability per model for every news site. Only the probability
# columns are aggregated: averaging the whole frame would also touch the
# non-numeric columns (e.g. 'text'), which raises a TypeError on pandas >= 2.0.
KW.groupby('site')[list(prob.columns)].mean()

Unnamed: 0_level_0,0,1,2,3,4,5
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berliner Zeitung,0.031824,0.330987,0.296954,0.003738,0.332794,0.003703
Bild,0.121541,0.166916,0.177776,0.182734,0.144807,0.206226
Der Postillon,0.109747,0.261821,0.112888,0.131939,0.133038,0.250568
Deutsche Stimme,0.074891,0.170481,0.085498,0.13091,0.071609,0.46661
FAZ,0.149475,0.179004,0.144852,0.187633,0.140829,0.198206
Focus,0.134077,0.186887,0.145099,0.16921,0.153742,0.210984
Frankfurter Rundschau,0.141775,0.184357,0.150008,0.180776,0.134988,0.208097
Golem,0.23641,0.109015,0.193146,0.198986,0.123087,0.139356
Handelsblatt,0.152741,0.179764,0.171003,0.180354,0.147345,0.168793
Heise,0.227769,0.130647,0.192854,0.170791,0.13866,0.139278
