# Calculate Political Opinion Models

In [1]:
import logging
# Timestamped log lines so the gensim progress output below can be followed.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# Use the public setter instead of assigning `root.level` directly: setLevel()
# validates the level and keeps the logging module's internal state in sync.
# (basicConfig(level=...) is not used because it does nothing when the root
# logger already has handlers attached.)
logging.getLogger().setLevel(logging.INFO)

from os import path
from random import shuffle
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from gensim.models.word2vec import LineSentence, Word2Vec

# Stop-word list that is passed as `stopwords=` to the corpus tokenisation calls below.
stopwords_path = path.join('data', 'german.txt')
stopwords = load_stopwords(stopwords_path)

INFO:gensim.utils:detected Windows; aliasing chunkize to chunkize_serial
INFO:gensim.corpora.sharded_corpus:Could not import Theano, will use standard float for default ShardedCorpus dtype.
INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


## Training the Base Model

Build the (still untrained) base model: only its vocabulary is constructed here. It later serves as the common starting point for training the per-party classification models.

In [2]:
def _party_files(*names):
    """Return the paths of the given file names inside data/Politics."""
    return [path.join('data', 'Politics', name) for name in names]


# One list of source files per party. Entries that are commented out are
# deliberately excluded from the corpora.
spd = _party_files(
    'SPD_EU.txt',
    'SPD_Fraktion.txt',
    # 'SPD_Vorwärts_Inland.txt',
    # 'SPD_Vorwärts_International.txt',
    'SPD_Vorwärts_Parteileben.txt',
)

linke = _party_files(
    # 'Linke.txt',
    'Linke_PR.txt',
    # 'Linke_Fraktion.txt',
)

gruene = _party_files('Grüne.txt', 'Grüne_Fraktion.txt')

fdp = _party_files('FDP.txt', 'FDP_Fraktion.txt')

cdu = _party_files('CDU.txt', 'CDU_Fraktion.txt')

npd = _party_files(
    'NPD_MV.txt',
    # 'NPD_Sachsen.txt',
    # 'NPD_Jung.txt',
)

# Flatten the per-party file lists into a single list covering all parties.
files = [file for fp in [spd, linke, gruene, fdp, cdu, npd] for file in fp]

# Materialise the sentences as a list so the corpus can be handed to
# build_vocab as a plain in-memory sequence.
base_corpus = list(FileCorpus(files).sentences_token(stopwords=stopwords))

# The classification step calls Word2Vec.score(), which gensim only implements
# for hierarchical softmax (hs=1, negative=0). The recorded training log shows
# exactly "sg=1 hs=1 sample=0 and negative=0", but those values were library
# defaults; spell them out so the notebook does not silently change behaviour
# (or fail in score()) under a gensim release with different defaults.
base = Word2Vec(
    workers=4,
    iter=4,
    size=200,
    window=10,
    sg=1,
    hs=1,
    negative=0,
    sample=0,
)
# Only the vocabulary is built here; no training happens on the base model.
base.build_vocab(base_corpus)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 82452 words, keeping 18390 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 168831 words, keeping 27968 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 244315 words, keeping 36920 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 316825 words, keeping 43516 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 395523 words, keeping 50603 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 479198 words, keeping 56991 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 560202 words, keeping 62126 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 641777 words, keeping 66

Optionally save the base model to disk (the call below is currently commented out). Don't finalize the model, because it still needs to be trained on new data later!

In [3]:
# base.save(path.join('models', 'base.w2v'))

## Training

In [4]:
from copy import deepcopy

# One corpus per party. The order used here determines the position of each
# party's model inside `models`.
corpora = [
    FileCorpus(linke),
    FileCorpus(spd),
    FileCorpus(gruene),
    FileCorpus(fdp),
    FileCorpus(cdu),
    FileCorpus(npd),
]
# Every party gets its own independent copy of the base model.
models = [deepcopy(base) for _ in corpora]

for party_model, party_corpus in zip(models, corpora):
    sentences = list(party_corpus.sentences_token(stopwords=stopwords))
    # Randomise the sentence order before training.
    shuffle(sentences)
    party_model.train(sentences, total_examples=len(sentences))
    print('Loaded corpus with {} sentences.'.format(len(sentences)))

INFO:gensim.models.word2vec:training model with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 38.86% examples, 276487 words/s
INFO:gensim.models.word2vec:PROGRESS: at 78.45% examples, 278216 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 749636 raw words took 2.5s, 278882 trained words/s
INFO:gensim.models.word2vec:training model with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 6.48% examples, 280777 words/s
INFO:gensim.models.word2vec:PROGRESS: at 13.05% examples, 283775 words/s
INFO:gensim.models.word2vec:PROGRESS: at 19.52% examples, 282744 words/s
INFO:gensim.models.word2vec:PROGRESS: at 25.40% examples, 276200 words/s
INFO:gensim.models.word2vec:PROGRESS: at 31.95% examples, 277546 words/s
INFO:gensim.models.word2vec:PROGRESS: a

Loaded corpus with 24315 sentences.
Loaded corpus with 148844 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 6.99% examples, 276882 words/s
INFO:gensim.models.word2vec:PROGRESS: at 14.12% examples, 279342 words/s
INFO:gensim.models.word2vec:PROGRESS: at 21.31% examples, 280033 words/s
INFO:gensim.models.word2vec:PROGRESS: at 28.49% examples, 280255 words/s
INFO:gensim.models.word2vec:PROGRESS: at 35.66% examples, 280946 words/s
INFO:gensim.models.word2vec:PROGRESS: at 42.77% examples, 280698 words/s
INFO:gensim.models.word2vec:PROGRESS: at 49.67% examples, 279217 words/s
INFO:gensim.models.word2vec:PROGRESS: at 56.57% examples, 278404 words/s
INFO:gensim.models.word2vec:PROGRESS: at 62.45% examples, 273319 words/s
INFO:gensim.models.word2vec:PROGRESS: at 69.14% examples, 272181 words/s
INFO:gensim.models.word2vec:PROGRESS: at 75.56% examples, 270241 words/s
INFO:gensim.models.word2vec:PROGRESS: at 81.79% examples, 2683


Loaded corpus with 130818 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 3.62% examples, 293802 words/s
INFO:gensim.models.word2vec:PROGRESS: at 7.32% examples, 296032 words/s
INFO:gensim.models.word2vec:PROGRESS: at 11.05% examples, 297961 words/s
INFO:gensim.models.word2vec:PROGRESS: at 14.71% examples, 297286 words/s
INFO:gensim.models.word2vec:PROGRESS: at 18.44% examples, 297978 words/s
INFO:gensim.models.word2vec:PROGRESS: at 22.08% examples, 297714 words/s
INFO:gensim.models.word2vec:PROGRESS: at 25.76% examples, 297783 words/s
INFO:gensim.models.word2vec:PROGRESS: at 29.26% examples, 295972 words/s
INFO:gensim.models.word2vec:PROGRESS: at 32.78% examples, 294664 words/s
INFO:gensim.models.word2vec:PROGRESS: at 36.46% examples, 294802 words/s
INFO:gensim.models.word2vec:PROGRESS: at 40.00% examples, 294108 words/s
INFO:gensim.models.word2vec:PROGRESS: at 43.50% examples, 29316


Loaded corpus with 360090 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 14.47% examples, 283285 words/s
INFO:gensim.models.word2vec:PROGRESS: at 29.19% examples, 285826 words/s
INFO:gensim.models.word2vec:PROGRESS: at 43.96% examples, 286557 words/s
INFO:gensim.models.word2vec:PROGRESS: at 58.68% examples, 287063 words/s
INFO:gensim.models.word2vec:PROGRESS: at 73.44% examples, 287215 words/s
INFO:gensim.models.word2vec:PROGRESS: at 88.32% examples, 287944 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 2066628 raw words took 6.8s, 287823 trained words/s



Loaded corpus with 67057 sentences.

INFO:gensim.models.word2vec:training model with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 44.12% examples, 266928 words/s
INFO:gensim.models.word2vec:PROGRESS: at 90.01% examples, 271696 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:training on 662088 raw words took 2.2s, 270954 trained words/s



Loaded corpus with 19776 sentences.


## Classification

In [5]:
# Labels of the weekly news dumps to classify; each label names one CSV file.
labels = [
    '2015KW44', '2015KW45', '2015KW46', '2015KW47',
    '2015KW48', '2015KW49', '2015KW50', '2015KW51',
]
# Input CSV per label and the matching output file for the scored result.
files = [path.join('data', 'CurrentNews', '{}.csv'.format(label)) for label in labels]
output = [path.join('data', 'CurrentNews', 'Sentiment_{}.csv'.format(label)) for label in labels]

In [8]:
import pandas as pd
import numpy as np


def calc_score(doc, mod):
    """Score `doc` with the Word2Vec model stored at `mod`.

    The model is loaded from `mod` on every call via ``Word2Vec.load`` and its
    ``score`` method is invoked with `doc` and ``len(doc)``.

    Returns whatever ``score`` returns.
    """
    return Word2Vec.load(mod).score(doc, len(doc))

def calc_probability(df, mods):
    """Average, per document, the normalised sentence likelihood under each model.

    Parameters
    ----------
    df : DataFrame with a 'text' column; one document per row.
    mods : sequence of models exposing ``score(sentences, total)``.

    Returns
    -------
    DataFrame indexed by document position ("doc") with one column per model
    (labelled 0 .. len(mods) - 1). Each value is the mean over the document's
    sentences of that model's share of the summed likelihood. Documents that
    yield no sentences have no row in the result.
    """
    texts = list(df.loc[:, 'text'])
    tokenised_docs = list(ListCorpus(texts).doc_sentences_token(stopwords=stopwords))

    # Flatten to one sentence list and remember which document each sentence came from.
    sentences = []
    doc_of_sentence = []
    for doc_idx, doc in enumerate(tokenised_docs):
        for sentence in doc:
            sentences.append(sentence)
            doc_of_sentence.append(doc_idx)

    # Rows: models, columns: sentences.
    n_sentences = len(sentences)
    log_likelihood = np.array([model.score(sentences, n_sentences) for model in mods])

    # Shift by the per-sentence maximum before exponentiating, then normalise
    # across models so the values for each sentence sum to one.
    likelihood = np.exp(log_likelihood - log_likelihood.max(axis=0))
    share = likelihood / likelihood.sum(axis=0)

    per_sentence = pd.DataFrame(share.transpose())
    per_sentence["doc"] = doc_of_sentence
    return per_sentence.groupby("doc").mean()

def process(data):
    """Return ``calc_probability(data, models)`` using the module-level `models` list."""
    return calc_probability(data, models)

# KW = pd.read_csv(path.join('data', 'CurrentNews', 'All.csv'), sep='|', encoding='utf-8')
# prob = calc_probability(KW, models)
# prob = prob.div(prob.sum(axis=1), axis=0)
# KW = pd.concat([KW, prob], axis=1)

# Column labels for the per-model scores; the integer keys are the positions
# of the party models in `models`.
party_columns = {0: 'Linke', 1: 'SPD', 2: 'Gruene', 3: 'FDP', 4: 'CDU', 5: 'NPD'  }

for news_path, scored_path in zip(files, output):
    news = pd.read_csv(news_path, sep='|', encoding='utf-8')
    scores = process(news)
    # Put the score columns next to the original news columns (aligned on the index).
    scored = pd.concat([news, scores], axis=1).rename(columns=party_columns)
    scored.to_csv(scored_path, index=False, encoding='utf-8', sep='|')

INFO:gensim.models.word2vec:scoring sentences with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.9s, 82723 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.8s, 86434 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 64721 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:scoring 72146 sentences took 0.8s, 84911 sentences/s
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 64721 vocabu