# Calculate Political Opinion Models

In [1]:
import logging
# Route log records to the console with a timestamp, level and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# Set the root level through the public API instead of assigning to `.level`:
# setLevel() validates the value and invalidates the cached isEnabledFor()
# results that a plain attribute assignment would leave stale.
logging.getLogger().setLevel(logging.INFO)

from os import path
from random import shuffle
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from gensim.models.word2vec import LineSentence, Word2Vec

# Stopword collection handed to the corpus tokenisers in the cells below
# (presumably German stopwords, judging by the file name -- confirm).
stopwords = load_stopwords(path.join('data', 'german.txt'))

INFO:gensim.utils:detected Windows; aliasing chunkize to chunkize_serial
INFO:gensim.corpora.sharded_corpus:Could not import Theano, will use standard float for default ShardedCorpus dtype.
INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


In [2]:
def _politics_files(*names):
    """Return the paths of the given file names inside ``data/Politics``."""
    return [path.join('data', 'Politics', name) for name in names]


# Text files used as training material for each party. The file names in the
# trailing notes appear in this notebook only as commented-out entries and are
# therefore not part of the lists.
spd = _politics_files(
    'SPD_EU.txt',
    'SPD_Fraktion.txt',
    'SPD_Vorwärts_Parteileben.txt'
)  # not used: SPD_Vorwärts_Inland.txt, SPD_Vorwärts_International.txt

linke = _politics_files('Linke.txt', 'Linke_PR.txt')  # not used: Linke_Fraktion.txt

gruene = _politics_files('Grüne.txt', 'Grüne_Fraktion.txt')

fdp = _politics_files('FDP.txt')  # not used: FDP_Fraktion.txt

cdu = _politics_files('CDU.txt')  # not used: CDU_Fraktion.txt

npd = _politics_files('NPD_MV.txt')  # not used: NPD_Sachsen.txt, NPD_Jung.txt

# One corpus per party, built from that party's file list. The order must
# match `parties`, because the two lists are paired up position by position
# when the per-party models are trained.
corpora = [FileCorpus(files) for files in (linke, spd, gruene, fdp, cdu, npd)]

# Party labels, in the same order as `corpora`.
parties = ['Linke', 'SPD', 'Gruene', 'FDP', 'CDU', 'NPD']

## Training the Base Model

Calculate the base model (from the German wiki corpus), which is later used as the starting point for training the per-party classification models.

In [3]:
# Sentence source for the base model: the cropped wiki text file.
sentences = LineSentence(path.join('data', 'Archive', 'Cropped_Wiki.txt'))

# Spell out the training scheme instead of relying on library defaults: the
# scoring step later in this notebook calls Word2Vec.score(), which gensim
# only implements for hierarchical softmax without negative sampling. The
# values match the configuration reported in this notebook's training log
# (sg=1 hs=1 sample=0 negative=0).
base = Word2Vec(
    sentences,
    workers=4,
    iter=4,
    size=200,
    window=2,
    min_count=5,
    sg=1,
    hs=1,
    negative=0,
    sample=0,
)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 152624 words, keeping 54082 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 274968 words, keeping 83779 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 390688 words, keeping 105425 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 504388 words, keeping 123888 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 621422 words, keeping 141102 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 736469 words, keeping 155724 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 850847 words, keeping 171064 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 974518 words, keep

Save the base model to disk. Don't finalize the model, because it still has to be trained further with new data later!

In [4]:
# Persist the base model so each party model can later start from it.
base.save(path.join('models', 'word2vec', 'Base.w2v'))

# Drop both references; the names stay defined but no longer point at the
# model or the sentence source.
base = sentences = None

INFO:gensim.utils:saving Word2Vec object under models\word2vec\Base.w2v, separately None
INFO:gensim.utils:storing numpy array 'syn0' to models\word2vec\Base.w2v.syn0.npy
INFO:gensim.utils:storing numpy array 'syn1' to models\word2vec\Base.w2v.syn1.npy
INFO:gensim.utils:not storing attribute syn0norm
INFO:gensim.utils:not storing attribute cum_table


## Training the Classifier

In [5]:
# Train one model per party, each starting from the saved base model.
for party, corpus in zip(parties, corpora):
    # Materialise the tokenised sentences so they can be shuffled in place and
    # counted (the stopword list is passed through to the corpus tokeniser).
    sentences = list(corpus.sentences_token(stopwords=stopwords))
    # No seed is set in the visible cells, so the order differs between runs.
    shuffle(sentences)
    # Reload the base model from disk on every iteration so that each party
    # starts from the same saved state rather than from the previous party's
    # trained model.
    model = Word2Vec.load(path.join('models', 'word2vec', 'Base.w2v'))
    model.train(sentences, total_examples=len(sentences))
    # Saved as models/word2vec/<party>.w2v.
    model.save(path.join('models', 'word2vec', '{}.w2v'.format(party)))

INFO:gensim.utils:loading Word2Vec object from models\word2vec\Base.w2v
INFO:gensim.utils:loading syn0 from models\word2vec\Base.w2v.syn0.npy with mmap=None
INFO:gensim.utils:loading syn1 from models\word2vec\Base.w2v.syn1.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.models.word2vec:training model with 4 workers on 2156505 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 16.11% examples, 300799 words/s
INFO:gensim.models.word2vec:PROGRESS: at 33.02% examples, 307365 words/s
INFO:gensim.models.word2vec:PROGRESS: at 50.50% examples, 312727 words/s
INFO:gensim.models.word2vec:PROGRESS: at 67.27% examples, 312398 words/s
INFO:gensim.models.word2vec:PROGRESS: at 84.68% examples, 314845 words/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 11 outstanding jobs
INFO:gensim.models.word2vec:training on 1903

## Political Ideology Detection

Collect the file paths of the per-party models; the models themselves and the documents are loaded into memory in the cells below.

In [6]:
# File paths (not loaded models) of the per-party models, in `parties` order.
models = [
    path.join('models', 'word2vec', '{}.w2v'.format(name))
    for name in parties
]

In [8]:
import pandas as pd
import numpy as np


def calc_score(doc, mod):
    """Score a collection of sentences with a saved Word2Vec model.

    Parameters
    ----------
    doc : sized collection of sentences
        Passed to ``Word2Vec.score``; its length is given as the total
        number of sentences.
    mod : str
        Path of the saved model. The model is loaded from disk on every call.

    Returns
    -------
    The result of ``Word2Vec.score`` for ``doc`` (presumably one
    log-likelihood per sentence -- confirm against the gensim documentation).
    """
    return Word2Vec.load(mod).score(doc, len(doc))

def calc_probability(df, mods):
    """Compute, per document, the mean normalised score under each model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column holding the documents.
    mods : list
        Model file paths; each is passed to ``calc_score``.

    Returns
    -------
    pandas.DataFrame
        One row per document that produced at least one sentence (indexed by
        the document's position, index name ``"doc"``) and one integer-named
        column per entry of ``mods``, in the same order.
    """
    corpus = ListCorpus(list(df.loc[:, 'text']))
    docs = list(corpus.doc_sentences_token(stopwords=stopwords))

    # Flatten the documents into one sentence list and remember, for every
    # sentence, the position of the document it came from.
    sentlist = []
    doc_ids = []
    for doc_id, doc_sentences in enumerate(docs):
        for sentence in doc_sentences:
            sentlist.append(sentence)
            doc_ids.append(doc_id)

    # First axis: models; second axis: the scores returned for `sentlist`.
    scores = np.array([calc_score(sentlist, m) for m in mods])

    # Subtract the maximum across models before exponentiating so the largest
    # exponent is 0, then normalise across models so each column sums to 1.
    weights = np.exp(scores - scores.max(axis=0))
    normalised = weights / weights.sum(axis=0)

    # Transpose to one row per sentence, tag rows with their document, and
    # average the sentence rows of each document.
    prob = pd.DataFrame(normalised.transpose())
    prob["doc"] = doc_ids
    return prob.groupby("doc").mean()

# Read the news articles from a '|'-separated, UTF-8 encoded CSV file
# (the file name suggests calendar week 1 of 2016 -- confirm).
KW = pd.read_csv(path.join('data', 'CurrentNews', '2016KW01.csv'), sep='|', encoding='utf-8')
# Per-document scores, one column per entry of `models`.
prob = calc_probability(KW, models)
# Disabled step: would subtract 100 / len(models) from every value.
# prob = prob.sub((100 / len(models)), axis=0)
# Put the score columns next to the article columns; rows are matched on the
# index, so articles without a row in `prob` get NaN in the new columns.
KW = pd.concat([KW, prob], axis=1)

INFO:gensim.utils:loading Word2Vec object from models\word2vec\Linke.w2v
INFO:gensim.utils:loading syn0 from models\word2vec\Linke.w2v.syn0.npy with mmap=None
INFO:gensim.utils:loading syn1 from models\word2vec\Linke.w2v.syn1.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.models.word2vec:scoring sentences with 4 workers on 2156505 vocabulary and 200 features, using sg=1 hs=1 sample=0 and negative=0
INFO:gensim.models.word2vec:PROGRESS: at 1210000.00% sentences, 12037 sentences/s
INFO:gensim.models.word2vec:PROGRESS: at 4050000.00% sentences, 20152 sentences/s
INFO:gensim.models.word2vec:reached end of input; waiting to finish 12 outstanding jobs
INFO:gensim.models.word2vec:PROGRESS: at 7810000.00% sentences, 25942 sentences/s
INFO:gensim.models.word2vec:scoring 78452 sentences took 3.0s, 25951 sentences/s
INFO:gensim.utils:loading Word2Vec object from models\word2vec\SPD.w2v
INFO:

In [10]:
# Median score per news site. Restrict the aggregation to the score columns
# that came from `prob`: KW still carries text columns, and taking the median
# over those only worked on pandas versions that silently dropped them.
KW.groupby('site')[list(prob.columns)].median()

Unnamed: 0_level_0,0,1,2,3,4,5
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bild,0.141457,0.126284,0.212281,0.107261,0.154974,0.092138
Der Postillon,0.201776,0.148521,0.193378,0.13081,0.09999,0.067246
Deutsche Stimme,0.219374,0.0584,0.112183,0.112963,0.166319,0.277911
FAZ,0.176621,0.145568,0.169291,0.14009,0.122568,0.136129
Focus,0.174534,0.134614,0.157948,0.151015,0.11987,0.135142
Frankfurter Rundschau,0.145506,0.146436,0.183621,0.132461,0.113681,0.121912
Golem,0.07763,0.367036,0.19387,0.076546,0.131123,0.099191
Handelsblatt,0.162933,0.121371,0.202093,0.120124,0.127561,0.116924
Heise,0.068232,0.250945,0.19881,0.113106,0.133286,0.122778
Huffington Post,0.18192,0.145733,0.166875,0.166914,0.124771,0.157203
