# Calculate Political Opinion Models

In [1]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# logging.root.level = logging.INFO

from os import path
from random import shuffle
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from gensim.models.word2vec import LineSentence, Word2Vec

stopwords = load_stopwords(path.join('data', 'german.txt'))

## Training the Base Model

Calculate the base model (empty), that is later used as a base for training the classification models.

In [2]:
spd = [
    path.join('data', 'Politics', 'SPD_EU.txt'),
    path.join('data', 'Politics', 'SPD_Fraktion.txt'),
#     path.join('data', 'Politics', 'SPD_Vorwärts_Inland.txt'),
#     path.join('data', 'Politics', 'SPD_Vorwärts_International.txt'),
    path.join('data', 'Politics', 'SPD_Vorwärts_Parteileben.txt')
]

linke = [
    path.join('data', 'Politics', 'Linke.txt'),
    path.join('data', 'Politics', 'Linke_PR.txt')
#     path.join('data', 'Politics', 'Linke_Fraktion.txt')
]

gruene = [
#     path.join('data', 'Politics', 'Grüne.txt')
   path.join('data', 'Politics', 'Grüne_Fraktion.txt')
]

fdp = [
#     path.join('data', 'Politics', 'FDP.txt'),
    path.join('data', 'Politics', 'FDP_Fraktion.txt')
]

cdu = [
    path.join('data', 'Politics', 'CDU.txt')
#     path.join('data', 'Politics', 'CDU_EU.txt')
#     path.join('data', 'Politics', 'CDU_Fraktion.txt'),
]

npd = [
    path.join('data', 'Politics', 'NPD_MV.txt'),
    path.join('data', 'Politics', 'NPD_Sachsen.txt')
#     path.join('data', 'Politics', 'NPD_Jung.txt')
]

files = [file for fp in [spd, linke, gruene, fdp, cdu, npd] for file in fp]

base_corpus = list(FileCorpus(files).sentences_token(stopwords=stopwords))
base = Word2Vec(workers=4, iter=6, size=100, window=3)
base.build_vocab(base_corpus)

Save model to disk. Don't finalize the model because we need to train it with new data later!

In [3]:
# base.save(path.join('models', 'base.w2v'))

## Training

In [4]:
from copy import deepcopy

corpora = [
    FileCorpus(linke),
    FileCorpus(spd),
    FileCorpus(gruene), 
    FileCorpus(fdp), 
    FileCorpus(cdu), 
    FileCorpus(npd)
]
models = [deepcopy(base) for i in range(len(corpora))]

for i in range(len(corpora)):
    sentences = list(corpora[i].sentences_token(stopwords=stopwords))
    shuffle(sentences)
    models[i].train(sentences, total_examples=len(sentences))
    print('Loaded corpus with {} sentences.'.format(len(sentences)))

Loaded corpus with 69350 sentences.
Loaded corpus with 148844 sentences.
Loaded corpus with 119226 sentences.
Loaded corpus with 314955 sentences.
Loaded corpus with 2790 sentences.
Loaded corpus with 50114 sentences.


## Classification

In [5]:
labels = ['2015-44', '2015-45', '2015-46', '2015-47', '2015-48', '2015-49', '2015-50', '2015-51', 
          '2015-52', '2015-53', '2016-01', '2016-02', '2016-03', '2016-04']
files = [path.join('data', 'CurrentNews', '{}.csv').format(label) for label in labels]

In [11]:
import pandas as pd
import numpy as np


def calc_score(doc, mod):
    model = Word2Vec.load(mod)
    score = model.score(doc, len(doc))
    return score

def calc_probability(df, mods):
    docs = list(ListCorpus(list(df.loc[:, 'text'])).doc_sentences_token(stopwords=stopwords))
    sentlist = [s for d in docs for s in d]
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    print(llhd[0])
    print(len(llhd[0]))
    print(len(sentlist))
    print('###############')
    lhd = np.exp(llhd - llhd.max(axis=0))
    print(lhd[0])
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

raw = pd.concat([pd.read_csv(file, sep='|', encoding='utf-8') for file in files], ignore_index=True)
prob = calc_probability(raw, models)
data = pd.concat([raw, prob], axis=1)

# for file in files:
#     data = pd.read_csv(file, sep='|', encoding='utf-8')
#     sentiment = process(data)
#     csv = pd.concat([data, sentiment], axis=1)
#     csv.rename(columns={ 0: 'Linke', 1: 'SPD', 2: 'Gruene', 3: 'FDP', 4: 'CDU', 5: 'NPD'  }, inplace=True)
#     csv.to_csv(file, index=False, encoding='utf-8', sep='|')

[-542.8927002   -23.33567047 -243.40457153 ..., -478.42745972 -548.00463867
  -62.51273346]
990267
990267
###############
[  1.01652554e-10   5.14200449e-01   6.43656300e-17 ...,   1.00000000e+00
   2.00483640e-11   5.56589924e-02]


In [7]:
# data.groupby('site').mean()

In [8]:
# data.groupby('site').median()

In [114]:
def get_score(df, index):
    df['max'] = df.max(axis=1)
    return (len(df[df['max'] == df[index]])) / (len(df) / 100)

def calc_probability_list(corp, mods):
    docs = list(FileCorpus(corp).doc_sentences_token(stopwords=stopwords))
    sentlist = [s for d in docs for s in d]
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    lhd = np.exp(llhd - llhd.max(axis=0))
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

In [115]:
linke_test = [
    path.join('data', 'Politics', 'Linke_Fraktion.txt')
]
prob = calc_probability_list(linke_test, models)
print('Linke: {}'.format(get_score(prob, 0)))

spd_test = [
    path.join('data', 'Politics', 'SPD_Vorwärts_Inland.txt'),
    path.join('data', 'Politics', 'SPD_Vorwärts_International.txt'),
    path.join('data', 'Politics', 'SPD_Vorwärts_Parteileben.txt')
]
prob = calc_probability_list(spd_test, models)
print('SPD: {}'.format(get_score(prob, 1)))

gruene_test = [
    path.join('data', 'Politics', 'Grüne.txt')
]
prob = calc_probability_list(gruene_test, models)
print('Gruene: {}'.format(get_score(prob, 2)))

fdp_test = [
    path.join('data', 'Politics', 'FDP.txt')
]
prob = calc_probability_list(fdp_test, models)
print('FDP: {}'.format(get_score(prob, 3)))

cdu_test = [
    path.join('data', 'Politics', 'CDU_EU.txt')
]
prob = calc_probability_list(cdu_test, models)
print('CDU: {}'.format(get_score(prob, 4)))

prob = calc_probability(raw[raw['site'] == 'Deutsche Stimme'], models)
print('NPD: {}'.format(get_score(prob, 5)))

Linke: 38.40846366145354
SPD: 41.78999295278365
Gruene: 44.36826640548482
FDP: 99.92962702322308
CDU: 11.213626685592619
NPD: 72.88135593220339


In [None]:
Linke: 39.46642134314627
SPD: 39.93422598073761
Gruene: 46.5230166503428
FDP: 99.92962702322308
CDU: 16.25266146202981
NPD: 74.57627118644068