# Calculate Political Opinion Models

In [None]:
import logging
from os import path
from corputil import FileCorpus
from gensim.models.word2vec import LineSentence, Word2Vec

## Training the Base Model

Calculate the base model (from the German wiki corpus), which is later used as the starting point for training the classification models.

In [None]:
# sentences = LineSentence(path.join('data', 'Corpus_Wiki.txt'))
# base = Word2Vec(sentences, workers=4)

Save model to disk. Don't finalize the model because we need to train it with new data later!

In [None]:
# base.save(path.join('models', 'base.w2v'))
# base = None

## Training the Classifier

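Train model for Die Linke.
Finalizing the model to save RAM is currently disabled (the `init_sims(replace=True)` call is commented out because it doesn't work for now).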

In [None]:
# Build the paths of the three Die Linke corpus files.
file1, file2, file3 = (
    path.join('data', name)
    for name in (
        'Corpus_Linke.txt',
        'Corpus_Linke_PR.txt',
        'Corpus_Linke_Fraktion.txt',
    )
)
# Materialize the sentences as a list: len() is needed for training below.
corpus = list(
    FileCorpus(file1, file2, file3).sentences_token()
)

# Load the saved base model and continue training it on this corpus.
model_dir = path.join('models', 'word2vec')
linke = Word2Vec.load(path.join(model_dir, 'base.w2v'))
linke.train(corpus, total_examples=len(corpus))
# Finalizing is disabled: linke.init_sims(replace=True) doesn't work for now!
linke.save(path.join(model_dir, 'Linke.w2v'))
# Rebind to None so the notebook namespace no longer references the model.
linke = None

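Train model for SPD.
Finalizing the model to save RAM is currently disabled (the `init_sims(replace=True)` call is commented out because it doesn't work for now).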

In [None]:
# Path of the single SPD corpus file.
file = path.join('data', 'Corpus_SPD_Fraktion.txt')
# Materialize the sentences as a list: len() is needed for training below.
corpus = list(
    FileCorpus(file).sentences_token()
)

# Load the saved base model and continue training it on this corpus.
model_dir = path.join('models', 'word2vec')
spd = Word2Vec.load(path.join(model_dir, 'base.w2v'))
spd.train(corpus, total_examples=len(corpus))
# Finalizing is disabled: spd.init_sims(replace=True) doesn't work for now!
spd.save(path.join(model_dir, 'SPD.w2v'))
# Rebind to None so the notebook namespace no longer references the model.
spd = None

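Train model for Die Grünen.
Finalizing the model to save RAM is currently disabled (the `init_sims(replace=True)` call is commented out because it doesn't work for now).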

In [None]:
# Path of the single Die Grünen corpus file.
file = path.join('data', 'Corpus_Grüne_Fraktion.txt')
# Materialize the sentences as a list: len() is needed for training below.
corpus = list(
    FileCorpus(file).sentences_token()
)

# Load the saved base model and continue training it on this corpus.
model_dir = path.join('models', 'word2vec')
gruene = Word2Vec.load(path.join(model_dir, 'base.w2v'))
gruene.train(corpus, total_examples=len(corpus))
# Finalizing is disabled: gruene.init_sims(replace=True) doesn't work for now!
gruene.save(path.join(model_dir, 'Grüne.w2v'))
# Rebind to None so the notebook namespace no longer references the model.
gruene = None

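Train model for FDP. Finalizing the model to save RAM is currently disabled (the `init_sims(replace=True)` call is commented out because it doesn't work for now).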

In [None]:
# Build the paths of the two FDP corpus files.
file1, file2 = (
    path.join('data', name)
    for name in ('Corpus_FDP.txt', 'Corpus_FDP_Fraktion.txt')
)
# Materialize the sentences as a list: len() is needed for training below.
corpus = list(
    FileCorpus(file1, file2).sentences_token()
)

# Load the saved base model and continue training it on this corpus.
model_dir = path.join('models', 'word2vec')
fdp = Word2Vec.load(path.join(model_dir, 'base.w2v'))
fdp.train(corpus, total_examples=len(corpus))
# Finalizing is disabled: fdp.init_sims(replace=True) doesn't work for now!
fdp.save(path.join(model_dir, 'FDP.w2v'))
# Rebind to None so the notebook namespace no longer references the model.
fdp = None

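Train model for CDU. Finalizing the model to save RAM is currently disabled (the `init_sims(replace=True)` call is commented out because it doesn't work for now).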

In [None]:
# Path of the single CDU corpus file.
file = path.join('data', 'Corpus_CDU_Fraktion.txt')
# Materialize the sentences as a list: len() is needed for training below.
corpus = list(
    FileCorpus(file).sentences_token()
)

# Load the saved base model and continue training it on this corpus.
model_dir = path.join('models', 'word2vec')
cdu = Word2Vec.load(path.join(model_dir, 'base.w2v'))
cdu.train(corpus, total_examples=len(corpus))
# Finalizing is disabled: cdu.init_sims(replace=True) doesn't work for now!
cdu.save(path.join(model_dir, 'CDU.w2v'))
# Rebind to None so the notebook namespace no longer references the model.
cdu = None

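Train model for NPD. Finalizing the model to save RAM is currently disabled (the `init_sims(replace=True)` call is commented out because it doesn't work for now).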

In [None]:
# Build the paths of the two NPD corpus files.
file1, file2 = (
    path.join('data', name)
    for name in ('Corpus_NPD_MV.txt', 'Corpus_NPD_Jung.txt')
)
# Materialize the sentences as a list: len() is needed for training below.
corpus = list(
    FileCorpus(file1, file2).sentences_token()
)

# Load the saved base model and continue training it on this corpus.
model_dir = path.join('models', 'word2vec')
npd = Word2Vec.load(path.join(model_dir, 'base.w2v'))
npd.train(corpus, total_examples=len(corpus))
# Finalizing is disabled: npd.init_sims(replace=True) doesn't work for now!
npd.save(path.join(model_dir, 'NPD.w2v'))
# Rebind to None so the notebook namespace no longer references the model.
npd = None

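Train model for AFD. Finalizing the model to save RAM is currently disabled (the `init_sims(replace=True)` call is commented out because it doesn't work for now).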

In [None]:
# Path of the single AFD corpus file.
file = path.join('data', 'Corpus_AFD_PR.txt')
# Materialize the sentences as a list: len() is needed for training below.
corpus = list(
    FileCorpus(file).sentences_token()
)

# Load the saved base model and continue training it on this corpus.
model_dir = path.join('models', 'word2vec')
afd = Word2Vec.load(path.join(model_dir, 'base.w2v'))
afd.train(corpus, total_examples=len(corpus))
# Finalizing is disabled: afd.init_sims(replace=True) doesn't work for now!
afd.save(path.join(model_dir, 'AFD.w2v'))
# Rebind to None so the notebook namespace no longer references the model.
afd = None

## Classification

Load models and documents into memory.

In [1]:
from os import path
from corputil import FileCorpus
from gensim.models.word2vec import Word2Vec

In [2]:
def _load_party_model(filename):
    """Load one saved Word2Vec model from the models/word2vec directory."""
    return Word2Vec.load(path.join('models', 'word2vec', filename))


# Load the per-party models in the same order they are listed in `models`.
SPD = _load_party_model('SPD.w2v')
Linke = _load_party_model('Linke.w2v')
Gruene = _load_party_model('Grüne.w2v')
FDP = _load_party_model('FDP.w2v')
CDU = _load_party_model('CDU.w2v')
# The NPD and AFD models are currently not loaded:
# NPD = Word2Vec.load(path.join('models', 'word2vec', 'NPD.w2v'))
# AFD = Word2Vec.load(path.join('models', 'word2vec', 'AFD.w2v'))

# The position of a model in this list is its index in the classification output.
models = [SPD, Linke, Gruene, FDP, CDU]

In [10]:
import pandas as pd
import numpy as np

def calc_probability(docs, mods):
    """Estimate the share of each model over a collection of documents.

    Every sentence is scored by every model. The scores are treated as
    log-likelihoods and converted into per-sentence probabilities across
    the models. These probabilities are averaged per document, summed over
    all documents and finally normalized so that they sum to 1.

    Args:
        docs: Iterable of documents, each an iterable of sentences. Both
            levels are consumed exactly once, so one-shot iterables such as
            generators are accepted. Documents without sentences do not
            contribute to the result.
        mods: Iterable of models exposing ``score(sentences, total_sentences)``.
            The result is expected to hold one log-likelihood per sentence
            (presumably what gensim's ``Word2Vec.score`` returns -- confirm
            against the installed gensim version).

    Returns:
        A ``pandas.Series`` with one entry per model, labelled by the
        model's position in ``mods``.
    """
    # Flatten in a single pass and remember which document each sentence
    # came from; iterating ``docs`` twice would break for generators.
    sentences = []
    doc_ids = []
    for doc_id, doc in enumerate(docs):
        for sentence in doc:
            sentences.append(sentence)
            doc_ids.append(doc_id)

    # Rows are models, columns are sentences.
    log_likelihood = np.array(
        [model.score(sentences, len(sentences)) for model in mods]
    )
    # Subtract the per-sentence maximum before exponentiating so the
    # largest term in every column becomes exp(0) == 1.
    likelihood = np.exp(log_likelihood - log_likelihood.max(axis=0))
    per_sentence = likelihood / likelihood.sum(axis=0)

    # One row per sentence, one column per model.
    prob = pd.DataFrame(per_sentence.transpose())
    prob["doc"] = doc_ids
    per_doc = prob.groupby("doc").mean()
    totals = per_doc.sum()
    return totals / totals.sum()

# site = 'Spiegel'
# labels = ['KW44', 'KW45', 'KW46', 'KW47', 'KW48', 'KW49']
# files = [path.join('data', 'Corpus_{}_{}.txt'.format(label, site)) for label in labels]
# for file in files:
#     corpus = list(FileCorpus(file).doc_sentences_token())
#     print(calc_probability(corpus, models))

# Read the test documents; each document is a collection of sentences.
file = path.join('data', 'Test.txt')
corpus = list(
    FileCorpus(file).doc_sentences_token()
)
# Print the normalized share of every loaded model for these documents.
party_shares = calc_probability(corpus, models)
print(party_shares)

0    0.384260
1    0.202781
2    0.230964
3    0.167671
4    0.014324
dtype: float32
