# Calculate Political Opinion Models

In [None]:
import logging
from os import path

## Training the Base Model

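Build the (still untrained) base model: only its vocabulary is computed here, over all corpora combined. It is later copied and used as the starting point for training the per-party classification models.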

In [None]:
from corputil import FileCorpus
from gensim.models.word2vec import Word2Vec

# Corpus files, one per party/source. The individual names are kept because
# the training cell further down refers to them.
file1 = path.join('data', 'Corpus_SPD_Fraktion.txt')
file2 = path.join('data', 'Corpus_Linke_Fraktion.txt')
file3 = path.join('data', 'Corpus_Grüne_Fraktion.txt')
file4 = path.join('data', 'Corpus_FDP_Fraktion.txt')
file5 = path.join('data', 'Corpus_CDU_Fraktion.txt')
file6 = path.join('data', 'Corpus_NPD_Jung.txt')
file7 = path.join('data', 'Corpus_AFD_PR.txt')

# The vocabulary is built over the sentences of all corpora combined, so every
# model copied from `base` shares the same vocabulary.
base_corpus = list(FileCorpus(file1, file2, file3, file4, file5, file6, file7).sentences_token())
print(len(base_corpus))

# hs=1 / negative=0 are set explicitly instead of relying on library defaults:
# gensim implements Word2Vec.score() only for the hierarchical softmax scheme,
# and the classification step calls score() on copies of this model.
# No sentences are passed to the constructor, so nothing is trained here;
# build_vocab() only prepares the vocabulary.
base = Word2Vec(workers=4, iter=4, hs=1, negative=0)
base.build_vocab(base_corpus)

Save the model to disk. Don't finalize the model, because we still need to train it with new data later!

In [None]:
# base.save(path.join('models', 'base.w2v'))

## Training

In [None]:
from copy import deepcopy

# One corpus per party, in this order: SPD, Linke, Gruene, FDP, CDU, NPD, AFD.
party_files = (file1, file2, file3, file4, file5, file6, file7)
corpora = [FileCorpus(party_file) for party_file in party_files]

# Every party gets its own independent copy of the base model, so training one
# model does not affect the others.
models = [deepcopy(base) for _ in corpora]

# models[k] is trained exclusively on the sentences of corpora[k].
for model, corpus in zip(models, corpora):
    sentences = list(corpus.sentences_token())
    model.train(sentences, total_examples=len(sentences))
    print('Loaded corpus with {} sentences.'.format(len(sentences)))

## Classification

In [None]:
import pandas as pd
import numpy as np

def calc_probability(docs, mods):
    """Return the mean per-sentence model probabilities for every document.

    Args:
        docs: Sequence of documents, each a sequence of sentences. It is
            traversed twice, so it must be re-iterable (e.g. a list).
        mods: Sequence of models exposing ``score(sentences, total)``, which
            is expected to return one log-likelihood per sentence.

    Returns:
        A ``pandas.DataFrame`` indexed by document position (index name
        ``"doc"``) with one column per model (labelled by its position in
        ``mods``). Each cell is the average, over the document's sentences,
        of the model's normalised likelihood for that sentence.
    """
    # Scoring happens per sentence, so flatten the documents first.
    flat = [sentence for doc in docs for sentence in doc]
    n_sentences = len(flat)

    # Rows: models, columns: sentences.
    log_likelihood = np.array([model.score(flat, n_sentences) for model in mods])

    # Shift by the per-sentence maximum before exponentiating; the shift
    # cancels out in the normalisation below and keeps exp() from underflowing.
    likelihood = np.exp(log_likelihood - log_likelihood.max(axis=0))
    normalised = likelihood / likelihood.sum(axis=0)

    # Rows: sentences, columns: models.
    table = pd.DataFrame(normalised.transpose())

    # Label every sentence row with the position of the document it came from,
    # then average the sentence probabilities within each document.
    table["doc"] = [doc_id for doc_id, doc in enumerate(docs) for _ in doc]
    return table.groupby("doc").mean()

# Load the corpus to classify as a list of documents (each a list of tokenised
# sentences) and score it against every trained model.
test_path = path.join('data', 'Corpus_KW44.txt')
test_corpus = list(FileCorpus(test_path).doc_sentences_token())
prob = calc_probability(test_corpus, models)

In [None]:
# Column-wise total of the per-document probabilities, i.e. one value per model.
# axis=0 is spelled out because reducing a DataFrame through np.sum() relies on
# the axis=None behaviour, which newer pandas releases deprecate.
prob.sum(axis=0)