# Calculate Political Opinion Models

In [1]:
import logging
# Timestamped log format. basicConfig() only installs a handler when the root
# logger has none yet, so an already-configured environment keeps its own format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# Raise verbosity through the public setLevel() API rather than assigning
# root.level directly, so the logging module's internal bookkeeping stays in sync.
logging.getLogger().setLevel(logging.INFO)

from os import path
from corputil import FileCorpus
from gensim.models.word2vec import LineSentence, Word2Vec

INFO:gensim.utils:detected Windows; aliasing chunkize to chunkize_serial
INFO:gensim.corpora.sharded_corpus:Could not import Theano, will use standard float for default ShardedCorpus dtype.
INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


## Training the Base Model

Train the base model on the German wiki corpus. It is later used as the starting point for training the per-party classification models.

In [None]:
# Stream the wiki corpus from disk instead of loading it into memory at once.
sentences = LineSentence(path.join('data', 'Archive', 'Corpus_Wiki.txt'))
# hs=1 / negative=0 are pinned explicitly: the classification step calls
# Word2Vec.score(), which gensim only implements for hierarchical softmax.
# Relying on library defaults breaks on releases that default to negative sampling.
base = Word2Vec(sentences, workers=4, size=300, window=10, min_count=5,
                hs=1, negative=0)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 278404 words, keeping 55121 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 514407 words, keeping 84865 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 743481 words, keeping 106543 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 969758 words, keeping 125020 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 1201238 words, keeping 142250 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 1432102 words, keeping 156874 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 1662339 words, keeping 172236 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 1905723 words, 

Save model to disk. Don't finalize the model because we need to train it with new data later!

In [None]:
# Persist the base model; the party cells below load their starting point from this file.
base_model_path = path.join('models', 'word2vec', 'base.w2v')
base.save(base_model_path)
# Rebind the name so this notebook no longer holds a reference to the model.
base = None

## Training the Classifier

Train model for Die Linke.
Finalizing the model to save RAM (`init_sims(replace=True)`) is currently disabled because it doesn't work yet.

In [None]:
# Continue training a copy of the base model on the Die Linke corpora.
politics_dir = path.join('data', 'Politics')
model_dir = path.join('models', 'word2vec')

file1 = path.join(politics_dir, 'Corpus_Linke.txt')
file2 = path.join(politics_dir, 'Corpus_Linke_PR.txt')
file3 = path.join(politics_dir, 'Corpus_Linke_Fraktion.txt')
reader = FileCorpus(file1, file2, file3)
# Materialise the sentences as a list so their count can be passed to train().
corpus = list(reader.sentences_token(stopwords='german'))

linke = Word2Vec.load(path.join(model_dir, 'base.w2v'))
linke.train(corpus, total_examples=len(corpus))
# Finalising via linke.init_sims(replace=True) is disabled: it doesn't work for now.
linke.save(path.join(model_dir, 'Linke.w2v'))
# Rebind the name so this notebook no longer holds a reference to the model.
linke = None

Train model for SPD.
Finalizing the model to save RAM (`init_sims(replace=True)`) is currently disabled because it doesn't work yet.

In [None]:
# Continue training a copy of the base model on the SPD corpus.
politics_dir = path.join('data', 'Politics')
model_dir = path.join('models', 'word2vec')

file = path.join(politics_dir, 'Corpus_SPD_Fraktion.txt')
reader = FileCorpus(file)
# Materialise the sentences as a list so their count can be passed to train().
corpus = list(reader.sentences_token(stopwords='german'))

spd = Word2Vec.load(path.join(model_dir, 'base.w2v'))
spd.train(corpus, total_examples=len(corpus))
# Finalising via spd.init_sims(replace=True) is disabled: it doesn't work for now.
spd.save(path.join(model_dir, 'SPD.w2v'))
# Rebind the name so this notebook no longer holds a reference to the model.
spd = None

Train model for Die Grünen.
Finalizing the model to save RAM (`init_sims(replace=True)`) is currently disabled because it doesn't work yet.

In [None]:
# Continue training a copy of the base model on the Die Grünen corpus.
politics_dir = path.join('data', 'Politics')
model_dir = path.join('models', 'word2vec')

file = path.join(politics_dir, 'Corpus_Grüne_Fraktion.txt')
reader = FileCorpus(file)
# Materialise the sentences as a list so their count can be passed to train().
corpus = list(reader.sentences_token(stopwords='german'))

gruene = Word2Vec.load(path.join(model_dir, 'base.w2v'))
gruene.train(corpus, total_examples=len(corpus))
# Finalising via gruene.init_sims(replace=True) is disabled: it doesn't work for now.
gruene.save(path.join(model_dir, 'Grüne.w2v'))
# Rebind the name so this notebook no longer holds a reference to the model.
gruene = None

Train model for FDP. Finalizing the model to save RAM (`init_sims(replace=True)`) is currently disabled because it doesn't work yet.

In [None]:
# Continue training a copy of the base model on the FDP corpora.
politics_dir = path.join('data', 'Politics')
model_dir = path.join('models', 'word2vec')

file1 = path.join(politics_dir, 'Corpus_FDP.txt')
file2 = path.join(politics_dir, 'Corpus_FDP_Fraktion.txt')
reader = FileCorpus(file1, file2)
# Materialise the sentences as a list so their count can be passed to train().
corpus = list(reader.sentences_token(stopwords='german'))

fdp = Word2Vec.load(path.join(model_dir, 'base.w2v'))
fdp.train(corpus, total_examples=len(corpus))
# Finalising via fdp.init_sims(replace=True) is disabled: it doesn't work for now.
fdp.save(path.join(model_dir, 'FDP.w2v'))
# Rebind the name so this notebook no longer holds a reference to the model.
fdp = None

Train model for CDU. Finalizing the model to save RAM (`init_sims(replace=True)`) is currently disabled because it doesn't work yet.

In [None]:
# Continue training a copy of the base model on the CDU corpus.
politics_dir = path.join('data', 'Politics')
model_dir = path.join('models', 'word2vec')

file = path.join(politics_dir, 'Corpus_CDU_Fraktion.txt')
reader = FileCorpus(file)
# Materialise the sentences as a list so their count can be passed to train().
corpus = list(reader.sentences_token(stopwords='german'))

cdu = Word2Vec.load(path.join(model_dir, 'base.w2v'))
cdu.train(corpus, total_examples=len(corpus))
# Finalising via cdu.init_sims(replace=True) is disabled: it doesn't work for now.
cdu.save(path.join(model_dir, 'CDU.w2v'))
# Rebind the name so this notebook no longer holds a reference to the model.
cdu = None

Train model for NPD. Finalizing the model to save RAM (`init_sims(replace=True)`) is currently disabled because it doesn't work yet.

In [None]:
# Continue training a copy of the base model on the NPD corpora.
politics_dir = path.join('data', 'Politics')
model_dir = path.join('models', 'word2vec')

file1 = path.join(politics_dir, 'Corpus_NPD_MV.txt')
file2 = path.join(politics_dir, 'Corpus_NPD_Sachsen.txt')
file3 = path.join(politics_dir, 'Corpus_NPD_Jung.txt')
reader = FileCorpus(file1, file2, file3)
# Materialise the sentences as a list so their count can be passed to train().
corpus = list(reader.sentences_token(stopwords='german'))

npd = Word2Vec.load(path.join(model_dir, 'base.w2v'))
npd.train(corpus, total_examples=len(corpus))
# Finalising via npd.init_sims(replace=True) is disabled: it doesn't work for now.
npd.save(path.join(model_dir, 'NPD.w2v'))
# Rebind the name so this notebook no longer holds a reference to the model.
npd = None

## Classification

Load models and documents into memory.

In [None]:
from os import path
from corputil import FileCorpus
from gensim.models.word2vec import Word2Vec

In [None]:
# Load the six party models from disk; the file order fixes the order of `models`.
model_dir = path.join('models', 'word2vec')
model_files = ['SPD.w2v', 'Linke.w2v', 'Grüne.w2v', 'FDP.w2v', 'CDU.w2v', 'NPD.w2v']
models = [Word2Vec.load(path.join(model_dir, name)) for name in model_files]

# Keep each model reachable under its party name as well.
SPD, Linke, Gruene, FDP, CDU, NPD = models

In [None]:
import pandas as pd
import numpy as np

def calc_probability(docs, mods):
    """Estimate how strongly each model in ``mods`` explains the given documents.

    Every sentence is scored by every model. Per sentence, the scores are turned
    into a distribution over the models (exponentiated and normalised, with no
    prior weighting). These per-sentence distributions are averaged within each
    document, the document averages are summed, and the total is rescaled to sum to 1.

    Parameters
    ----------
    docs : iterable of iterables of sentences
        Documents, each an iterable of sentences in the form ``m.score`` accepts.
        Both levels are traversed exactly once, so one-shot iterators work too.
    mods : sequence of models
        Objects exposing ``score(sentences, total_sentences)`` that returns one
        score per sentence.

    Returns
    -------
    pandas.Series
        One value per model, indexed by the model's position in ``mods``.
    """
    # Flatten in a single pass while recording each sentence's document, so the
    # two lists cannot get out of sync and iterators are not consumed twice.
    sentlist = []
    doc_ids = []
    for doc_id, doc in enumerate(docs):
        for sentence in doc:
            sentlist.append(sentence)
            doc_ids.append(doc_id)

    # Rows are models, columns are sentences.
    llhd = np.array([m.score(sentlist, len(sentlist)) for m in mods])
    # Subtract the per-sentence maximum before exponentiating to avoid underflow.
    lhd = np.exp(llhd - llhd.max(axis=0))
    prob = pd.DataFrame((lhd / lhd.sum(axis=0)).transpose())
    prob["doc"] = doc_ids
    # Average within each document first, then add up the document averages.
    prob = prob.groupby("doc").mean().sum()
    return prob / prob.sum()

# Score the documents in data/Test.txt against all loaded models.
file = path.join('data', 'Test.txt')
test_reader = FileCorpus(file)
corpus = list(test_reader.doc_sentences_token())
party_probabilities = calc_probability(corpus, models)
print(party_probabilities)