# Calculate Political Opinion Models

In [None]:
import logging
from os import path

## Training the Base Model

Calculate the base model (from the German wiki corpus), which is later used as the basis for training the classification models.

In [None]:
from gensim.models.word2vec import LineSentence, Word2Vec

# Iterate over the wiki corpus file through gensim's LineSentence wrapper.
sentences = LineSentence(path.join('data', 'Corpus_Wiki.txt'))
# The Classification section calls Word2Vec.score() on models derived from this
# one. score() is only implemented for hierarchical softmax, so request
# hs=1 / negative=0 explicitly instead of relying on version-dependent defaults.
base = Word2Vec(sentences, workers=4, hs=1, negative=0)

Save model to disk. Don't finalize the model because we need to train it with new data later!

In [None]:
# Persist the (un-finalized) base model; the party models below reload this
# file and continue training from it.
base_model_file = path.join('models', 'base.w2v')
base.save(base_model_file)

## Training the Classifier

Train model for Die Linke.
Finalizing the model (to save RAM) is currently disabled, because `init_sims(replace=True)` doesn't work for now.

In [None]:
from gensim.models.word2vec import Word2Vec
from corputil import FileCorpus

# Materialise the party corpus as a list: its length is passed to train() below.
file = path.join('data', 'Corpus_Linke_Fraktion.txt')
corpus = list(FileCorpus(file).sentences_token())

# Start from the saved base model and continue training on the party corpus.
linke = Word2Vec.load(path.join('models', 'base.w2v'))
linke.train(corpus, total_examples=len(corpus))
# NOTE: finalizing via init_sims(replace=True) stays disabled - it was marked
# as "doesn't work for now". TODO: confirm whether it is compatible with score().
# Persist the trained model: the Classification section loads models/Linke.w2v.
linke.save(path.join('models', 'Linke.w2v'))

Train model for SPD.
Finalizing the model (to save RAM) is currently disabled, because `init_sims(replace=True)` doesn't work for now.

In [None]:
from gensim.models.word2vec import Word2Vec
from corputil import FileCorpus

# Materialise the party corpus as a list: its length is passed to train() below.
file = path.join('data', 'Corpus_SPD_Fraktion.txt')
corpus = list(FileCorpus(file).sentences_token())

# Start from the saved base model and continue training on the party corpus.
spd = Word2Vec.load(path.join('models', 'base.w2v'))
spd.train(corpus, total_examples=len(corpus))
# NOTE: finalizing via init_sims(replace=True) stays disabled - it was marked
# as "doesn't work for now". TODO: confirm whether it is compatible with score().
# Persist the trained model: the Classification section loads models/SPD.w2v.
spd.save(path.join('models', 'SPD.w2v'))

Train model for Die Grünen.
Finalizing the model (to save RAM) is currently disabled, because `init_sims(replace=True)` doesn't work for now.

In [None]:
from gensim.models.word2vec import Word2Vec
from corputil import FileCorpus

# Materialise the party corpus as a list: its length is passed to train() below.
file = path.join('data', 'Corpus_Grüne_Fraktion.txt')
corpus = list(FileCorpus(file).sentences_token())

# Start from the saved base model and continue training on the party corpus.
gruene = Word2Vec.load(path.join('models', 'base.w2v'))
gruene.train(corpus, total_examples=len(corpus))
# NOTE: finalizing via init_sims(replace=True) stays disabled - it was marked
# as "doesn't work for now". TODO: confirm whether it is compatible with score().
# Persist the trained model: the Classification section loads models/Grüne.w2v.
gruene.save(path.join('models', 'Grüne.w2v'))

## Classification

Load models and documents into memory.

In [1]:
from os import path
from corputil import FileCorpus
from gensim.models.word2vec import Word2Vec

In [2]:
def _load_model(filename):
    """Load a saved Word2Vec model from the 'models' directory."""
    return Word2Vec.load(path.join('models', filename))


# One model per candidate class; the names are used by the classification cell.
base = _load_model('base.w2v')
SPD = _load_model('SPD.w2v')
Linke = _load_model('Linke.w2v')
Grüne = _load_model('Grüne.w2v')

In [3]:
import pandas as pd
import numpy as np

"""
docprob takes two arguments
* docs: a list of documents, each of which is a list of sentences
* mods: the candidate word2vec models (one per potential class)

It returns a DataFrame of class probabilities (one row per document, one column per model).  Everything is done in-memory.
"""

def docprob(docs, mods):
    """Return per-document class probabilities under several candidate models.

    Every sentence of every document is scored by each model. The
    per-sentence log-likelihoods are converted into probabilities across the
    models, and those probabilities are averaged over each document's
    sentences.

    Parameters
    ----------
    docs : iterable
        Documents, each an iterable of sentences. It is consumed exactly
        once, so a one-shot generator works as well as a list.
    mods : iterable
        Candidate models, one per class. Each must provide
        ``score(sentences, total_sentences)`` returning one log-likelihood
        per sentence.

    Returns
    -------
    pandas.DataFrame
        One row per document that contains at least one sentence (index
        named ``"doc"``, holding the document's position in ``docs``) and one
        column per model, labelled ``0..len(mods)-1`` in the order of ``mods``.
    """
    # Flatten the documents in a single pass, remembering which document each
    # sentence came from. Doing both in one loop avoids iterating `docs` twice,
    # which would silently exhaust a generator.
    sentlist = []
    doc_ids = []
    for i, d in enumerate(docs):
        for s in d:
            sentlist.append(s)
            doc_ids.append(i)
    # Log-likelihood of every sentence under every model: shape (models, sentences).
    llhd = np.array([m.score(sentlist, len(sentlist)) for m in mods])
    # Exponentiate after subtracting each sentence's maximum over the models,
    # so the largest exponent is 0 and np.exp cannot overflow.
    lhd = np.exp(llhd - llhd.max(axis=0))
    # Normalise across models to get per-sentence class probabilities, then
    # transpose to one row per sentence.
    prob = pd.DataFrame((lhd / lhd.sum(axis=0)).transpose())
    # Average the sentence probabilities within each document.
    prob["doc"] = doc_ids
    return prob.groupby("doc").mean()

# Read the test documents and materialise them in memory for docprob.
test_file = path.join('data', 'Test.txt')
corpus = list(FileCorpus(test_file).doc_sentences_token())

# The result's column order follows this list: 0=base, 1=SPD, 2=Linke, 3=Grüne.
candidate_models = [base, SPD, Linke, Grüne]
probs = docprob(corpus, candidate_models)
print(probs)

            0             1         2         3
doc                                            
0    0.494838  5.912009e-02  0.412838  0.033204
1    0.476397  9.720161e-02  0.422417  0.003985
2    0.999931  9.197360e-21  0.000057  0.000012
3    0.001713  5.634401e-05  0.663362  0.334869
4    0.000002  2.614768e-02  0.668174  0.305677
