# Calculate Political Opinion Models

In [1]:
import logging
from os import path

## Training the Base Model

Train the base model on the German Wikipedia corpus. It is later used as the starting point for training the per-party classification models.

In [None]:
from gensim.models.word2vec import LineSentence, Word2Vec

# Sentence source backed by the German Wikipedia corpus file.
sentences = LineSentence(path.join('data', 'Corpus_Wiki.txt'))

# hs=1 / negative=0 (hierarchical softmax, no negative sampling) are set
# explicitly because the classification step (`docprob`) relies on
# `Word2Vec.score()`, which gensim only implements for that configuration;
# a model trained with the library defaults cannot be scored.
base = Word2Vec(sentences=sentences, workers=4, hs=1, negative=0)

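Alternatively, create an empty (untrained) base model to use as the starting point for training the classification models. Run either this cell or the previous one: both assign `base`, so whichever runs last wins.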

In [None]:
from gensim.models.word2vec import LineSentence, Word2Vec

# Untrained model (no corpus is passed). hs=1 / negative=0 are set explicitly
# because the classification step (`docprob`) relies on `Word2Vec.score()`,
# which gensim only implements for hierarchical softmax without negative
# sampling.
# NOTE(review): no vocabulary is built here; confirm that the party corpora
# are able to train on top of a model created this way.
base = Word2Vec(hs=1, negative=0)

Save model to disk.

In [None]:
# Persist whichever `base` model is currently defined as 'base.w2v' inside the
# 'models' directory; the party training cells below load it from this path.
base.save(path.join('models', 'base.w2v'))

## Training the Classifier

Train model for Die Linke.

In [None]:
from gensim.models.word2vec import Word2Vec
from corputil import FileCorpus
from corputil.modifiers import sentence_to_words

# Read the whole Die Linke corpus into a list so that its length can be
# handed to `train` as `total_examples`.
file = path.join('data', 'Corpus_DieLinke_Fraktion.txt')
linke_sentences = FileCorpus(file, modifier=sentence_to_words)
corpus = list(linke_sentences)
linke_sentence_count = len(corpus)

base_model_file = path.join('models', 'base.w2v')
linke_model_file = path.join('models', 'Linke.w2v')

# Start from the saved base model, continue training on the party corpus,
# and store the result under its own file name. After this point `base`
# holds the party-specific model, not the original base model.
base = Word2Vec.load(base_model_file)
base.train(corpus, total_examples=linke_sentence_count)
base.save(linke_model_file)

Train model for SPD.

In [None]:
from gensim.models.word2vec import Word2Vec
from corputil import FileCorpus
from corputil.modifiers import sentence_to_words

# Read the whole SPD corpus into a list so that its length can be handed to
# `train` as `total_examples`.
file = path.join('data', 'Corpus_SPD_Fraktion.txt')
spd_sentences = FileCorpus(file, modifier=sentence_to_words)
corpus = list(spd_sentences)
spd_sentence_count = len(corpus)

base_model_file = path.join('models', 'base.w2v')
spd_model_file = path.join('models', 'SPD.w2v')

# Start from the saved base model, continue training on the party corpus,
# and store the result under its own file name. After this point `base`
# holds the party-specific model, not the original base model.
base = Word2Vec.load(base_model_file)
base.train(corpus, total_examples=spd_sentence_count)
base.save(spd_model_file)

Train model for Die Grünen.

In [None]:
from gensim.models.word2vec import Word2Vec
from corputil import FileCorpus
from corputil.modifiers import sentence_to_words

# Read the whole Die Grünen corpus into a list so that its length can be
# handed to `train` as `total_examples`.
file = path.join('data', 'Corpus_Grüne_Fraktion.txt')
gruene_sentences = FileCorpus(file, modifier=sentence_to_words)
corpus = list(gruene_sentences)
gruene_sentence_count = len(corpus)

base_model_file = path.join('models', 'base.w2v')
gruene_model_file = path.join('models', 'Grüne.w2v')

# Start from the saved base model, continue training on the party corpus,
# and store the result under its own file name. After this point `base`
# holds the party-specific model, not the original base model.
base = Word2Vec.load(base_model_file)
base.train(corpus, total_examples=gruene_sentence_count)
base.save(gruene_model_file)

## Classification

In [None]:
import numpy as np
import pandas as pd


def docprob(docs, mods):
    """Return, for every document, its probability under each candidate model.

    Every sentence of every document is scored by every model. The
    per-sentence log-likelihoods are converted into probabilities that are
    normalised across the models, and the sentence probabilities are then
    averaged per document. Everything is done in memory.

    Parameters
    ----------
    docs : iterable
        Documents, each of which is an iterable of sentences. One-shot
        iterables (generators) are accepted; they are materialised first.
    mods : sequence
        Candidate models, one per class. Each must provide
        ``score(sentences, total_sentences)`` returning one log-likelihood
        per sentence. (For gensim ``Word2Vec`` models the library documents
        that ``score`` requires training with ``hs=1, negative=0``.)

    Returns
    -------
    pandas.DataFrame
        One row per document, indexed by the document's position in ``docs``
        (index name ``"doc"``), and one column per model, where column ``i``
        belongs to ``mods[i]``. A document without sentences contributes no
        row.
    """
    # Both the sentence list and the document index below walk over `docs`,
    # so materialise it once; otherwise a generator would be exhausted after
    # the first pass.
    docs = [list(d) for d in docs]

    # score() takes a list of sentences here; flatten all documents into one.
    sentlist = [s for d in docs for s in d]

    # Log-likelihood of each sentence under each model:
    # shape (number of models, number of sentences).
    llhd = np.array([m.score(sentlist, len(sentlist)) for m in mods])

    # Exponentiate to get likelihoods. The per-sentence maximum over the
    # models is subtracted first to avoid numeric overflow; it cancels out in
    # the normalisation below.
    lhd = np.exp(llhd - llhd.max(axis=0))

    # Normalise across models to get per-sentence class probabilities, then
    # transpose so that rows are sentences and columns are models.
    prob = pd.DataFrame((lhd / lhd.sum(axis=0)).transpose())

    # Tag every sentence with the index of its document and average the
    # sentence probabilities to obtain the document probabilities.
    prob["doc"] = [i for i, d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob