In [None]:
import logging
# Emit log records as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# Go through setLevel() instead of assigning `root.level` directly: the setter
# validates the level and resets the cached isEnabledFor() decisions of loggers
# that already exist, so the change also takes effect when this cell is re-run
# in a live kernel.
logging.getLogger().setLevel(logging.INFO)

from os import path
import numpy as np
import pandas as pd
from random import shuffle
from corputil import FileCorpus
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
def label_data(files, label):
    """Wrap every sentence found in `files` in a uniquely tagged LabeledSentence.

    The sentences of all documents yielded by
    ``FileCorpus(files).doc_sentences_token()`` are visited in order and
    numbered consecutively across documents, starting at 0. Sentence number
    ``i`` receives the single tag ``'<label>_<i>'``.

    Parameters
    ----------
    files
        Passed unchanged to ``FileCorpus``.
    label
        Prefix used to build the per-sentence tag.

    Returns
    -------
    tuple
        ``(sentences, count)`` where ``sentences`` is the list of
        LabeledSentence objects and ``count`` is its length.
    """
    labeled = []
    for doc in FileCorpus(files).doc_sentences_token():
        for sentence in doc:
            # The current list length is the running index of this sentence.
            tag = '{}_{}'.format(label, len(labeled))
            labeled.append(LabeledSentence(sentence, [tag]))
    return labeled, len(labeled)

In [None]:
def _politics(*names):
    """Return the paths of the given file names below data/Politics."""
    return [path.join('data', 'Politics', name) for name in names]


# One list of source files per party.
spd = _politics(
    'SPD_EU.txt',
    'SPD_Fraktion.txt',
    'SPD_Vorwärts_Inland.txt',
    'SPD_Vorwärts_International.txt',
    'SPD_Vorwärts_Parteileben.txt',
)
linke = _politics('Linke.txt', 'Linke_PR.txt', 'Linke_Fraktion.txt')
gruene = _politics('Grüne.txt', 'Grüne_Fraktion.txt')
fdp = _politics('FDP.txt', 'FDP_Fraktion.txt')
cdu = _politics('CDU.txt', 'CDU_Fraktion.txt')
npd = _politics('NPD_MV.txt', 'NPD_Sachsen.txt', 'NPD_Jung.txt')

# `files` and `tags` are parallel: tags[k] is the label for files[k].
files = [spd, linke, gruene, fdp, cdu, npd]
tags = ['SPD', 'LINKE', 'GRUENE', 'FDP', 'CDU', 'NPD']

In [None]:
# Gather the labeled sentences of all parties into one list and remember how
# many each party contributed; the counts are needed later to rebuild the
# per-sentence tags.
sentences = []
counts = {}
for party_files, party_tag in zip(files, tags):
    party_sentences, party_count = label_data(party_files, party_tag)
    sentences.extend(party_sentences)
    counts[party_tag] = party_count

In [None]:
# NOTE: `size` has to stay in sync with the width (200) of the feature matrix
# that is filled from `model.docvecs` in the classifier section.
model = Doc2Vec(
    min_count=1,
    window=10,
    size=200,
    sample=1e-4,
    negative=5,
    workers=4,
)
# Build the vocabulary from the full labeled corpus before any training pass.
model.build_vocab(sentences)

In [None]:
# Ten passes over the corpus; the sentence order is re-shuffled in place
# before each pass so that every pass sees the data in a different order.
for training_pass in range(10):
    shuffle(sentences)
    model.train(sentences)

## Classifier

In [None]:
# Integer class id for every party tag.
mapping = {
    'SPD': 0,
    'LINKE': 1,
    'GRUENE': 2,
    'FDP': 3,
    'CDU': 4,
    'NPD': 5
}

# One 200-dimensional row and one class id per labeled sentence.
n_sentences = len(sentences)
train_arrays = np.zeros((n_sentences, 200))
train_labels = np.zeros(n_sentences)

# `sentences` was shuffled during training, so the vectors are looked up by
# their reconstructed tag ('<tag>_<i>') rather than by list position. `index`
# is the next free row; rows are filled party by party in the order of `tags`.
index = 0
for tag in tags:
    class_id = mapping[tag]
    for i in range(counts[tag]):
        train_arrays[index] = model.docvecs['{}_{}'.format(tag, i)]
        train_labels[index] = class_id
        index += 1

In [None]:
# Pair every vector with its label so that a single shuffle keeps them aligned.
comb = list(zip(train_arrays, train_labels))
shuffle(comb)

# The first 800000 shuffled pairs form the training set, the rest is held out.
# NOTE: this cell overwrites its own inputs (train_arrays / train_labels), so
# re-running it without re-running the previous cell splits the already
# truncated training data again.
train_arrays, train_labels = [], []
test_arrays, test_labels = [], []
for position, (vec, lab) in enumerate(comb):
    if position < 800000:
        train_arrays.append(vec)
        train_labels.append(lab)
    else:
        test_arrays.append(vec)
        test_labels.append(lab)

# Drop the reference to the paired list so it can be garbage collected.
comb = None

In [None]:
# Linear SVM with default settings, fitted on the training split only.
classifier = LinearSVC()
classifier.fit(X=train_arrays, y=train_labels)

In [None]:
# Accuracy on the training data alone overstates model quality, so the
# held-out split (test_arrays / test_labels) that the classifier never saw is
# scored as well.
train_accuracy = classifier.score(train_arrays, train_labels)
# The held-out split is empty when the corpus has no more sentences than the
# training cut-off; report NaN instead of calling score() on an empty set.
test_accuracy = classifier.score(test_arrays, test_labels) if len(test_arrays) else float('nan')
{'train': train_accuracy, 'test': test_accuracy}