In [1]:
import os
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.svm import LinearSVC

from corputil import FileCorpus
from corputil.utils import load_stopwords

# Stopword list read from data/german.txt. It is handed to the corpus
# tokenizer further down via `doc_token(stopwords=...)`.
# NOTE(review): presumably the tokenizer drops these words -- confirm in corputil.
stopwords_file = path.join('data', 'german.txt')
stopwords = load_stopwords(stopwords_file)

In [None]:
# One (archive file name, label) pair per source. Keeping the name and its
# label side by side makes it impossible for the two derived lists to drift
# out of alignment. Label 1 is assigned to the two "Politik" archives,
# label 0 to every other section.
labelled_archives = [
    ('Spiegel_Politik.txt', 1),
    ('Spiegel_Wirtschaft.txt', 0),
    ('Spiegel_Wissenschaft.txt', 0),
    ('Spiegel_Auto.txt', 0),
    ('Spiegel_Bildung.txt', 0),
    ('Spiegel_Geschichte.txt', 0),
    ('Spiegel_Kultur.txt', 0),
    ('Spiegel_Panorama.txt', 0),
    ('Spiegel_Reise.txt', 0),
    ('Spiegel_Sport.txt', 0),
    ('Spiegel_Technik.txt', 0),
    ('Stern_Politik.txt', 1),
    ('Stern_Panorama.txt', 0),
    ('Stern_Wirtschaft.txt', 0),
]

# Parallel lists consumed by the loading cell: full paths and their labels.
files = [path.join('data', 'Archive', name) for name, label in labelled_archives]
tags = [label for name, label in labelled_archives]

In [None]:
# Flatten all archives into two parallel lists: X holds one space-joined
# string per document, y holds the label of the archive it came from.
X, y = [], []
for archive, label in zip(files, tags):
    texts = [' '.join(tokens)
             for tokens in FileCorpus(archive).doc_token(stopwords=stopwords)]
    X.extend(texts)
    # Every document of an archive inherits that archive's label.
    y.extend([label] * len(texts))

# Two-column frame: 'doc' (text) and 'tag' (label), one row per document.
df = pd.DataFrame({'doc': X, 'tag': y}, columns=['doc', 'tag'])

In [None]:
# Shuffle the rows so the positional train/test split below mixes all
# archives. The permutation comes from a dedicated, seeded generator so that
# Restart & Run All yields the same split and the same reported score.
# Re-running only this cell shuffles the already-shuffled frame again.
SHUFFLE_SEED = 42
shuffled_order = np.random.RandomState(SHUFFLE_SEED).permutation(len(df))
df = df.iloc[shuffled_order]

In [None]:
# TF-IDF features over the 'doc' column, with min_df=20 and max_df=0.5
# limiting the vocabulary by document frequency.
# NOTE(review): the vectorizer is fitted on the whole frame, which includes
# the rows sliced off as the test set in the next cell -- confirm this is
# intended, since the document-frequency statistics then see test documents.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=20).fit(df['doc'])
vectorizer

In [None]:
# Positional train/test split of the shuffled frame: the first TRAIN_SIZE
# rows are used for training, everything after that for evaluation.
TRAIN_SIZE = 550000

# Fail early with a clear message instead of silently producing an empty
# test set (which only surfaces later as an opaque error when scoring).
if len(df) <= TRAIN_SIZE:
    raise ValueError(
        'Corpus has {} documents, need more than TRAIN_SIZE={} to leave a '
        'non-empty test set'.format(len(df), TRAIN_SIZE))

# .iloc makes the positional intent explicit; the frame's index labels are
# shuffled, so label-based slicing would not mean the same thing.
training = df.iloc[:TRAIN_SIZE]
test = df.iloc[TRAIN_SIZE:]

# Project both partitions into the already-fitted TF-IDF space.
train_tfidf = vectorizer.transform(training['doc'])
test_tfidf = vectorizer.transform(test['doc'])

In [None]:
# Linear SVM with default settings, trained on the TF-IDF matrix of the
# training rows against their 'tag' labels.
classifier = LinearSVC().fit(train_tfidf, training['tag'])
classifier

In [None]:
# Score of the fitted classifier on the held-out rows (for scikit-learn
# classifiers `score` reports mean accuracy).
test_accuracy = classifier.score(test_tfidf, test['tag'])
test_accuracy

In [None]:
# Persist the fitted vectorizer and classifier side by side so they can be
# reloaded together for prediction.
model_dir = path.join('models', 'classifier')

# The dump writes straight to the given path, so the target directory has
# to exist beforehand; create it on a fresh checkout.
if not path.isdir(model_dir):
    os.makedirs(model_dir)

joblib.dump(vectorizer, path.join(model_dir, 'Vectorizer.pkl'))
joblib.dump(classifier, path.join(model_dir, 'Classifier.pkl'))