In [None]:
from os import path
from corputil import FileCorpus
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Spiegel sections covered by the archive. Each entry is both the class tag
# attached to a document and the suffix of the archive file it is read from.
tags = [
    'Politik', 'Wirtschaft', 'Wissenschaft', 'Auto', 'Bildung',
    'Geschichte', 'Kultur', 'Panorama', 'Reise', 'Sport', 'Technik'
]
# Derive the paths from the tags instead of keeping a second hand-written
# list: the two are zipped together later, so they must stay in the same
# order and have the same length.
files = [
    path.join('data', 'Archive', 'Spiegel_' + tag + '.txt') for tag in tags
]

In [None]:
# Collect every document of every archive as one whitespace-joined string,
# together with a parallel list holding the section tag of its source file.
documents, doc_tags = [], []
for file, tag in zip(files, tags):
    # NOTE(review): doc_token presumably yields one token sequence per
    # document with German stopwords removed -- confirm against corputil.
    for doc in FileCorpus(file).doc_token(stopwords='german'):
        documents.append(' '.join(doc))
        doc_tags.append(tag)
training = {'documents': documents, 'tags': doc_tags}

# TF-IDF features; terms occurring in fewer than 20 documents or in more
# than half of all documents are dropped from the vocabulary.
vectorizer = TfidfVectorizer(min_df=20, max_df=0.5)
tfidf = vectorizer.fit_transform(documents)

In [None]:
from sklearn.cluster import KMeans

# Cluster the TF-IDF vectors into one cluster per Spiegel section.
# - n_jobs is no longer passed: the parameter was removed from KMeans in
#   scikit-learn 1.0 and raises a TypeError there.
# - n_init is pinned because its default differs between scikit-learn
#   versions.
# - random_state is fixed so the cluster assignment (and the plot built from
#   it) is reproducible after a kernel restart.
km = KMeans(n_clusters=len(tags), n_init=10, random_state=0)
km.fit(tfidf)

In [None]:
# Cluster index assigned to each training document during fitting.
labels = km.labels_
# Re-express every document by its distances to the fitted cluster centres.
points = km.transform(tfidf)

In [None]:
from sklearn.decomposition import PCA

# Reduce the per-document feature vectors in `points` to two principal
# components so they can be drawn on a plane.
pca = PCA(n_components=2)
pca.fit(points)
reduced = pca.transform(points)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Columns 0 and 1 hold the two PCA components; 'tags' keeps the original
# Spiegel section of each document next to its coordinates.
df = pd.DataFrame(reduced)
df['tags'] = training['tags']

# Scatter the documents in PCA space, coloured by their KMeans cluster.
# Use the explicit Axes interface and label the figure so it can be read
# without the surrounding code.
fig, ax = plt.subplots()
ax.scatter(df[0], df[1], c=labels)
ax.set_xlabel('PCA component 1')
ax.set_ylabel('PCA component 2')
ax.set_title('KMeans clusters of Spiegel articles (2-D PCA projection)')
plt.show()