In [1]:
from os import path
import pandas as pd
import numpy as np
from corputil import FileCorpus
from corputil.utils import load_stopwords
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from gensim.corpora import Dictionary
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
import matplotlib.pyplot as plt

stopwords = load_stopwords(path.join('data', 'german.txt'))

In [2]:
files = [
    path.join('data', 'Archive', 'Spiegel_Politik.txt'),
    path.join('data', 'Archive', 'Spiegel_Wirtschaft.txt'),
    path.join('data', 'Archive', 'Spiegel_Wissenschaft.txt'),
    path.join('data', 'Archive', 'Spiegel_Auto.txt'),
    path.join('data', 'Archive', 'Spiegel_Bildung.txt'),
    path.join('data', 'Archive', 'Spiegel_Geschichte.txt'),
    path.join('data', 'Archive', 'Spiegel_Kultur.txt'),
    path.join('data', 'Archive', 'Spiegel_Panorama.txt'),
    path.join('data', 'Archive', 'Spiegel_Reise.txt'),
    path.join('data', 'Archive', 'Spiegel_Sport.txt'),
    path.join('data', 'Archive', 'Spiegel_Technik.txt'),
    path.join('data', 'Archive', 'Stern_Politik.txt'),
    path.join('data', 'Archive', 'Stern_Panorama.txt'),
    path.join('data', 'Archive', 'Stern_Wirtschaft.txt')]
tags = ['Politik', 'Wirtschaft', 'Wissenschaft', 'Auto', 'Bildung', 'Geschichte', 
        'Kultur', 'Panorama', 'Reise', 'Sport', 'Technik', 'Politik', 'Panorama', 'Wirtschaft']
colors = {'Politik': 'red', 
          'Wirtschaft': 'blue', 
          'Wissenschaft': 'cyan', 
          'Auto': 'yellow', 
          'Bildung': 'green', 
          'Geschichte': 'grey', 
          'Kultur': 'purple', 
          'Panorama': 'orange', 
          'Reise': 'lime', 
          'Sport': 'brown', 
          'Technik': 'black'}

In [3]:
X, y = [], []
for file, tag in zip(files, tags):
    for doc in FileCorpus(file).doc_token(stopwords=stopwords):
        X.append(' '.join(doc))
        y.append(tag)

df = pd.DataFrame()
df['doc'] = X
df['tag'] = y

In [4]:
df = df.iloc[np.random.permutation(len(df))]

In [5]:
vectorizer = TfidfVectorizer(min_df=20, max_df=0.5)
vectorizer.fit(df['doc'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=20,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [6]:
training = df[:550000]
test = df[550000:]
train_tfidf = vectorizer.transform(training['doc'])
test_tfidf = vectorizer.transform(test['doc'])

In [7]:
classifier = LinearSVC()
classifier.fit(train_tfidf, training['tag'])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [8]:
classifier.score(test_tfidf, test['tag'])

0.86578230898453334

In [9]:
# joblib.dump(vectorizer, path.join('models', 'classifier', 'Vectorizer.pkl'))
# joblib.dump(classifier, path.join('models', 'classifier', 'Classifier.pkl'))

In [10]:
KW44 = FileCorpus(path.join('data', 'CurrentNews', '2015KW50.txt'))
KW44 = list(KW44.doc_token(stopwords=stopwords))

In [11]:
nX = []
for doc in KW44:
    nX.append(' '.join(doc))

KW44 = None

tfidf = vectorizer.transform(nX)
labels = list(classifier.predict(tfidf))

In [12]:
prep = TruncatedSVD(n_components=50, random_state=0).fit_transform(tfidf)
reduced = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(prep)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 7274
[t-SNE] Computed conditional probabilities for sample 2000 / 7274
[t-SNE] Computed conditional probabilities for sample 3000 / 7274
[t-SNE] Computed conditional probabilities for sample 4000 / 7274
[t-SNE] Computed conditional probabilities for sample 5000 / 7274
[t-SNE] Computed conditional probabilities for sample 6000 / 7274
[t-SNE] Computed conditional probabilities for sample 7000 / 7274
[t-SNE] Computed conditional probabilities for sample 7274 / 7274
[t-SNE] Mean sigma: 0.000000
[t-SNE] Iteration 25: error = 1.4133631, gradient norm = 0.0075588
[t-SNE] Iteration 50: error = 1.3904482, gradient norm = 0.0081721
[t-SNE] Iteration 75: error = 1.2485454, gradient norm = 0.0028465
[t-SNE] Iteration 100: error = 1.2102959, gradient norm = 0.0024277
[t-SNE] Error after 100 iterations with early exaggeration: 1.210296
[t-SNE] Iteration 125:

In [13]:
ndf = pd.DataFrame(reduced)
ndf['tag'] = labels

In [14]:
stuff = [
    ndf.loc[ndf['tag'] == 'Politik'],
    ndf.loc[ndf['tag'] == 'Wirtschaft'],
    ndf.loc[ndf['tag'] == 'Wissenschaft'],
    ndf.loc[ndf['tag'] == 'Auto'],
    ndf.loc[ndf['tag'] == 'Bildung'],
    ndf.loc[ndf['tag'] == 'Geschichte'],
    ndf.loc[ndf['tag'] == 'Reise'],
    ndf.loc[ndf['tag'] == 'Sport'],
    ndf.loc[ndf['tag'] == 'Technik'],
    ndf.loc[ndf['tag'] == 'Panorama'],
    ndf.loc[ndf['tag'] == 'Kultur']
]

In [15]:
plt.title('KW 50')
plt.axis('off')
for l in stuff:
    tag = l['tag'].iloc[0]
    plt.scatter(l[0], l[1], c=colors[tag], label=tag, marker='x')
plt.legend()
plt.show()