In [4]:
from os import path
import pandas as pd
import numpy as np
from corputil import FileCorpus
from corputil.utils import load_stopwords
from sklearn.manifold import TSNE
from gensim.corpora import Dictionary
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
import matplotlib.pyplot as plt

# German stop-word list; passed to the corpus tokenisers in later cells.
stopwords_file = path.join('data', 'german.txt')
stopwords = load_stopwords(stopwords_file)

In [5]:
# Training corpora, one text file per (publisher, section) pair, stored as
# data/Archive/<publisher>_<section>.txt.  `files` and `tags` are derived from
# this single table so the two lists always stay aligned.
_sources = [
    ('Spiegel', 'Politik'),
    ('Spiegel', 'Wirtschaft'),
    ('Spiegel', 'Wissenschaft'),
    ('Spiegel', 'Auto'),
    ('Spiegel', 'Bildung'),
    ('Spiegel', 'Geschichte'),
    ('Spiegel', 'Kultur'),
    ('Spiegel', 'Panorama'),
    ('Spiegel', 'Reise'),
    ('Spiegel', 'Sport'),
    ('Spiegel', 'Technik'),
    ('Stern', 'Politik'),
    ('Stern', 'Panorama'),
    ('Stern', 'Wirtschaft'),
]

# Path of every corpus file, in table order.
files = [path.join('data', 'Archive', '{}_{}.txt'.format(publisher, section))
         for publisher, section in _sources]

# Class label paired with the file at the same position in `files`.
tags = [section for _, section in _sources]

# Marker colour used for each class label in the scatter plot.
colors = dict(
    Politik='red',
    Wirtschaft='blue',
    Wissenschaft='cyan',
    Auto='yellow',
    Bildung='green',
    Geschichte='grey',
    Kultur='purple',
    Panorama='orange',
    Reise='lime',
    Sport='brown',
    Technik='black',
)

In [6]:
# Build the labelled corpus: every document of every file becomes one row,
# stored as a single space-joined token string together with the file's tag.
X, y = [], []
for corpus_file, section in zip(files, tags):
    documents = [' '.join(tokens)
                 for tokens in FileCorpus([corpus_file]).doc_token(stopwords=stopwords)]
    X.extend(documents)
    y.extend([section] * len(documents))

# Two columns: 'doc' (document text) and 'tag' (class label).
df = pd.DataFrame({'doc': X, 'tag': y})

In [7]:
# Shuffle the rows so the positional train/test split below mixes all sections.
# A fixed seed keeps the permutation (and therefore the split and the reported
# score) identical across "Restart & Run All" executions.
SHUFFLE_SEED = 0
df = df.iloc[np.random.RandomState(SHUFFLE_SEED).permutation(len(df))]

In [8]:
# TF-IDF vocabulary: drop terms that occur in fewer than 20 documents or in
# more than half of all documents.
# NOTE(review): the vectorizer is fitted on every row of `df`, i.e. before the
# train/test split in the following cell, so the IDF statistics also see the
# held-out documents — confirm this is intended.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=20)
vectorizer.fit(df.loc[:, 'doc'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=20,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [9]:
# Positional split of the shuffled frame: the first TRAIN_ROWS rows are used
# for training, everything after them is held out for evaluation.
TRAIN_ROWS = 550000
training, test = df.iloc[:TRAIN_ROWS], df.iloc[TRAIN_ROWS:]

# Project both partitions into the fitted TF-IDF space.
train_tfidf = vectorizer.transform(training.loc[:, 'doc'])
test_tfidf = vectorizer.transform(test.loc[:, 'doc'])

In [10]:
# Linear SVM with default settings, trained on the TF-IDF vectors of the
# training partition with the section tag as the target.
classifier = LinearSVC()
classifier.fit(X=train_tfidf, y=training.loc[:, 'tag'])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [11]:
# Mean accuracy of the fitted classifier on the held-out partition (`test`).
classifier.score(test_tfidf, test['tag'])

0.86440653420610558

In [12]:
# joblib.dump(vectorizer, path.join('models', 'classifier', 'Vectorizer.pkl'))
# joblib.dump(classifier, path.join('models', 'classifier', 'Classifier.pkl'))

In [15]:
from corputil import ListCorpus

# Load one week of current news ('|'-separated CSV) and tokenise its 'text'
# column with the same stop-word list used for the training corpus.
news_path = path.join('data', 'CurrentNews', '2015KW44.csv')
load = pd.read_csv(news_path, encoding='utf-8', sep='|')
texts = list(load['text'])
KW44 = list(ListCorpus(texts).doc_token(stopwords=stopwords))

In [16]:
# Re-join each tokenised news document into one space-separated string,
# the input format the TF-IDF vectorizer was fitted on.
nX = [' '.join(tokens) for tokens in KW44]

# Drop the reference to the token lists; only the joined strings are used below.
KW44 = None

# Vectorise the news documents and predict a section label for each of them.
tfidf = vectorizer.transform(nX)
labels = list(classifier.predict(tfidf))

In [17]:
# Two-stage dimensionality reduction of the news TF-IDF matrix:
#   1. TruncatedSVD compresses the TF-IDF vectors to 50 components.
#   2. t-SNE embeds those 50-d vectors into 2-D for plotting.
# `reduced` is read by the later cell that builds `ndf`; with the t-SNE line
# commented out that cell fails with NameError on a fresh kernel, so the step
# is active again.  Both stages are seeded so the embedding is reproducible.
prep = TruncatedSVD(n_components=50, random_state=0).fit_transform(tfidf)
reduced = TSNE(n_components=2, perplexity=40, random_state=0, verbose=2).fit_transform(prep)

In [20]:
# Inspect the first row of `prep`, i.e. the TruncatedSVD output for the first news document.
prep[0]

array([  4.75329662e-01,  -9.33169483e-02,   2.14258408e-02,
        -1.18644436e-01,   7.82301730e-02,   1.86708974e-01,
         1.15395966e-02,   1.99937221e-01,  -1.26114966e-01,
        -4.54818739e-03,  -8.27293248e-02,  -5.03009822e-02,
         3.08224021e-03,   1.93017621e-03,  -7.08446166e-02,
        -1.05548332e-02,  -5.22478803e-02,  -6.98948640e-02,
        -1.36784691e-01,   1.11767380e-01,   5.25786045e-03,
        -1.40401192e-01,  -7.56539773e-02,  -4.32296892e-02,
        -2.46994906e-02,  -7.07501085e-02,  -1.46495009e-02,
        -1.08252763e-02,  -4.89016499e-02,   2.69399827e-02,
         4.82372527e-03,  -7.29107125e-03,   2.91763695e-02,
         3.56682440e-02,  -7.43758433e-03,   1.36151983e-02,
        -3.28462466e-02,   1.20567469e-02,  -3.41455614e-02,
        -1.34318310e-02,   2.11979299e-02,  -6.10165442e-02,
        -1.26459380e-02,   8.11566900e-03,   1.00700285e-02,
        -3.85349196e-02,  -7.27082171e-02,  -3.67246859e-04,
         1.69218576e-02,

In [13]:
# One row per news document: the columns of `reduced` (read as columns 0 and 1
# by the plotting cell) plus the predicted section label in 'tag'.
# `reduced` must already be defined when this cell runs — presumably the 2-D
# t-SNE embedding of `prep`.
ndf = pd.DataFrame(reduced).assign(tag=labels)

In [14]:
# Split the embedded documents into one sub-frame per predicted section.
# The order below is the order in which the groups are drawn and listed in
# the plot legend.
plot_order = ('Politik', 'Wirtschaft', 'Wissenschaft', 'Auto', 'Bildung', 'Geschichte',
              'Reise', 'Sport', 'Technik', 'Panorama', 'Kultur')
stuff = [ndf.loc[ndf['tag'] == section] for section in plot_order]

In [15]:
# Scatter plot of the 2-D embedding, one colour per predicted section.
# NOTE(review): the title says 'KW 50' while the data cell loads
# '2015KW44.csv' — confirm which calendar week is intended.
plt.title('KW 50')
plt.axis('off')
for group in stuff:
    # A section with no predicted documents yields an empty frame; reading
    # its first tag via .iloc[0] would raise IndexError, so skip it.
    if group.empty:
        continue
    tag = group['tag'].iloc[0]
    plt.scatter(group[0], group[1], c=colors[tag], label=tag, marker='x')
plt.legend()
plt.show()