## Import necessary libraries

In [67]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, hamming_loss, confusion_matrix
from sklearn.externals import joblib

## Read data

In [68]:
# Load the labelled posts. The file is parsed without a header row and the
# columns are named "text" and "label" explicitly.
data = pd.read_csv(
    "training_data_posts.csv",
    sep=',',
    header=None,
    names=["text", "label"],
    index_col=False,
)

In [69]:
# Display the loaded DataFrame to eyeball its contents.
data

Unnamed: 0,text,label
0,Der Vektorbegriff ist für die Schüler kein Pro...,public
1,"Arbeiten ist gut, ich musste mir auch seit jeh...",public
2,"wenn man ""die Studierenden"" bzw. ""die Lehrende...",public
3,Also Koller (NT) - Janko ist schon eine perfek...,public
4,Sind es eigentlich ein und die gleichen Mensch...,public
5,Das Video aus Polen soll keiner sehen? Auf You...,public
6,Na fangen's als Erwachsener mit eigenem Hausha...,public
7,"Was wollen Sie? Er schreibt doch, daß er sich ...",public
8,Ich hbe de Bchstbn as der Tastrt entfernt! Si...,public
9,Senkung der Pull-Faktoren: man müsste nur die ...,public


## Stop words

Use `stop_words` to remove less meaningful words. Stop words carry little meaning on their own and occur very frequently in most texts, so removing them reduces noise in the extracted features.

In [70]:
import io
import unidecode

# Build the German stop-word list that is later passed to the vectorizer.
# Every line of the file is treated as one stop word; surrounding whitespace is
# stripped and the word is transliterated to plain ASCII with unidecode
# (the list contains e.g. 'dafur' and 'daruber').
with io.open('stopwords_german.txt', mode='r', encoding='utf-8') as f:
    content = [unidecode.unidecode(line.strip()) for line in f]

# 'rt' is added on top of the entries read from the file.
content.append('rt')

# Transliteration can collapse different spellings into the same ASCII token
# (the raw list ends up with 'ausser' and 'dass' twice), and blank lines in the
# file turn into empty strings. Drop empties and repeated entries while keeping
# the first-seen order (dict keys preserve insertion order).
content = list(dict.fromkeys(word for word in content if word))

In [71]:
# Inspect the resulting stop-word list.
content

['a',
 'ab',
 'aber',
 'ach',
 'acht',
 'achte',
 'achten',
 'achter',
 'achtes',
 'ag',
 'alle',
 'allein',
 'allem',
 'allen',
 'aller',
 'allerdings',
 'alles',
 'allgemeinen',
 'als',
 'also',
 'am',
 'an',
 'ander',
 'andere',
 'anderem',
 'anderen',
 'anderer',
 'anderes',
 'anderm',
 'andern',
 'anderr',
 'anders',
 'au',
 'auch',
 'auf',
 'aus',
 'ausser',
 'ausserdem',
 'ausser',
 'ausserdem',
 'b',
 'bald',
 'bei',
 'beide',
 'beiden',
 'beim',
 'beispiel',
 'bekannt',
 'bereits',
 'besonders',
 'besser',
 'besten',
 'bin',
 'bis',
 'bisher',
 'bist',
 'c',
 'd',
 'd.h',
 'da',
 'dabei',
 'dadurch',
 'dafur',
 'dagegen',
 'daher',
 'dahin',
 'dahinter',
 'damals',
 'damit',
 'danach',
 'daneben',
 'dank',
 'dann',
 'daran',
 'darauf',
 'daraus',
 'darf',
 'darfst',
 'darin',
 'darum',
 'darunter',
 'daruber',
 'das',
 'dasein',
 'daselbst',
 'dass',
 'dasselbe',
 'davon',
 'davor',
 'dazu',
 'dazwischen',
 'dass',
 'dein',
 'deine',
 'deinem',
 'deinen',
 'deiner',
 'deines',

## Split data

In [72]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.1,
    random_state=1234,
)

In [73]:
# Number of training texts.
X_train.shape

(306,)

In [74]:
# Number of training labels; should match the number of training texts.
y_train.shape

(306,)

## Define ML pipeline

Set ngram_range to (1,1) for outputting only one-word tokens, (1,2) for one-word and two-word tokens, (2, 3) for two-word and three-word tokens, etc.

ngram_range works hand-in-hand with analyzer. Set analyzer to "word" for outputting words and phrases, or set it to "char" to output character ngrams.

If you want your output to have both "word" and "char" features, use sklearn's FeatureUnion.

Since stop words generally have a high frequency, it might make sense to set max_df to a float such as 0.95, which ignores terms that appear in more than 95% of the documents. This assumes that the most frequent terms are all stop words, which might not be the case; it really depends on your text data. In some lines of work, it's very common that the top words or phrases are NOT stop words because you work with dense text (e.g. search query data) on very specific topics.

Use min_df as an integer to remove rarely occurring words. If they occur in only one or two documents, they won't add much value and are usually really obscure. Furthermore, there are generally a lot of them, so ignoring them with, say, min_df=5 can greatly reduce your memory consumption and data size.

token_pattern allows you to use a regex pattern, e.g. `\b\w\w+\b`, which means that tokens have to be at least 2 characters long, so words like "I" and "a" are removed, as are single-digit numbers (0 - 9). You'll also notice it removes apostrophes, because they are not word characters.

In [75]:
# Two-stage text classifier: TF-IDF features feeding a linear-kernel SVM.
pipeline = Pipeline(steps=[
    # Stage 1: turn raw text into TF-IDF weighted word n-grams (1 to 3 words).
    ('tfidv', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1,3),
        strip_accents='unicode',
        stop_words=content,
        # NOTE: use_idf=False AND norm=None is equivalent to using sklearn's
        # CountVectorizer. It will just return counts.
        use_idf=True,
        sublinear_tf=True,
        # Vocabulary limits: document-frequency bounds plus a cap on the
        # number of features kept.
        min_df=2,
        max_df=1.0,
        max_features=100,
    )),
    # Stage 2: support vector classifier with a linear kernel;
    # probability=True turns on probability estimates.
    ('lin_svc', svm.SVC(
        kernel='linear',
        C=1.0,
        probability=True,
    )),
])

Pipeline(memory=None,
     steps=[('tfidv', TfidfVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=6000, min_df=2,
        ngram_range=(1, 20), norm='l2', preprocessor=None, smooth_idf=T...ar', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

## Train model

In [None]:
# Fit both pipeline stages (vectorizer, then classifier) on the training split.
pipeline.fit(X_train,y_train)

## Score model

In [76]:
def score_model(true, pred):
    """Print accuracy, weighted F1, weighted precision and Hamming loss.

    Args:
        true: Ground-truth labels.
        pred: Predicted labels to compare against ``true``.

    Returns:
        None. Each score is written to stdout on its own line.
    """
    # Each entry pairs the printed label with a deferred scorer call. The
    # scorers run inside the loop, so every metric is computed and printed
    # one after another in the listed order.
    reports = (
        ('Accuracy:', lambda: accuracy_score(true, pred)),
        ('F1:', lambda: f1_score(true, pred, average='weighted')),
        ('Precision:', lambda: precision_score(true, pred, average='weighted')),
        ('Hamming loss', lambda: hamming_loss(true, pred)),
    )
    for label, compute in reports:
        print(label, compute())


# Report the metrics for predictions on the held-out test split.
score_model(y_test,pipeline.predict(X_test))

Accuracy: 0.9714285714285714
F1: 0.9712316968130923
Precision: 0.9727272727272728
Hamming loss 0.02857142857142857


## Save model

In [84]:
#joblib.dump(pipeline, 'model.pkl', compress=3)


['model.pkl']