In [10]:
import pandas as pd

In [11]:
df = pd.read_csv('../../2_Feature_Engineering/export/randomized_balanced_dataset.csv')
df.head()

Unnamed: 0,raw_text,severity,binary_label
0,"Ich warte schon darauf, dass man die Terrorist...",0.0,False
1,Holt Björn Höcke ins Boot vielleicht haben wir...,0.0,False
2,"Deutscher, turkmenischer Abstammung bitte. Was...",0.0,False
3,"Wer glaubt die gehen wieder , ist einfach nur ...",0.0,False
4,Man wünscht sich eine schlagkräftige Bürgerweh...,0.0,False


# Clean text

In [12]:
import re
# remove links and user hashes
def clean_tweet(text):
    text = re.sub(r'http\S+', '', text) # links
    text = re.sub(r'^[a-f0-9]{16}', '', text) # user hashes
    text = re.sub(r'@(\w+)',  '', text) # usernames
    text = re.sub(r'RT',  '',text) # RT --> retweets
    return text
df['cleaned_text'] = df['raw_text'].apply(clean_tweet)

In [13]:
from model_helpers import clean_input as ci

In [14]:
df['cleaned_text'] = df['cleaned_text'].apply(ci)

# Vectorize

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
tfidf_trigram = TfidfVectorizer(ngram_range=(3,3), analyzer='char', stop_words='german')
tfidf_simple = TfidfVectorizer()

In [14]:
X_features_trigram = tfidf_trigram.fit_transform(df['raw_text'])
X_features_trigram

<1828x10999 sparse matrix of type '<class 'numpy.float64'>'
	with 219080 stored elements in Compressed Sparse Row format>

In [15]:
X_features_simple = tfidf_simple.fit_transform(df['cleaned_text'])
X_features_simple

<1828x8145 sparse matrix of type '<class 'numpy.float64'>'
	with 20628 stored elements in Compressed Sparse Row format>

# Train with K-Fold & Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [25]:
from sklearn.model_selection import KFold, cross_val_score

In [26]:
k_fold = KFold(n_splits=5, shuffle=True)

In [23]:
lr = LogisticRegression()

In [20]:
# logistic regression without n-grams
lr_score = cross_val_score(lr, X_features_simple, df['binary_label'], cv=k_fold, scoring='accuracy', n_jobs=-1)
lr_score

array([0.72131148, 0.7295082 , 0.72131148, 0.71506849, 0.69041096])

In [21]:
# k-fold with bigrams
lr_trigram_score = cross_val_score(lr, X_features_trigram, df['binary_label'], cv=k_fold, scoring='accuracy', n_jobs=-1)
lr_trigram_score

array([0.75956284, 0.74590164, 0.7568306 , 0.73150685, 0.75068493])

# Train with K-Fold & Multinomial Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [23]:
mnb = MultinomialNB()

In [24]:
# MNB without n-grams
mnb_score = cross_val_score(mnb, X_features_simple.toarray(), df['binary_label'], cv=k_fold, scoring='accuracy', n_jobs=-1)
mnb_score

array([0.71584699, 0.73770492, 0.68306011, 0.67123288, 0.72328767])

In [25]:
# MNB with bigrams
mnb_trigram_score = cross_val_score(mnb, X_features_trigram.toarray(), df['binary_label'], cv=k_fold, scoring='accuracy', n_jobs=-1)
mnb_trigram_score

array([0.69125683, 0.71038251, 0.71857923, 0.71232877, 0.64931507])

# results

at least with multinomial naive bayes character n-grams seem to work better

In [26]:
scores = {
    'mnb': sum(mnb_score) / len(mnb_score),
    'mnb_trigram': sum(mnb_trigram_score) / len(mnb_trigram_score),
    'lr': sum(lr_score) / len(lr_score),
    'lr_trigram': sum(lr_trigram_score) / len(lr_trigram_score),

}

for key, value in sorted(scores.items(), key=lambda item: item[1], reverse=True):
    print("%s: %s" % (key, value))

lr_trigram: 0.7488973725578262
lr: 0.7155221199191557
mnb: 0.7062265139606259
mnb_trigram: 0.6963724829702822


# try different character n-grams

In [27]:
tfidf_2_2 = TfidfVectorizer(ngram_range=(2,2), analyzer='char', stop_words='german', max_features=50000)
tfidf_2_3 = TfidfVectorizer(ngram_range=(2,3), analyzer='char', stop_words='german', max_features=50000)
tfidf_2_4 = TfidfVectorizer(ngram_range=(2,4), analyzer='char', stop_words='german', max_features=50000)

tfidf_3_3 = TfidfVectorizer(ngram_range=(3,3), analyzer='char', stop_words='german', max_features=50000)
tfidf_3_4 = TfidfVectorizer(ngram_range=(3,4), analyzer='char', stop_words='german', max_features=50000)

In [28]:
X_features_2_2 = tfidf_2_2.fit_transform(df['raw_text'])
X_features_2_3 = tfidf_2_3.fit_transform(df['raw_text'])
X_features_2_4 = tfidf_2_4.fit_transform(df['raw_text'])

X_features_3_3 = tfidf_3_3.fit_transform(df['raw_text'])
X_features_3_4 = tfidf_3_4.fit_transform(df['raw_text'])

In [29]:
options = {
    'X_features_2_2': X_features_2_2,
    'X_features_2_3': X_features_2_3,
    'X_features_2_4': X_features_2_4,
    'X_features_3_3': X_features_3_3,
    'X_features_3_4': X_features_3_4,
}

## Multinomial Naive Bayes

In [30]:
for index, features in options.items():
    score = cross_val_score(mnb, features.toarray(), df['binary_label'], cv=k_fold, scoring='accuracy', n_jobs=-1)
    avg = sum(score) / len(score)
    print("%s: %s (%s)" % (index, avg, score))

X_features_2_2: 0.6756254210644508 ([0.65027322 0.67213115 0.66120219 0.70410959 0.69041096])
X_features_2_3: 0.6767063402949323 ([0.66666667 0.69945355 0.64480874 0.70136986 0.67123288])
X_features_2_4: 0.674480125757916 ([0.67486339 0.69398907 0.70491803 0.64109589 0.65753425])
X_features_3_3: 0.6947675724230855 ([0.66939891 0.66666667 0.71311475 0.69863014 0.7260274 ])
X_features_3_4: 0.6920308406317839 ([0.6420765  0.70218579 0.69945355 0.68219178 0.73424658])


## Logistic Regression

In [31]:
for index, features in options.items():
    score = cross_val_score(lr, features.toarray(), df['binary_label'], cv=k_fold, scoring='accuracy', n_jobs=-1)
    avg = sum(score) / len(score)
    print("%s: %s (%s)" % (index, avg, score))

X_features_2_2: 0.7111595179279886 ([0.71311475 0.72404372 0.69672131 0.69041096 0.73150685])
X_features_2_3: 0.7418040272475485 ([0.71311475 0.75956284 0.73497268 0.75890411 0.74246575])
X_features_2_4: 0.7598143573620779 ([0.80054645 0.78415301 0.75409836 0.7260274  0.73424658])
X_features_3_3: 0.7598413054869375 ([0.76775956 0.76229508 0.75956284 0.73972603 0.76986301])
X_features_3_4: 0.7500097312673105 ([0.74043716 0.73224044 0.75956284 0.74520548 0.77260274])


# aun mas

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
from sklearn.pipeline import Pipeline

In [21]:
tfidf_vec = TfidfVectorizer(analyzer='char', stop_words='german')

In [28]:
parameters = {'tfidf_vec__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(2,2),(2,3),(2,4),(2,5),(3,3),(3,4),(3,5),(3,6),(4,4),(4,5),(4,6),(5,5),(5,6),(6,6)], 'tfidf_vec__max_features': [10000,50000,100000]}

## Logistic Regression

In [36]:
clf = Pipeline([('tfidf_vec', tfidf_vec), ('lr', lr)])
gs_clf = GridSearchCV(clf, parameters, n_jobs=-1, scoring='f1', cv=k_fold)

In [37]:
gs_clf = gs_clf.fit(df['raw_text'], df['binary_label'])
sorted(gs_clf.cv_results_.keys())



['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_tfidf_vec__max_features',
 'param_tfidf_vec__ngram_range',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'split3_test_score',
 'split3_train_score',
 'split4_test_score',
 'split4_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [38]:
# best parameters
gs_clf.best_params_


{'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 5)}

In [39]:
means = gs_clf.cv_results_['mean_test_score']
stds = gs_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))

0.553 (+/-0.086) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 1)}
0.626 (+/-0.066) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 2)}
0.675 (+/-0.050) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 3)}
0.684 (+/-0.050) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 4)}
0.646 (+/-0.064) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 2)}
0.671 (+/-0.030) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 3)}
0.679 (+/-0.052) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 4)}
0.681 (+/-0.037) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 5)}
0.678 (+/-0.037) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 3)}
0.676 (+/-0.047) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 4)}
0.684 (+/-0.041) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 5)}
0.678 (+/-

In [29]:
# same on accuracy
clf = Pipeline([('tfidf_vec', tfidf_vec), ('lr', lr)])
gs_clf = GridSearchCV(clf, parameters, n_jobs=-1, scoring='accuracy', cv=k_fold)
gs_clf = gs_clf.fit(df['raw_text'], df['binary_label'])

means = gs_clf.cv_results_['mean_test_score']
stds = gs_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))

0.641 (+/-0.026) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 1)}
0.701 (+/-0.049) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 2)}
0.743 (+/-0.042) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 3)}
0.753 (+/-0.027) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 4)}
0.716 (+/-0.029) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 2)}
0.747 (+/-0.044) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 3)}
0.763 (+/-0.038) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 4)}
0.754 (+/-0.038) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 5)}
0.753 (+/-0.049) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 3)}
0.763 (+/-0.046) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 4)}
0.753 (+/-0.043) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 5)}
0.752 (+/-



In [30]:
# best parameters
gs_clf.best_params_

{'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 4)}

# Multinomial Naive Bayes

In [42]:
# on accuracy
clf = Pipeline([('tfidf_vec', tfidf_vec), ('mnb', mnb)])
gs_clf_acc = GridSearchCV(clf, parameters, n_jobs=-1, scoring='accuracy', cv=k_fold)
gs_clf_acc = gs_clf_acc.fit(df['raw_text'], df['binary_label'])

means = gs_clf_acc.cv_results_['mean_test_score']
stds = gs_clf_acc.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs_clf_acc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
    
print('best: {}'.format(gs_clf_acc.best_params_))

0.609 (+/-0.044) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 1)}
0.653 (+/-0.058) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 2)}
0.668 (+/-0.056) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 3)}
0.715 (+/-0.039) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 4)}
0.673 (+/-0.028) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 2)}
0.693 (+/-0.051) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 3)}
0.726 (+/-0.034) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 4)}
0.736 (+/-0.037) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 5)}
0.701 (+/-0.047) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 3)}
0.736 (+/-0.032) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 4)}
0.743 (+/-0.039) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 5)}
0.737 (+/-

In [43]:
# on f1
clf = Pipeline([('tfidf_vec', tfidf_vec), ('mnb', mnb)])
gs_clf_acc = GridSearchCV(clf, parameters, n_jobs=-1, scoring='f1', cv=k_fold)
gs_clf_acc = gs_clf_acc.fit(df['raw_text'], df['binary_label'])

means = gs_clf_acc.cv_results_['mean_test_score']
stds = gs_clf_acc.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs_clf_acc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
    
print('best: {}'.format(gs_clf_acc.best_params_))

0.301 (+/-0.074) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 1)}
0.448 (+/-0.073) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 2)}
0.452 (+/-0.091) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 3)}
0.589 (+/-0.102) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (1, 4)}
0.530 (+/-0.073) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 2)}
0.532 (+/-0.063) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 3)}
0.622 (+/-0.103) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 4)}
0.627 (+/-0.080) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (2, 5)}
0.558 (+/-0.075) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 3)}
0.641 (+/-0.095) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 4)}
0.651 (+/-0.073) for {'tfidf_vec__max_features': 10000, 'tfidf_vec__ngram_range': (3, 5)}
0.648 (+/-

# Resultat
Bisher bestes Resultat
- Logistic Regression Classifier
- TF-IDF Vectorizer mit {'tfidf_vec__max_features': 100000, 'tfidf_vec__ngram_range': (2, 5)}