In [40]:
import pandas as pd

# read in

In [41]:
df = pd.read_csv('../../2_Feature_Engineering/export/combined_polly_bretschneider_iwg.csv')
df.head()

Unnamed: 0,raw_text,severity,binary_label
0,ist ekelhaft bei ihren Klos kannst du nur mit...,0.0,False
1,Bevor Sie mit Jud...,1.0,True
2,Und de Amis freut es wenn die Heuschrecken Eur...,0.0,False
3,": Die Krim ist ein heißer Tip, auch zum In...",1.0,True
4,: Es muss in #Deutschland eine politische Kra...,1.0,True


# clean

In [42]:
import sys
sys.path.insert(0, r'..\..\0_common')
from model_helpers import clean_all as ci

In [43]:
df['cleaned_text'] = df['raw_text'].apply(ci)
df.head()

Unnamed: 0,raw_text,severity,binary_label,cleaned_text
0,ist ekelhaft bei ihren Klos kannst du nur mit...,0.0,False,ekelhaft klos kannst gummistiefel gehen stinkt...
1,Bevor Sie mit Jud...,1.0,True,bevor judenstern markiert worden juden ganze ...
2,Und de Amis freut es wenn die Heuschrecken Eur...,0.0,False,de amis freut heuschrecken europa klein wirtsc...
3,": Die Krim ist ein heißer Tip, auch zum In...",1.0,True,krim heißer tip investieren muß bürger landes...
4,: Es muss in #Deutschland eine politische Kra...,1.0,True,deutschland politische kraft geben sozial sch...


In [44]:
df.shape

(9169, 4)

# vectorize

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
vec = CountVectorizer(ngram_range=(3,5), analyzer='char_wb', max_features=100000)

In [47]:
X_features = vec.fit_transform(df['cleaned_text'])

In [48]:
X_features.shape

(9169, 74355)

# initialize model

In [49]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_jobs=-1, n_estimators=200)

# Cross validation

In [50]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X_features, df['binary_label'], cv=5,
                        scoring=('f1', 'roc_auc', 'precision', 'recall'),
                        return_train_score=True)

In [51]:
sorted(scores.keys())

['fit_time',
 'score_time',
 'test_f1',
 'test_precision',
 'test_recall',
 'test_roc_auc',
 'train_f1',
 'train_precision',
 'train_recall',
 'train_roc_auc']

In [52]:
scores_df = pd.DataFrame(scores)
scores_df

Unnamed: 0,fit_time,score_time,test_f1,train_f1,test_roc_auc,train_roc_auc,test_precision,train_precision,test_recall,train_recall
0,11.690349,1.323895,0.908146,1.0,0.971496,1.0,0.906574,1.0,0.909722,1.0
1,11.9251,1.302445,0.928612,1.0,0.978066,1.0,0.916573,1.0,0.940972,1.0
2,11.712773,1.318674,0.915332,1.0,0.973986,1.0,0.904977,1.0,0.925926,1.0
3,11.818397,1.312382,0.936,1.0,0.982119,1.0,0.924379,1.0,0.947917,1.0
4,11.718653,1.314856,0.927355,1.0,0.980811,1.0,0.908788,1.0,0.946698,1.0


In [53]:
# metrics
scores_df[['test_f1', 'test_roc_auc', 'test_precision', 'test_recall']].mean()

test_f1           0.923089
test_roc_auc      0.977296
test_precision    0.912258
test_recall       0.934247
dtype: float64

---> Random Forest with char n-grams seems to be the best, over all in case of recall

# finally train and export

In [55]:
# split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_features, df['binary_label'], test_size=0.2, shuffle=True, stratify=df['binary_label'])

In [56]:
# train
clf = RandomForestClassifier(n_jobs=-1, n_estimators=200)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [58]:
# metrics
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test)

matrix = confusion_matrix(y_test, y_pred)
precision, recall, fscore, support = score(y_test, y_pred, pos_label=True, average='binary')

In [59]:
# show metrics
print('Recall: {} / Fscore: {} / Precision: {}'.format(round(recall, 3), 
                                                             round(fscore, 3),
                                                             round(precision, 3)))

Recall: 0.929 / Fscore: 0.919 / Precision: 0.908


In [62]:
# show confusion matrix
pd.DataFrame(
    matrix, 
    columns=["Predicted Negative", "Predicted Positive"],
    index=["Actually Negative", "Actually Positive"])

Unnamed: 0,Predicted Negative,Predicted Positive
Actually Negative,889,81
Actually Positive,61,803


In [64]:
# export
import pickle
pickle.dump(clf, open('exports/{}_model.pkl'.format('rf3'), 'wb'))

In [65]:
# export vectorizer
pickle.dump(vec, open('exports/char_vectorizer.pkl', 'wb'))