In [65]:
import pandas as pd

# read in

In [66]:
# Location of the dataset exported by the feature-engineering step.
DATASET_CSV = '../../2_Feature_Engineering/export/randomized_balanced_dataset.csv'

# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV — confirm whether it is needed.
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,raw_text,severity,binary_label
0,"Ich warte schon darauf, dass man die Terrorist...",0.0,False
1,Holt Björn Höcke ins Boot vielleicht haben wir...,0.0,False
2,"Deutscher, turkmenischer Abstammung bitte. Was...",0.0,False
3,"Wer glaubt die gehen wieder , ist einfach nur ...",0.0,False
4,Man wünscht sich eine schlagkräftige Bürgerweh...,0.0,False


# clean

In [67]:
import sys

# Directory holding the shared helper module, relative to this notebook.
# Forward slashes resolve on Windows as well as on Linux/macOS, whereas the
# previous backslash form only resolved on Windows.
COMMON_DIR = '../../0_common'

# Guard against stacking duplicate entries when the cell is re-run.
if COMMON_DIR not in sys.path:
    sys.path.insert(0, COMMON_DIR)

from model_helpers import clean_all as ci

In [68]:
# Run the shared cleaning helper over every raw comment and keep the result
# next to the original text.
# NOTE(review): judging by the preview, `ci` lower-cases the text and strips
# punctuation and stop words — confirm against model_helpers.clean_all.
df = df.assign(cleaned_text=df['raw_text'].apply(ci))
df.head()

Unnamed: 0,raw_text,severity,binary_label,cleaned_text
0,"Ich warte schon darauf, dass man die Terrorist...",0.0,False,warte schon darauf das terroristen opfern mach...
1,Holt Björn Höcke ins Boot vielleicht haben wir...,0.0,False,holt björn höcke boot vielleicht letzte chance...
2,"Deutscher, turkmenischer Abstammung bitte. Was...",0.0,False,deutscher turkmenischer abstammung bitte graue...
3,"Wer glaubt die gehen wieder , ist einfach nur ...",0.0,False,wer glaubt gehen einfach blauäugig
4,Man wünscht sich eine schlagkräftige Bürgerweh...,0.0,False,wünscht schlagkräftige bürgerwehr sophienhof p...


# vectorize

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [70]:
# TF-IDF configuration, one setting per line.
vec = TfidfVectorizer(
    max_df=0.85,          # drop terms that occur in more than 85% of the documents
    ngram_range=(1, 2),   # use unigrams and bigrams
    max_features=30000,   # upper bound on the vocabulary size
)

In [71]:
# Fit the vectorizer on the cleaned comments and encode them in a single pass.
X_features = vec.fit_transform(df['cleaned_text'])

In [72]:
# Dimensions of the TF-IDF matrix: (documents, vocabulary terms).
X_features.shape

(1828, 25593)

# Main function: split, train and score a classifier

In [73]:
def split_train_score(features, labels, clf, test_size=0.2, random_state=None):
    """Fit ``clf`` on a random train split and score it on the held-out split.

    Parameters
    ----------
    features : array-like or sparse matrix
        Feature matrix with one row per sample.
    labels : array-like
        Boolean targets; ``True`` is scored as the positive class.
    clf : estimator
        Any object with ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    test_size : float, default 0.2
        Fraction of the samples held out for scoring.
    random_state : int or None, default None
        Seed passed to ``train_test_split``. ``None`` (the default) keeps the
        previous behaviour of a different shuffle on every call.

    Returns
    -------
    tuple
        ``(fscore, precision, recall, m_df)``. The first three are the binary
        metrics for the positive class; ``m_df`` is the confusion matrix as a
        2x2 DataFrame (rows: true class, columns: predicted class).
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import precision_recall_fscore_support as score
    from sklearn.model_selection import train_test_split

    # split
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    # train
    clf.fit(X_train, y_train)

    # score
    y_pred = clf.predict(X_test)
    precision, recall, fscore, _support = score(
        y_test, y_pred, pos_label=True, average='binary')

    # confusion matrix
    # Pin the label order so the matrix is always 2x2 (negatives first), even
    # when one class is missing from both y_test and y_pred; otherwise the
    # DataFrame construction below would fail on a shape mismatch.
    matrix = confusion_matrix(y_test, y_pred, labels=[False, True])
    m_df = pd.DataFrame(
        matrix,
        columns=["Negatives", "Positives"],
        index=["Negatives", "Positives"],
    )

    return fscore, precision, recall, m_df

# Evaluation function: average the scores over repeated random splits

In [74]:
# Module-level result stores, filled by evaluate_algo():
#   scores[key]   -> list of [fscore, precision, recall], one entry per run
#   matrices[key] -> list of confusion-matrix DataFrames, one entry per run
scores, matrices = {}, {}

In [75]:
def evaluate_algo(key, clf, n_runs=10, verbose=False):
    """Score ``clf`` over repeated random splits and return the mean metrics.

    Each run calls ``split_train_score`` on the dense TF-IDF matrix and the
    ``binary_label`` column. The per-run results are stored in the module-level
    ``scores[key]`` and ``matrices[key]`` lists, which are reset on every call.

    Parameters
    ----------
    key : str
        Name under which the per-run results are stored.
    clf : estimator
        Classifier handed to ``split_train_score``; it is re-fitted on each run.
    n_runs : int, default 10
        Number of random train/test splits to average over.
    verbose : bool, default False
        If true, print the mean F-score and its min/max over the runs.

    Returns
    -------
    tuple
        ``(fscore_avg, prec_avg, rec_avg)``, each rounded to three decimals.
    """
    # Densify once: the conversion does not depend on the run, and repeating
    # it inside the loop rebuilds the full dense matrix every iteration.
    dense_features = X_features.toarray()
    labels = df['binary_label']

    scores[key] = []
    matrices[key] = []

    for _ in range(n_runs):
        fscore, precision, recall, matrix_df = split_train_score(dense_features, labels, clf)
        scores[key].append([fscore, precision, recall])
        matrices[key].append(matrix_df)

    scores_df = pd.DataFrame(scores[key], columns=['fscore', 'precision', 'recall'])
    fscore_avg = round(scores_df['fscore'].mean(), 3)
    prec_avg = round(scores_df['precision'].mean(), 3)
    rec_avg = round(scores_df['recall'].mean(), 3)

    if verbose:
        print(key.upper())
        print('---')
        print('Fscore: ', fscore_avg)
        # Minimum first, then maximum, matching the 'Min/Max' label.
        print('Min/Max: {} / {} '.format(
            round(scores_df['fscore'].min(), 3),
            round(scores_df['fscore'].max(), 3)))

    return fscore_avg, prec_avg, rec_avg

# Evaluate

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, keyed by the short name used in the result tables.
algos = dict(
    lr=LogisticRegression(solver='lbfgs'),
    dct=DecisionTreeClassifier(),
    bnb=BernoulliNB(),
    mnb=MultinomialNB(),
    # NOTE(review): labelled 'svm' — presumably relies on SGDClassifier's
    # default hinge loss (linear SVM); confirm. Runs a fixed 5 epochs because
    # tol=None disables the early-stopping criterion.
    svm=SGDClassifier(max_iter=5, tol=None),
    rf=RandomForestClassifier(n_estimators=100),
)

In [77]:
# Repeat the full benchmark three times to see how stable the ranking is.
for repetition in range(3):
    # Mean [fscore, precision, recall] per algorithm for this repetition.
    results = {
        name: list(evaluate_algo(name, estimator))
        for name, estimator in algos.items()
    }

    metric_columns = ['fscore', 'precision', 'recall']
    result_df = pd.DataFrame.from_dict(results, orient='index', columns=metric_columns)
    result_df = result_df.sort_values('fscore', ascending=False)
    print(result_df)



     fscore  precision  recall
dct   0.626      0.718   0.557
rf    0.566      0.813   0.446
svm   0.552      0.746   0.552
mnb   0.487      0.828   0.347
lr    0.469      0.891   0.319
bnb   0.281      0.770   0.173




     fscore  precision  recall
dct   0.626      0.713   0.558
svm   0.609      0.705   0.605
rf    0.533      0.887   0.381
mnb   0.521      0.832   0.380
lr    0.486      0.882   0.337
bnb   0.223      0.706   0.133




     fscore  precision  recall
dct   0.642      0.727   0.578
svm   0.577      0.752   0.570
rf    0.538      0.837   0.411
lr    0.521      0.890   0.370
mnb   0.514      0.859   0.368
bnb   0.253      0.723   0.154


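--> Conclusion: compared with the TF-IDF results above, the count vectorizer (results not shown in this notebook) performs better.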