In [1]:
import pandas as pd

# read in

In [2]:
df = pd.read_csv('../../2_Feature_Engineering/export/randomized_balanced_dataset.csv')
df.head()

Unnamed: 0,raw_text,severity,binary_label
0,"Ich warte schon darauf, dass man die Terrorist...",0.0,False
1,Holt Björn Höcke ins Boot vielleicht haben wir...,0.0,False
2,"Deutscher, turkmenischer Abstammung bitte. Was...",0.0,False
3,"Wer glaubt die gehen wieder , ist einfach nur ...",0.0,False
4,Man wünscht sich eine schlagkräftige Bürgerweh...,0.0,False


# clean

In [3]:
import sys
sys.path.insert(0, r'..\..\0_common')
from model_helpers import clean_all as ci

In [4]:
df['cleaned_text'] = df['raw_text'].apply(ci)
df.head()

Unnamed: 0,raw_text,severity,binary_label,cleaned_text
0,"Ich warte schon darauf, dass man die Terrorist...",0.0,False,warte schon darauf das terroristen opfern mach...
1,Holt Björn Höcke ins Boot vielleicht haben wir...,0.0,False,holt björn höcke boot vielleicht letzte chance...
2,"Deutscher, turkmenischer Abstammung bitte. Was...",0.0,False,deutscher turkmenischer abstammung bitte graue...
3,"Wer glaubt die gehen wieder , ist einfach nur ...",0.0,False,wer glaubt gehen einfach blauäugig
4,Man wünscht sich eine schlagkräftige Bürgerweh...,0.0,False,wünscht schlagkräftige bürgerwehr sophienhof p...


# vectorize

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
vec = CountVectorizer(ngram_range=(2,5), analyzer='char_wb', stop_words='german', max_features=100000)

In [7]:
X_features = vec.fit_transform(df['cleaned_text'])

In [8]:
X_features.shape

(1828, 45712)

# main function

In [9]:
def split_train_score(features, labels, clf):
    
    # split
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, shuffle=True)
    
    #train    
    clf.fit(X_train, y_train)
    
    # score
    from sklearn.metrics import precision_recall_fscore_support as score
    y_pred = clf.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label=True, average='binary')
    
    # confusion matrix
    from sklearn.metrics import confusion_matrix
    matrix = confusion_matrix(y_test, y_pred)
    m_df = pd.DataFrame(
    matrix, 
    columns=["Negatives", "Positives"],
    index=["Negatives", "Positives"])
    
    return fscore, precision, recall, m_df

# Evaluate Function

In [10]:
scores = {}
matrices = {}

In [11]:
def evaluate_algo(key, clf):
    scores[key] = []
    matrices[key] = []

    for i in range(10):
        fscore, precision, recall, confusion_matrix = split_train_score(X_features, df['binary_label'], clf)
        scores[key].append([fscore, precision, recall])
        matrices[key].append(confusion_matrix)

    scores_df = pd.DataFrame(scores[key], columns=['fscore', 'precision', 'recall']) 
    fscore_avg = round(scores_df['fscore'].mean(),3)
    prec_avg = round(scores_df['precision'].mean(),3)
    rec_avg = round(scores_df['recall'].mean(),3)
    
    '''
    print(key.upper())
    print('---')
    print('Fscore: ', fscore_avg)
    print('Min/Max: {} / {} '.format(round(scores_df['fscore'].max(), 3), round(scores_df['fscore'].min(),3)))
    '''
    
    return fscore_avg, prec_avg, rec_avg

# Evaluate

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

algos = {
    'lr': LogisticRegression(),
    'dct': DecisionTreeClassifier(),
    'bnb': BernoulliNB(),
    'mnb': MultinomialNB(),
    'svm': SGDClassifier(),
    'rf': RandomForestClassifier()
}

In [13]:
for i in range(3):    
    results = {}

    for index, clf in algos.items():
        fscore, precision, recall = evaluate_algo(index, clf)
        results[index] = [fscore, precision, recall]

    result_df = pd.DataFrame.from_dict(results, orient = 'index', columns=['fscore', 'precision', 'recall']).sort_values('fscore', ascending=False)
    print(result_df)



     fscore  precision  recall
lr    0.728      0.765   0.695
mnb   0.727      0.709   0.748
dct   0.701      0.714   0.691
svm   0.681      0.654   0.723
rf    0.668      0.762   0.597
bnb   0.615      0.769   0.512




     fscore  precision  recall
lr    0.736      0.758   0.716
mnb   0.717      0.695   0.742
dct   0.712      0.725   0.702
rf    0.667      0.776   0.585
svm   0.666      0.683   0.665
bnb   0.620      0.773   0.519




     fscore  precision  recall
lr    0.746      0.778   0.717
mnb   0.720      0.710   0.733
dct   0.710      0.726   0.695
svm   0.672      0.674   0.694
rf    0.661      0.780   0.575
bnb   0.591      0.754   0.488
