In [20]:
import pandas as pd

# read in

In [21]:
# Load the dataset exported by the feature-engineering step. The path is
# relative, so it resolves against the notebook's working directory.
df = pd.read_csv('../../2_Feature_Engineering/export/v3_randomized_balanced_dataset.csv')
# Preview the first rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,raw_text,severity,binary_label,%_swear_words
0,"Und pinkelt dann vorne über die Brille, die Sa...",1.0,True,9.090909
1,der menschenschrott darf alles,1.0,True,0.0
2,"Das gute daran ,dieses pack legt sich selber um",1.0,True,0.0
3,Und solches PACK soll Integriert werden ? Na p...,0.5,True,0.0
4,Bei der geistigen Umnachtung der deutschen Bun...,0.5,True,0.0


# clean

In [22]:
import sys
# Make the shared helper package importable. Forward slashes are used so the
# relative path resolves on Windows and POSIX alike (a backslash is not a path
# separator on POSIX, so a backslash-separated literal only works on Windows).
sys.path.insert(0, '../../0_common')
from model_helpers import clean_all as ci

In [23]:
# Run the shared cleaning helper (model_helpers.clean_all, imported as `ci`)
# over every raw text and keep the result in a new column.
# NOTE(review): judging by the preview, the helper lowercases and drops
# punctuation and stop words — confirm against 0_common/model_helpers.
df['cleaned_text'] = df['raw_text'].apply(ci)
df.head()

Unnamed: 0,raw_text,severity,binary_label,%_swear_words,cleaned_text
0,"Und pinkelt dann vorne über die Brille, die Sa...",1.0,True,9.090909,pinkelt vorne brille sau
1,der menschenschrott darf alles,1.0,True,0.0,menschenschrott darf
2,"Das gute daran ,dieses pack legt sich selber um",1.0,True,0.0,gute daran pack legt selber
3,Und solches PACK soll Integriert werden ? Na p...,0.5,True,0.0,pack integriert na prost mahlzeit
4,Bei der geistigen Umnachtung der deutschen Bun...,0.5,True,0.0,geistigen umnachtung deutschen bundeskanzlerin...


# vectorize

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
# Character n-grams (2 to 5 characters, restricted to word boundaries), capped
# at the 10,000 most frequent features.
# `stop_words` is intentionally not passed: scikit-learn applies it only when
# analyzer == 'word', and the only string it accepts is 'english', so a value
# such as 'german' has no effect here and is rejected by parameter validation
# in newer scikit-learn releases.
vec = CountVectorizer(ngram_range=(2, 5), analyzer='char_wb', max_features=10000)

In [26]:
# Learn the n-gram vocabulary from the cleaned texts and build the count
# matrix in one step (one row per text, one column per n-gram).
X_features = vec.fit_transform(df['cleaned_text'])

In [27]:
# (number of texts, number of n-gram features); the second value is bounded
# by the vectorizer's max_features.
X_features.shape

(1615, 10000)

# main function

In [28]:
def split_train_score(features, labels, clf, test_size=0.2, random_state=None):
    """Hold out a random test split, fit ``clf`` on the rest and score it.

    Parameters
    ----------
    features : array-like or sparse matrix
        Feature matrix with one row per sample.
    labels : array-like
        Boolean target per sample; ``True`` is treated as the positive class.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on the training part of the split.
    test_size : float, default 0.2
        Share of the samples held out for scoring (forwarded to
        ``train_test_split``).
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the split
        random on every call; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``(fscore, precision, recall, m_df)``. The three scores refer to the
        positive class. ``m_df`` is the 2x2 confusion matrix as a DataFrame
        whose rows are the actual classes and whose columns are the predicted
        classes, both ordered negative first.
    """
    # Imports stay local so the function remains self-contained.
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import precision_recall_fscore_support as score

    # split
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    # train
    clf.fit(X_train, y_train)

    # score (the per-class support is not needed by callers)
    y_pred = clf.predict(X_test)
    precision, recall, fscore, _support = score(
        y_test, y_pred, pos_label=True, average='binary'
    )

    # confusion matrix
    # Pinning the label order guarantees a 2x2 result even when the held-out
    # part (and the predictions) happen to contain a single class; without it
    # the matrix collapses to 1x1 and the DataFrame construction below fails.
    matrix = confusion_matrix(y_test, y_pred, labels=[False, True])
    m_df = pd.DataFrame(
        matrix,
        columns=["Negatives", "Positives"],
        index=["Negatives", "Positives"],
    )

    return fscore, precision, recall, m_df

# Evaluate Function

In [29]:
# Per-algorithm result stores, filled by evaluate_algo:
# algorithm key -> list of [fscore, precision, recall], one entry per run.
scores = {}
# algorithm key -> list of confusion-matrix DataFrames, one entry per run.
matrices = {}

In [30]:
def evaluate_algo(key, clf, n_runs=10, verbose=False):
    """Score ``clf`` over several random splits and average the results.

    Each run calls ``split_train_score`` on the module-level ``X_features``
    and ``df['binary_label']``. Per-run results are stored in the module-level
    ``scores[key]`` and ``matrices[key]``; earlier entries for ``key`` are
    replaced.

    Parameters
    ----------
    key : str
        Name under which the per-run results are stored.
    clf : estimator
        Classifier handed to ``split_train_score``; refitted on every run.
    n_runs : int, default 10
        Number of split/train/score repetitions. Must be at least 1.
    verbose : bool, default False
        When true, print the mean F-score and its minimum / maximum.

    Returns
    -------
    tuple
        ``(fscore_avg, prec_avg, rec_avg)``: means over the runs, rounded to
        three decimals.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (the averages would be undefined).
    """
    if n_runs < 1:
        raise ValueError('n_runs must be at least 1, got {}'.format(n_runs))

    scores[key] = []
    matrices[key] = []

    for _run in range(n_runs):
        # `run_matrix` avoids shadowing sklearn's confusion_matrix function name.
        fscore, precision, recall, run_matrix = split_train_score(X_features, df['binary_label'], clf)
        scores[key].append([fscore, precision, recall])
        matrices[key].append(run_matrix)

    scores_df = pd.DataFrame(scores[key], columns=['fscore', 'precision', 'recall'])
    fscore_avg = round(scores_df['fscore'].mean(), 3)
    prec_avg = round(scores_df['precision'].mean(), 3)
    rec_avg = round(scores_df['recall'].mean(), 3)

    if verbose:
        print(key.upper())
        print('---')
        print('Fscore: ', fscore_avg)
        # Values are passed in the same order as the label: minimum, then maximum.
        print('Min/Max: {} / {} '.format(round(scores_df['fscore'].min(), 3), round(scores_df['fscore'].max(), 3)))

    return fscore_avg, prec_avg, rec_avg

# Evaluate

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, all with library-default settings, keyed by the short
# name used in the result tables.
algos = dict(
    lr=LogisticRegression(),
    dct=DecisionTreeClassifier(),
    bnb=BernoulliNB(),
    mnb=MultinomialNB(),
    svm=SGDClassifier(),
    rf=RandomForestClassifier(),
)

In [32]:
# Run the full benchmark three times; each pass prints one ranking table.
for repetition in range(3):
    # algorithm key -> [fscore, precision, recall] averaged by evaluate_algo
    results = {
        name: list(evaluate_algo(name, estimator))
        for name, estimator in algos.items()
    }

    # One row per algorithm, best average F-score first.
    result_df = (
        pd.DataFrame.from_dict(results, orient='index', columns=['fscore', 'precision', 'recall'])
        .sort_values('fscore', ascending=False)
    )
    print(result_df)



     fscore  precision  recall
lr    0.718      0.745   0.695
mnb   0.705      0.694   0.715
dct   0.677      0.700   0.657
svm   0.676      0.635   0.751
rf    0.673      0.751   0.611
bnb   0.643      0.700   0.597




     fscore  precision  recall
lr    0.716      0.744   0.690
mnb   0.709      0.695   0.725
rf    0.684      0.776   0.612
dct   0.677      0.683   0.672
svm   0.658      0.656   0.673
bnb   0.642      0.711   0.588




     fscore  precision  recall
lr    0.745      0.773   0.720
mnb   0.706      0.695   0.718
dct   0.695      0.699   0.693
rf    0.668      0.781   0.586
svm   0.660      0.623   0.720
bnb   0.657      0.724   0.604
