In [1]:
import pandas as pd

# Read in the data

In [2]:
# Load the combined, labelled corpus exported by the feature-engineering step.
# The path is relative, so it depends on the notebook's working directory.
df = pd.read_csv('../../2_Feature_Engineering/export/combined_polly_bretschneider_iwg.csv')
# Preview the first rows to check the expected columns are present.
df.head()

Unnamed: 0,raw_text,severity,binary_label
0,ist ekelhaft bei ihren Klos kannst du nur mit...,0.0,False
1,Bevor Sie mit Jud...,1.0,True
2,Und de Amis freut es wenn die Heuschrecken Eur...,0.0,False
3,": Die Krim ist ein heißer Tip, auch zum In...",1.0,True
4,: Es muss in #Deutschland eine politische Kra...,1.0,True


# Clean the text

In [3]:
import sys
from pathlib import Path

# Put the shared helper directory on the import path. The path is assembled
# from components rather than written as a backslash-separated literal, so the
# cell resolves the same directory on POSIX systems as it does on Windows.
sys.path.insert(0, str(Path('..') / '..' / '0_common'))
from model_helpers import clean_all as ci

In [4]:
# Run the shared cleaning helper (clean_all, imported as `ci`) over every raw
# text and store the result in a new column next to the original.
df['cleaned_text'] = df['raw_text'].apply(ci)
df.head()

Unnamed: 0,raw_text,severity,binary_label,cleaned_text
0,ist ekelhaft bei ihren Klos kannst du nur mit...,0.0,False,ekelhaft klos kannst gummistiefel gehen stinkt...
1,Bevor Sie mit Jud...,1.0,True,bevor judenstern markiert worden juden ganze ...
2,Und de Amis freut es wenn die Heuschrecken Eur...,0.0,False,de amis freut heuschrecken europa klein wirtsc...
3,": Die Krim ist ein heißer Tip, auch zum In...",1.0,True,krim heißer tip investieren muß bürger landes...
4,: Es muss in #Deutschland eine politische Kra...,1.0,True,deutschland politische kraft geben sozial sch...


In [5]:
# Sanity check: (rows, columns) of the frame after adding cleaned_text.
df.shape

(9169, 4)

# Vectorize (TF-IDF)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# TF-IDF over word uni-, bi- and tri-grams, with the vocabulary capped at
# 10,000 features.
vec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=10000,
)

In [10]:
# Learn the vocabulary / IDF weights and build the document-term matrix.
# NOTE(review): the vectorizer is fitted on the full corpus here, before
# split_train_score() carves out a test set, so the held-out rows influence
# the vocabulary and IDF weights. Consider fitting inside the split (e.g. with
# a scikit-learn Pipeline) to avoid optimistic scores.
X_features = vec.fit_transform(df['cleaned_text'])

In [12]:
# Sanity check: (documents, features) of the TF-IDF matrix.
X_features.shape

(9169, 10000)

# Main function: split, train, score

In [13]:
def split_train_score(features, labels, clf, test_size=0.2, random_state=None):
    """Hold out a random test set, fit ``clf`` on the rest and score it.

    Parameters
    ----------
    features : array-like or sparse matrix
        One row per sample.
    labels : array-like of bool
        Target per sample; ``True`` is treated as the positive class.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is (re)fitted
        in place on the training part of the split.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``. The default is the value that was
        previously hard-coded.
    random_state : int or None, default None
        Seed for the shuffled split. ``None`` keeps the previous behaviour
        (a different split on every call); pass an int for reproducible runs.

    Returns
    -------
    tuple
        ``(fscore, precision, recall, m_df)`` where the first three are the
        binary scores for the positive class and ``m_df`` is the 2x2 confusion
        matrix as a labelled DataFrame (rows: true class, columns: predicted).
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import precision_recall_fscore_support as score
    from sklearn.model_selection import train_test_split

    # split
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    # train
    clf.fit(X_train, y_train)

    # score
    y_pred = clf.predict(X_test)
    precision, recall, fscore, _support = score(
        y_test, y_pred, pos_label=True, average='binary'
    )

    # confusion matrix
    # Pin the label order so the matrix is always 2x2 (negatives first), even
    # when one class is missing from both y_test and y_pred. Without this the
    # matrix can shrink and no longer fit the two row/column labels below.
    matrix = confusion_matrix(y_test, y_pred, labels=[False, True])
    m_df = pd.DataFrame(
        matrix,
        columns=["Negatives", "Positives"],
        index=["Negatives", "Positives"],
    )

    return fscore, precision, recall, m_df

# Evaluate Function

In [14]:
# Per-algorithm accumulators filled by evaluate_algo(): one list of
# [fscore, precision, recall] rows and one list of confusion matrices per key.
scores, matrices = {}, {}

In [15]:
def evaluate_algo(key, clf, n_runs=10):
    """Score ``clf`` over repeated random splits and return the mean metrics.

    Each run calls ``split_train_score`` on the notebook-level ``X_features``
    and ``df['binary_label']``. The per-run results are stored in the
    notebook-level ``scores[key]`` and ``matrices[key]`` lists, which are
    reset at the start of every call.

    Parameters
    ----------
    key : hashable
        Label under which the per-run results are stored.
    clf : estimator
        Classifier handed to ``split_train_score`` on every run.
    n_runs : int, default 10
        Number of independent split/train/score repetitions. The default is
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(fscore_avg, prec_avg, rec_avg)``, each the mean over all runs
        rounded to three decimals.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (there would be nothing to average).
    """
    if n_runs < 1:
        raise ValueError('n_runs must be at least 1')

    scores[key] = []
    matrices[key] = []

    for _ in range(n_runs):
        fscore, precision, recall, matrix_df = split_train_score(
            X_features, df['binary_label'], clf
        )
        scores[key].append([fscore, precision, recall])
        matrices[key].append(matrix_df)

    scores_df = pd.DataFrame(scores[key], columns=['fscore', 'precision', 'recall'])
    fscore_avg = round(scores_df['fscore'].mean(), 3)
    prec_avg = round(scores_df['precision'].mean(), 3)
    rec_avg = round(scores_df['recall'].mean(), 3)

    return fscore_avg, prec_avg, rec_avg

# Evaluate

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, all constructed with scikit-learn's defaults and keyed
# by the short label used in the result tables. 'svm' is an SGDClassifier,
# whose default hinge loss gives a linear SVM.
algos = dict(
    lr=LogisticRegression(),
    dct=DecisionTreeClassifier(),
    bnb=BernoulliNB(),
    mnb=MultinomialNB(),
    svm=SGDClassifier(),
    rf=RandomForestClassifier(),
)

In [17]:
# Repeat the whole benchmark three times; each pass prints one table of mean
# scores per algorithm, best f-score first.
for run in range(3):
    results = {
        name: list(evaluate_algo(name, model))
        for name, model in algos.items()
    }

    unsorted_df = pd.DataFrame.from_dict(
        results,
        orient='index',
        columns=['fscore', 'precision', 'recall'],
    )
    result_df = unsorted_df.sort_values('fscore', ascending=False)
    print(result_df)



     fscore  precision  recall
rf    0.881      0.906   0.857
dct   0.879      0.880   0.878
svm   0.853      0.872   0.836
lr    0.835      0.858   0.814
mnb   0.817      0.867   0.773
bnb   0.752      0.946   0.623




     fscore  precision  recall
rf    0.883      0.912   0.857
dct   0.877      0.879   0.875
svm   0.856      0.869   0.844
lr    0.838      0.863   0.815
mnb   0.816      0.861   0.776
bnb   0.742      0.946   0.611




     fscore  precision  recall
dct   0.881      0.882   0.881
rf    0.877      0.901   0.855
svm   0.856      0.874   0.840
lr    0.829      0.855   0.805
mnb   0.815      0.861   0.774
bnb   0.742      0.944   0.611
