In [1]:
import pandas as pd

In [3]:
df = pd.read_csv(r'../../1_Data_Cleaning/Dataset_Facebook_Bretschneider_Peters_2016/comments.csv')

In [7]:
# generate label from valence
def computeLabel(val):
    value = val.split(',') if isinstance(val, str) else False
    if(value):
        return int(sum(map(int,value)) / len(value))
    else:
        return 0
df['label'] = df['valence'].apply(lambda x: computeLabel(x))

In [8]:
# generate binary label from label
df['binary_label'] = df['valence'].apply(lambda x: computeLabel(x) > 0)

In [9]:
df.head()

Unnamed: 0,comment_id,comment,author,valence,expert_ids,target_type,post_id,post,label,binary_label
0,1,gleich an die wand stellen und erschiessen..,d766dfb93914e8da,22.0,13.0,22.0,1,tja .... das Ergebniss ungebremsten Zuzug ...,2,True
1,2,nicht dass ich der Grundbotschaft dieses Post'...,6e49f84e2a5a1f05,,,,1,tja .... das Ergebniss ungebremsten Zuzug ...,0,False
2,3,"Das mit dem ""an die Wand stellen und erschiessen""",6ab1ef4615c45a79,,,,1,tja .... das Ergebniss ungebremsten Zuzug ...,0,False
3,4,"Seit dem ""an die Wand stellen und erschiessen""...",6ab1ef4615c45a79,,,,1,tja .... das Ergebniss ungebremsten Zuzug ...,0,False
4,5,Ja ja die Kriminelle Heimatpartei FPÖ von Kind...,03c34eadeee054d7,,,,1,tja .... das Ergebniss ungebremsten Zuzug ...,0,False


In [13]:
from model_helpers import clean_input as ci

In [20]:
df['comment_cleaned'] = df['comment'].apply(ci)

In [21]:
df_hs = df.filter(['comment', 'comment_cleaned', 'label', 'binary_label'], axis=1)

In [23]:
df_hs.head(20)

Unnamed: 0,comment,comment_cleaned,label,binary_label
0,gleich an die wand stellen und erschiessen..,gleich wand stellen erschiessen,2,True
1,nicht dass ich der Grundbotschaft dieses Post'...,das grundbotschaft post widersprechen 1 rechts...,0,False
2,"Das mit dem ""an die Wand stellen und erschiessen""",wand stellen erschiessen,0,False
3,"Seit dem ""an die Wand stellen und erschiessen""...",seit wand stellen erschiessen deutschland etwa...,0,False
4,Ja ja die Kriminelle Heimatpartei FPÖ von Kind...,ja ja kriminelle heimatpartei fpö kinderpornog...,0,False
5,Ihr seids empfindlich . . Is beste wos uns pas...,seids empfindlich is beste wos passieren is he...,2,True
6,"Mei bitte, sie sollen sich alle gegenseitig ab...",mei bitte sollen gegenseitig abstechen problem...,2,True
7,Wahnsinn.... :-(,wahnsinn,0,False
8,Alles nur traumatisierte Kindlein. Die von de...,traumatisierte kindlein grünen bezahlten anwäl...,0,False
9,Fachkräfte bei der Arbeit. Ein Hirnchirurg und...,fachkräfte arbeit hirnchirurg atomphysiker,0,False


In [26]:
from sklearn.model_selection import train_test_split

In [37]:
# split virgin dataset
X_train, X_test, y_train, y_test = train_test_split(df_hs['comment_cleaned'], df_hs['binary_label'], test_size=0.2)

In [38]:
# vectorize training data
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()

X_train_vect = vect.fit_transform(X_train)

In [39]:
counts = df_hs.label.value_counts()
print(counts)

0    5162
2     374
1     300
Name: label, dtype: int64


In [40]:
counts = df_hs.binary_label.value_counts()
print(counts)

False    5162
True      674
Name: binary_label, dtype: int64


--> Labels sind sehr ungleich verteilt. Ausgleich (bei Trainingsdaten) mit SMOTE

## Balance Dataset

In [44]:
# install imblearn
import sys
!{sys.executable} -m pip install imblearn



In [41]:
from imblearn.over_sampling import SMOTE

sm = SMOTE()

In [42]:
X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)

In [43]:
import numpy as np
unique, counts = np.unique(y_train_res, return_counts=True)
print(list(zip(unique, counts)))

[(False, 4115), (True, 4115)]


# Train

In [46]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)

In [48]:
rf.fit(X_train_res, y_train_res)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [49]:
rf.score(X_train_res, y_train_res)

0.8705953827460511

## Test

In [50]:
X_test_vect = vect.transform(X_test)

y_pred = rf.predict(X_test_vect)

y_pred

array([False, False, False, ..., False, False, False])

In [51]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred) * 100))
print("\nCOnfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 89.04%

F1 Score: 32.63

COnfusion Matrix:
 [[1009   38]
 [  90   31]]


In [57]:
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(y_test, y_pred, pos_label=True, average='binary')
                                           
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), 
                                                         round(recall, 3), 
                                                         round((y_pred==y_test).sum() / len(y_pred))))

Precision: 0.436 / Recall: 0.262 / Accuracy: 1.0


In [53]:
# a monte carlo method

from sklearn.model_selection import ShuffleSplit

X = df_hs.comment_cleaned
y = df.binary_label

ss = ShuffleSplit(n_splits=10, test_size=0.2)
sm = SMOTE()

accs = []
f1s = []
cms = []

for train_index, test_index in ss.split(X):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    
    # Oversample
    X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)
    
    # Fit Naive Bayes on the vectorized X with y train labels, 
    # then predict new y labels using X test
    rf.fit(X_train_res, y_train_res)
    y_pred = rf.predict(X_test_vect)
    
    # Determine test set accuracy and f1 score on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))
    
print("\nAverage accuracy across folds: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}%".format(sum(f1s) / len(f1s) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(sum(cms) / len(cms)))


Average accuracy across folds: 88.66%

Average F1 score across folds: 34.35%

Average Confusion Matrix across folds: 
 [[1000.7   35.5]
 [  96.9   34.9]]
