In [579]:
import pandas as pd
import os
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import TreebankWordTokenizer
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import warnings
from silence_tensorflow import silence_tensorflow
silence_tensorflow()

warnings.filterwarnings(action='ignore')

In [3]:
meta = pd.read_csv('annotations_metadata.csv')

In [4]:
meta

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label
0,12834217_1,572066,1346,0,noHate
1,12834217_2,572066,1346,0,noHate
2,12834217_3,572066,1346,0,noHate
3,12834217_4,572066,1346,0,hate
4,12834217_5,572066,1346,0,noHate
...,...,...,...,...,...
10939,33676864_5,734541,1388,0,noHate
10940,33677019_1,735154,1388,0,noHate
10941,33677019_2,735154,1388,0,noHate
10942,33677053_1,572266,1388,0,hate


In [52]:
train = pd.DataFrame(columns=['file_id', 'text'])
for(root, directories, files) in os.walk('all_files/'):
    for file in files:
        filepath = os.path.join(root, file)
        with open(filepath, 'rb') as f:
            new_data = pd.DataFrame({'file_id': [file[:-4]], 'text': [f.read().decode('utf-8')]})
            train = pd.concat([train, new_data], ignore_index=True)

In [53]:
train

Unnamed: 0,file_id,text
0,12834217_1,"As of March 13th , 2014 , the booklet had been..."
1,12834217_10,Thank you in advance. : ) Download the youtube...
2,12834217_2,In order to help increase the booklets downloa...
3,12834217_3,( Simply copy and paste the following text int...
4,12834217_4,Click below for a FREE download of a colorfull...
...,...,...
10939,33677015_1,Apparently he came to the conclusion that his ...
10940,33677019_1,Wish we at least had a Marine Le Pen to vote f...
10941,33677019_2,Its like the choices are white genocide candid...
10942,33677053_1,Why White people used to say that sex was a si...


In [54]:
train = pd.merge(meta, train, on='file_id')
train

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,text
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been..."
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...
...,...,...,...,...,...,...
10939,33676864_5,734541,1388,0,noHate,"Billy - `` That guy would n't leave me alone ,..."
10940,33677019_1,735154,1388,0,noHate,Wish we at least had a Marine Le Pen to vote f...
10941,33677019_2,735154,1388,0,noHate,Its like the choices are white genocide candid...
10942,33677053_1,572266,1388,0,hate,Why White people used to say that sex was a si...


In [55]:
train['user_id'].nunique()

2792

In [90]:
train.groupby('label').groups.keys()

dict_keys(['hate', 'idk/skip', 'noHate', 'relation'])

In [433]:
le = LabelEncoder()

train['label_en'] = train['label'].map({'noHate': 0, 'hate': 1, 'relation': 0, 'idk/skip': 0})

In [434]:
train

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,text,label_en
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been...",0
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...,0
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...,0
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...,1
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,0
...,...,...,...,...,...,...,...
10939,33676864_5,734541,1388,0,noHate,"Billy - `` That guy would n't leave me alone ,...",0
10940,33677019_1,735154,1388,0,noHate,Wish we at least had a Marine Le Pen to vote f...,0
10941,33677019_2,735154,1388,0,noHate,Its like the choices are white genocide candid...,0
10942,33677053_1,572266,1388,0,hate,Why White people used to say that sex was a si...,1


In [440]:
train = pd.merge(train, train.groupby('user_id').mean()['label_en'], on='user_id')
train

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,text,label_en_x,label_en_y
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been...",0,0.047619
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...,0,0.047619
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...,0,0.047619
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...,1,0.047619
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,0,0.047619
...,...,...,...,...,...,...,...,...
10939,33676864_1,734541,1388,0,noHate,This is what Pierre Trudeau gave Canadians once .,0,0.000000
10940,33676864_2,734541,1388,0,noHate,I ca n't find an actual picture of it anywhere...,0,0.000000
10941,33676864_3,734541,1388,0,noHate,Trudeau Saluteunknown ( -0.191 ) Another way t...,0,0.000000
10942,33676864_4,734541,1388,0,noHate,This is after a famous incident of former Prim...,0,0.000000


In [442]:
train = train.rename(columns={'label_en_x': 'label_en', 'label_en_y': 'hate_ratio'})
train

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,text,label_en,hate_ratio
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been...",0,0.047619
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...,0,0.047619
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...,0,0.047619
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...,1,0.047619
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,0,0.047619
...,...,...,...,...,...,...,...,...
10939,33676864_1,734541,1388,0,noHate,This is what Pierre Trudeau gave Canadians once .,0,0.000000
10940,33676864_2,734541,1388,0,noHate,I ca n't find an actual picture of it anywhere...,0,0.000000
10941,33676864_3,734541,1388,0,noHate,Trudeau Saluteunknown ( -0.191 ) Another way t...,0,0.000000
10942,33676864_4,734541,1388,0,noHate,This is after a famous incident of former Prim...,0,0.000000


In [531]:
def StaticModel_noexo(hidden_dim=64,
                      tweet_dim=50,
                      feature_dim=616,
                      num_users=500,
                      dropout=0.3,
                      **kwargs):
    '''Creates static prediction model. 
    'num_users' is the largest number of users can be taken (retweeters+followers). 
    Prediction is done every time step independently. 
    'feature_dim' is the unnormalized features (used previously for logreg).'''
    features = tf.keras.layers.Input(shape=(num_users, feature_dim))
    root_tweet1 = tf.keras.layers.Input(shape=(tweet_dim, ))
    norm_feature = tf.keras.layers.LayerNormalization()(features)
    int_feature = tf.keras.layers.Dense(hidden_dim,
                                        activation='relu')(norm_feature)
    int_feature = tf.keras.layers.Dropout(dropout)(int_feature)

    root_tweet = tf.keras.layers.RepeatVector(num_users)(root_tweet1)
    root_tweet = tf.keras.layers.Dense(hidden_dim,
                                        activation='relu')(root_tweet)

    print(root_tweet.shape)
    print(int_feature.shape)
    root_tweet = root_tweet[:,0,:]
    int_feature = int_feature[:,0,:]
    full_feature = tf.keras.layers.Concatenate(axis=-1)(
        [root_tweet, int_feature])
    norm_full_feature = tf.keras.layers.LayerNormalization()(full_feature)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(norm_full_feature)
    return tf.keras.models.Model([features, root_tweet1], out)

In [443]:
X = train[['user_id', 'subforum_id', 'hate_ratio','text']].values
y = train[['label_en']].values

In [445]:
opt=tf.keras.optimizers.Adam(learning_rate=0.01) # FOR STATIC
loss_function = tf.keras.losses.CategoricalCrossentropy()

In [75]:
i = 2  ## SET to 2 for static
n_batch = 16  ## SET to 16 for static
n_epoch = 10

In [566]:
num_users = len(X)

docs = X[:,3]

tokenizer = TreebankWordTokenizer()
docs = [tokenizer.tokenize(sentence) for sentence in docs]
tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
d2v = Doc2Vec(tagged_docs, vector_size=5)
d2v_features = d2v_all.dv.vectors

hr_features = X[:,2].astype(np.float32)
hr_features = hr_features.reshape(num_users,1)

In [564]:
train_model = StaticModel_noexo(
    num_users=num_users,
    feature_dim=1,
    tweet_dim=5)  #Static or non-exogenous variants called similarly.
train_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=opt)

_ = train_model.fit([hr_features, d2v_features],
                    y,
                    batch_size=n_batch,
                    epochs=n_epoch)

(None, 10944, 64)
(None, 10944, 64)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [567]:
y_predict = train_model.predict([hr_features, d2v_features])



In [568]:
y_predict_label = [[int(y > 0.5)] for y in y_predict]

In [594]:
m = tf.keras.metrics.Accuracy()
m.update_state(y, y_predict_label)
print("ACC = {}\n".format(m.result().numpy()))

m = tf.keras.metrics.AUC()
m.update_state(y, y_predict_label)
m.result().numpy()
print("AUC = {}\n".format(m.result().numpy()))

ACC = 0.8905336260795593

AUC = 0.49989742040634155



In [570]:
features = tf.concat([hr_features, d2v_features], -1)

In [593]:
# Logistic Reg
clf = LogisticRegression(random_state=0).fit(features, y)
pred_labels = clf.predict(features)
print("ACC = {}\n".format(clf.score(features, y)))
print("AUC = {}\n".format(roc_auc_score(y, pred_labels)))

# Decision Trees
clf = tree.DecisionTreeClassifier(random_state=0,
                                  max_depth=10).fit(features,
                                                    y)
pred_labels = clf.predict(features)
print("ACC = {}\n".format(clf.score(features, y)))
print("AUC = {}\n".format(roc_auc_score(y, pred_labels)))

# RandomForest
clf = RandomForestClassifier(n_estimators=50).fit(features, y)
pred_labels = clf.predict(features)
print("ACC = {}\n".format(clf.score(features, y)))
print("AUC = {}\n".format(roc_auc_score(y, pred_labels)))

ACC = 0.9156615497076024

AUC = 0.7010543625791347

ACC = 0.9399671052631579

AUC = 0.7425725266687069

ACC = 0.9998172514619883

AUC = 0.9991638795986622

