In [1]:
import pandas as pd
import numpy as np
import random
import re
import time
import sys

import nltk
from nltk.tokenize import word_tokenize

import gensim.downloader as api
import gensim
from gensim.models import Word2Vec, KeyedVectors

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
import tensorflow_text as text

from keras.models import Sequential, Model
from keras.layers import LSTM, Embedding, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten, InputLayer, Input, Dropout, Concatenate, GRU
from keras.callbacks import EarlyStopping


import joblib

import os

from imblearn.over_sampling import RandomOverSampler

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the "_p" and "_r" CSV variants of every corpus, keyed by corpus name.
# (Presumably "_p" = preprocessed and "_r" = resampled -- confirm against the data prep step.)
# Paths use forward slashes: they resolve on Windows as well as POSIX, and avoid the
# invalid escape sequences that backslash-separated literals contain.
datasets = {
    name: pd.read_csv(f"datasets/model_training/{name}_p.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian")
}

datasets_r = {
    name: pd.read_csv(f"datasets/model_training/{name}_r.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian")
}


In [66]:
# Train/test splits used for the ensemble experiments, keyed by corpus name
# ("combined" has its own pair of CSV files).
# Paths use forward slashes so they are portable and free of invalid escape sequences.
datasets_train = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_train.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian", "combined")
}
datasets_test = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_test.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian", "combined")
}

In [2]:
def extractXy(df):
    """Split a labelled frame into model inputs and one-hot targets.

    Args:
        df: DataFrame with a 'text' column and a 'class' column.

    Returns:
        (X, y): X is an (n_rows, 1) NumPy array holding the texts cast to str;
        y is an integer one-hot matrix with one column per distinct 'class' value.
    """
    texts = df['text'].astype("str").tolist()
    features = np.asarray(texts).reshape(-1, 1)
    one_hot = pd.get_dummies(df['class'])
    targets = one_hot.values.astype(int)
    return features, targets

def feature_rep(df):
    """Build the frozen BERT front end and extract the training arrays from `df`.

    Args:
        df: DataFrame with a 'text' column and a 'class' column.

    Returns:
        (input_layer, feature_rep_end, X, y):
            input_layer      -- Keras string Input named 'text'.
            feature_rep_end  -- the encoder's 'sequence_output' tensor for that input.
            X, y             -- texts and one-hot labels as produced by extractXy(df).
    """
    tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
    tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3"

    # Reuse the shared helper instead of duplicating the text/label extraction.
    X, y = extractXy(df)

    input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Both hub layers are frozen (trainable=False): only the head stacked on top is trained.
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing', trainable=False)
    encoder_inputs = preprocessing_layer(input_layer)

    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False, name='BERT_encoder')
    # Take the per-token output so the convolutional head can slide over the sequence.
    feature_rep_end = encoder(encoder_inputs)['sequence_output']

    return input_layer, feature_rep_end, X, y

In [3]:
class CNN_GRU_Model:  # Model from Zhang et al.
    """Conv1D + GRU classification head stacked on a precomputed feature tensor."""

    def __init__(self, input_layer, feature_rep_end):
        """Remember the model input and the feature tensor the head is attached to."""
        self.input_layer, self.feature_rep_end = input_layer, feature_rep_end

    def build_model(self):
        """Assemble the head and return an (uncompiled) Keras Model.

        Layers, in order: Conv1D -> MaxPooling1D -> GRU (full sequence) ->
        GlobalMaxPooling1D -> 2-unit softmax Dense with L1/L2 kernel regularisation.
        """
        head = [
            Conv1D(filters=100, kernel_size=4, activation='relu'),
            MaxPooling1D(pool_size=4),
            GRU(100, return_sequences=True),
            GlobalMaxPooling1D(),
            Dense(2, activation='softmax', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01)),
        ]

        # Thread the feature tensor through each layer in turn.
        tensor = self.feature_rep_end
        for layer in head:
            tensor = layer(tensor)

        return Model(inputs=self.input_layer, outputs=tensor)

In [4]:
def train_eval_model(input_layer, feature_rep_end, X_train, y_train, X_val, y_val, batch_size=128, epochs=30, patience=3):
    """Build, compile and fit the CNN-GRU model, then print a report on the training data.

    Args:
        input_layer, feature_rep_end: tensors passed to CNN_GRU_Model.
        X_train, y_train: training inputs and one-hot labels.
        X_val, y_val: data passed to fit() as validation_data (drives early stopping).
        batch_size, epochs: forwarded to fit().
        patience: epochs without val_loss improvement before training stops.

    Returns:
        The fitted Keras model (best weights restored by the early-stopping callback).
    """
    model = CNN_GRU_Model(input_layer, feature_rep_end).build_model()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Stop once val_loss stalls and roll back to the best epoch.
    stopper = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True, verbose=1)

    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
        verbose=1,
        callbacks=[stopper],
    )

    # Report on the *training* data: collapse probabilities and one-hot labels to class ids.
    train_probs = model.predict(X_train)
    predicted_ids = np.argmax(train_probs, axis=1)
    true_ids = np.argmax(y_train, axis=1)
    print(classification_report(true_ids, predicted_ids))

    return model

def eval(model, X_val, y_val):
    """Score `model` on (X_val, y_val), print the metrics and return them rounded.

    Args:
        model: object exposing predict(X) that returns per-class scores.
        X_val: inputs to predict on.
        y_val: one-hot labels matching X_val.

    Returns:
        (precision, recall, macro-F1, weighted-F1, report): the four scores rounded to
        two decimals, followed by the classification_report text. Precision and recall
        use average='binary'.
    """
    # Collapse scores and one-hot labels to class ids.
    predicted = np.argmax(model.predict(X_val), axis=1)
    actual = np.argmax(y_val, axis=1)

    precision = precision_score(actual, predicted, average='binary')
    recall = recall_score(actual, predicted, average='binary')
    f1_macro_score = f1_score(actual, predicted, average='macro')
    f1_weighted_score = f1_score(actual, predicted, average='weighted')
    report = classification_report(actual, predicted)

    precision_2dp = round(precision, 2)
    recall_2dp = round(recall, 2)
    f1_macro_2dp = round(f1_macro_score, 2)
    f1_weighted_2dp = round(f1_weighted_score, 2)

    print("Precision for Hate Class:", precision)
    print("Recall for Hate Class:", recall)
    print("F1 Macro", f1_macro_score)
    print("F1 Weighted", f1_weighted_score)
    print(precision_2dp, "/", recall_2dp, "/", f1_macro_2dp, "/", f1_weighted_2dp)
    print(report)

    return precision_2dp, recall_2dp, f1_macro_2dp, f1_weighted_2dp, report

In [7]:
def resamp(df):
    """Return a copy of `df` rebalanced by random oversampling on the 'class' column.

    Args:
        df: DataFrame containing a 'class' column; all other columns are treated as features.

    Returns:
        DataFrame of the resampled feature columns with 'class' appended as the last column.
    """
    features = df.drop(columns='class')
    labels = df['class']

    # Fixed seed so the duplicated rows are the same on every run.
    features_bal, labels_bal = RandomOverSampler(random_state=42).fit_resample(features, labels)

    features_bal["class"] = labels_bal
    return features_bal

## Resample

In [11]:
# Build a smaller jigsaw corpus: keep every class-1 (hate) row and down-sample the
# class-0 rows to 10,000, then oversample the result with resamp().
# random_state is fixed so the subset is identical after a kernel restart; the unseeded
# sample() drew a different subset on every run.
jigsaw_hate = datasets["jigsaw"][datasets["jigsaw"]["class"] == 1]
jigsaw_normal = datasets["jigsaw"][datasets["jigsaw"]["class"] == 0]
jigsaw_small = pd.concat([jigsaw_normal.sample(n=10000, random_state=42), jigsaw_hate], ignore_index=True)
jigsaw_small_r = resamp(jigsaw_small)
datasets["jigsaw_small"] = jigsaw_small
datasets_r["jigsaw_small"] = jigsaw_small_r

In [12]:
# Build a smaller qian corpus with 5,000 rows drawn from each class.
# random_state is fixed so the subset is identical after a kernel restart; the unseeded
# sample() drew a different subset on every run.
qian_hate = datasets["qian"][datasets["qian"]["class"] == 1]
qian_normal = datasets["qian"][datasets["qian"]["class"] == 0]
qian_small = pd.concat(
    [qian_normal.sample(n=5000, random_state=42), qian_hate.sample(n=5000, random_state=42)],
    ignore_index=True,
)
# The subset is already balanced (5,000 / 5,000), so the oversampling step stays disabled:
# qian_small_r = resamp(qian_small)
datasets["qian_small"] = qian_small
# datasets_r["qian_small"] = qian_small_r

## Train and Eval

In [5]:
# Corpus to train on. It selects the entries of datasets_train / datasets_test, names the
# saved weights file, and names the prediction column added to the test frames.
dataset_name = "combined"

In [9]:
# Build the BERT front end and the training arrays for the selected corpus.
input_layer, feature_rep_end, X_train, y_train = feature_rep(datasets_train[dataset_name])

# NOTE(review): the split loaded from datasets_test is used as the early-stopping
# validation data here and is scored again by eval() later -- confirm this is intended.
X_val, y_val = extractXy(datasets_test[dataset_name])

model = train_eval_model(
    input_layer,
    feature_rep_end,
    X_train,
    y_train,
    X_val,
    y_val,
    batch_size=128,
    patience=3,
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: early stopping
              precision    recall  f1-score   support

           0       0.88      0.97      0.92     23253
           1       0.83      0.53      0.65      6846

    accuracy                           0.87     30099
   macro avg       0.85      0.75      0.78     30099
weighted avg       0.87      0.87      0.86     30099



In [98]:

# Score the trained model on (X_val, y_val). `results` holds precision, recall, macro-F1
# and weighted-F1 rounded to two decimals, followed by the classification report text.
results = eval(model, X_val, y_val)

Precision for Hate Class: 0.8189749182115594
Recall for Hate Class: 0.8344444444444444
F1 Macro 0.8869940707992034
F1 Weighted 0.9194938627849395
0.82 / 0.83 / 0.89 / 0.92
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      3000
           1       0.82      0.83      0.83       900

    accuracy                           0.92      3900
   macro avg       0.88      0.89      0.89      3900
weighted avg       0.92      0.92      0.92      3900



In [10]:
# Persist the trained weights under the corpus name. Forward slashes keep the path portable
# and avoid the invalid escape sequences of a backslash-separated f-string.
model.save_weights(f"weights/ensemble/{dataset_name}.h5")

## Ensemble Prep

In [11]:
# Add this model's class-1 score (second softmax column) to every test frame, in a column
# named after the corpus the model was trained on (dataset_name).
# The same step for the train frames is currently disabled:
#     X_train, y_train = extractXy(datasets_train[key])
#     train_pred = model.predict(X_train)
#     datasets_train[key][dataset_name] = train_pred[:,1]
for key in datasets_train.keys():
    test_frame = datasets_test[key]
    X_test, y_test = extractXy(test_frame)
    test_pred = model.predict(X_test)
    test_frame[dataset_name] = test_pred[:, 1]




In [12]:
# Write every train/test frame back to disk under the "*_ensemble_{train,test}2.csv" names.
for key, train_frame in datasets_train.items():
    train_frame.to_csv(f"datasets/model_training/ensemble/{key}_ensemble_train2.csv", header=True, index=False)
    datasets_test[key].to_csv(f"datasets/model_training/ensemble/{key}_ensemble_test2.csv", header=True, index=False)

## Extra

In [7]:
# Reload the frames from the "*_ensemble_{train,test}.csv" files.
# Paths use forward slashes so they are portable and free of invalid escape sequences.
datasets_train = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_ensemble_train.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian", "combined")
}
datasets_test = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_ensemble_test.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian", "combined")
}

In [56]:
# Fresh copies of the plain train/test splits, loaded under separate "_d" names.
# Paths use forward slashes so they are portable and free of invalid escape sequences.
datasets_train_d = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_train.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian", "combined")
}
datasets_test_d = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_test.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian", "combined")
}

In [57]:
# For each corpus, copy the column named after that corpus from the loaded ensemble frame
# into the matching "_d" frame, storing it under the dataset_name column.
# NOTE(review): every corpus writes to the same target column (dataset_name) but reads a
# different source column (its own name) -- confirm this is intended.
for key, scored_train in datasets_train.items():
    datasets_train_d[key][dataset_name] = scored_train[key]
    datasets_test_d[key][dataset_name] = datasets_test[key][key]

In [46]:
# Make the "_d" dictionaries the working train/test sets from here on.
datasets_train, datasets_test = datasets_train_d, datasets_test_d

In [95]:
# Display the ethos test frame to inspect its columns.
datasets_test["ethos"]

Unnamed: 0,class,text,davidson,hateval,ethos,jigsaw
0,0,When are people going to stop asking Why are t...,0.022557,0.462743,0.024089,0.001241
1,0,What happened to rap This right here This is rap,0.003662,0.240539,0.001801,0.003826
2,0,They are forgetting Russell Crowe s performanc...,0.145440,0.035060,0.005101,0.002382
3,0,It s killing me seeing gays treated like that,0.961536,0.253825,0.269297,0.996753
4,0,Tell me where to ship this comfy electric chai...,0.001930,0.024079,0.001312,0.082555
...,...,...,...,...,...,...
294,1,The world would be better without white people...,0.894410,0.928982,0.014217,0.990977
295,1,Teaching my kids transgenders are anything but...,0.033007,0.194156,0.112746,0.830763
296,1,destroy islam,0.006779,0.664770,0.015431,0.000672
297,1,Islam is pig shit,0.665261,0.262201,0.859333,0.996764


In [18]:
# Score the stored "combined" column of one test frame against its labels.
name = "qian"

# Round the stored scores (and the labels) to the nearest integer before comparing.
y_pred = datasets_test[name]["combined"].to_numpy().round()
y_val = datasets_test[name]["class"].to_numpy().round()

precision = precision_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
f1_macro_score = f1_score(y_val, y_pred, average='macro')
f1_weighted_score = f1_score(y_val, y_pred, average='weighted')
report = classification_report(y_val, y_pred)

print("Precision for Hate Class:", precision)
print("Recall for Hate Class:", recall)
print("F1 Macro", f1_macro_score)
print("F1 Weighted", f1_weighted_score)
# One-line summary: precision / recall / macro-F1 / weighted-F1, each to two decimals.
print(
    round(precision, 2), "/",
    round(recall, 2), "/",
    round(f1_macro_score, 2), "/",
    round(f1_weighted_score, 2),
)

Precision for Hate Class: 0.8494055482166446
Recall for Hate Class: 0.7144444444444444
F1 Macro 0.8578537218619089
F1 Weighted 0.901874209298717
0.85 / 0.71 / 0.86 / 0.9
