In [1]:
import pandas as pd
import numpy as np
import random
import re
import time
import sys

import nltk
from nltk.tokenize import word_tokenize

import gensim.downloader as api
import gensim
from gensim.models import Word2Vec, KeyedVectors

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
import tensorflow_text as text

from keras.models import Sequential, Model
from keras.layers import LSTM, Embedding, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten, InputLayer, Input, Dropout, Concatenate, GRU
from keras.callbacks import EarlyStopping


import joblib

import os

from imblearn.over_sampling import RandomOverSampler

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
datasets_train = {
    "davidson" : pd.read_csv("datasets\model_training\ensemble\davidson_ensemble_train.csv"),
    "hateval" : pd.read_csv("datasets\model_training\ensemble\hateval_ensemble_train.csv"),
    "ethos" : pd.read_csv("datasets\model_training\ensemble\ethos_ensemble_train.csv"),
    "jigsaw": pd.read_csv("datasets\model_training\ensemble\jigsaw_ensemble_train.csv"),
    "qian": pd.read_csv("datasets\model_training\ensemble\qian_ensemble_train.csv"),
    "combined": pd.read_csv("datasets\model_training\ensemble\combined_ensemble_train.csv")
}
datasets_test = {
    "davidson" : pd.read_csv("datasets\model_training\ensemble\davidson_ensemble_test.csv"),
    "hateval" : pd.read_csv("datasets\model_training\ensemble\hateval_ensemble_test.csv"),
    "ethos" : pd.read_csv("datasets\model_training\ensemble\ethos_ensemble_test.csv"),
    "jigsaw": pd.read_csv("datasets\model_training\ensemble\jigsaw_ensemble_test.csv"),
    "qian": pd.read_csv("datasets\model_training\ensemble\qian_ensemble_test.csv"),
    "combined": pd.read_csv("datasets\model_training\ensemble\combined_ensemble_test.csv")
}

In [3]:
new_df = datasets_train["combined"].copy()
source = [None] * len(new_df)
for key in datasets_train:
    if key != "combined":
        merged_df = pd.merge(new_df, datasets_train[key], on='text', how='left', indicator=True)
        source = [
            key if _merge == 'both' else src
            for _merge, src in zip(merged_df['_merge'], source)
        ]

new_df["source"] = source

In [4]:
print(len(new_df[new_df["source"] == "qian"]) + len(new_df[new_df["source"] == "hateval"])  + len(new_df[new_df["source"] == "jigsaw"])  + len(new_df[new_df["source"] == "davidson"])  + len(new_df[new_df["source"] == "ethos"]) )

28267


In [5]:
new_df[new_df["source"].isna()]

Unnamed: 0,class,text,davidson,hateval,ethos,jigsaw,qian,source
21015,0,Been seeing the video a lot and it s just exce...,0.003238,0.032303,0.001687,0.000595,0.001298,
21016,0,If they can t win to control it they want it d...,0.010647,0.949048,0.066492,0.231610,0.001391,
21017,0,Why do companies hate people who want to give ...,0.052604,0.575365,0.021553,0.001060,0.001223,
21018,0,Even psychologists are like fuck that,0.013859,0.161847,0.062706,0.065887,0.001808,
21019,0,you do realize bullets have to land somewhere ...,0.003990,0.007488,0.000253,0.000681,0.001224,
...,...,...,...,...,...,...,...,...
23467,0,Treasury Dept Official Leaked Trump Associates...,0.001599,0.005591,0.001130,0.000624,0.001304,
23468,0,Shep Smith Fanning the Flames 2423 via,0.001721,0.017017,0.006173,0.000609,0.001272,
23469,0,I think they should be going after like you sa...,0.326074,0.446462,0.023663,0.007875,0.001610,
23470,0,These are the people who think they re going t...,0.692558,0.986265,0.015629,0.694234,0.001611,


In [6]:
def transform(value):
    return np.array([1, 0]) if value == 0 else np.array([0, 1])

In [7]:
def get_all():
    X_train = datasets_train["combined"][["davidson","hateval","ethos","jigsaw","qian"]].to_numpy()
    y = datasets_train["combined"]["class"].to_numpy()
    
    y_train = np.zeros((y.size, 2))
    y_train[np.arange(y.size), y] = 1
    
    X_val= datasets_test["combined"][["davidson","hateval","ethos","jigsaw","qian"]].to_numpy()
    y = datasets_test["combined"]["class"].to_numpy()
    
    y_val = np.zeros((y.size, 2))
    y_val[np.arange(y.size), y] = 1
    return X_train, y_train, X_val, y_val

# Get all data except for the supplied dataset name as well as all ensemble features except for the supplied dataset name
def get_some(dataset_name):
    datasets_name = ["davidson","hateval","ethos","jigsaw","qian"]
    selected = []
    for name in datasets_name:
        if name != dataset_name:
            selected.append(name)
            
    selected_dataset = new_df[new_df["source"] != dataset_name]

    X_train = selected_dataset[selected].to_numpy()
    y = selected_dataset["class"].to_numpy()
    
    y_train = np.zeros((y.size, 2))
    y_train[np.arange(y.size), y] = 1
    
    X_val= selected_dataset[selected].to_numpy()
    y = selected_dataset["class"].to_numpy()
    
    y_val = np.zeros((y.size, 2))
    y_val[np.arange(y.size), y] = 1
    return X_train, y_train, X_val, y_val

In [8]:
def get_features(df):

            
    X_train = df[["davidson","hateval","ethos","jigsaw","qian"]].to_numpy()
    y = df["class"].to_numpy()
    return X_train, y

# Gets all ensemble features from the selected dataset except for the provided dataset
def get_some_features(df, dataset_name):
    datasets_name = ["davidson","hateval","ethos","jigsaw","qian"]
    selected = []
    for name in datasets_name:
        if name != dataset_name:
            selected.append(name)
            
    X_train = df[selected].to_numpy()
    y = df["class"].to_numpy()
    return X_train, y

In [9]:
def threshold(array, threshold):
    if array[1] > threshold:
        return 1
    else:
        return 0

In [10]:
dataset_name = "ethos"

In [12]:
# X_train, y_train, X_val, y_val = get_all()
X_train, y_train, X_val, y_val = get_some(dataset_name)

meta_model_nn = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(5,)),  # Change input shape based on the number of base models
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(2, activation='sigmoid')  # For binary classification
])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

meta_model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
meta_model_nn.fit(X_train, y_train, epochs=50, batch_size=128, validation_data=(X_val, y_val), callbacks=[early_stopping])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 7: early stopping


<keras.callbacks.History at 0x22d78b8b4f0>

In [14]:
meta_model_nn.save_weights(f"weights\ensemble\ensemble_meta.h5")

In [19]:
# X, y_true = get_features(datasets_test[dataset_name])
X, y_true = get_some_features(datasets_test[dataset_name], dataset_name)
y_pred_2d = meta_model_nn.predict(X)

y_pred = [threshold(array, 0.5) for array in y_pred_2d]
# y_val = np.argmax(y_true, axis=1)
y_val = y_true

precision = precision_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
f1_macro_score = f1_score(y_val, y_pred, average='macro')
f1_weighted_score = f1_score(y_val, y_pred, average='weighted')
report = classification_report(y_val, y_pred)

print("Precision for Hate Class:", precision)
print("Recall for Hate Class:", recall)
print("F1 Macro", f1_macro_score)
print("F1 Weighted", f1_weighted_score)
print(round(precision,2), "/",round(recall,2), "/", round(f1_macro_score,2), "/", round(f1_weighted_score,2))
print(report)

Precision for Hate Class: 0.37735849056603776
Recall for Hate Class: 0.40816326530612246
F1 Macro 0.633578431372549
F1 Weighted 0.7958718604498656
0.38 / 0.41 / 0.63 / 0.8
              precision    recall  f1-score   support

           0       0.88      0.87      0.88       250
           1       0.38      0.41      0.39        49

    accuracy                           0.79       299
   macro avg       0.63      0.64      0.63       299
weighted avg       0.80      0.79      0.80       299



In [10]:
def extractXy(df):
    X = df['text'].astype("str").tolist()
    X = np.array(X).reshape(len(X), 1)
    y = pd.get_dummies(df['class']).values.astype(int)
    return X, y

def feature_rep(df):
    tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
    tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3"
    
    X = df['text'].astype("str").tolist()
    X = np.array(X).reshape(len(X), 1)
    y = pd.get_dummies(df['class']).values.astype(int)
    input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing', trainable=False)
    encoder_inputs = preprocessing_layer(input_layer)
    
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False, name='BERT_encoder')
    feature_rep_end = encoder(encoder_inputs)['sequence_output']
    
    return input_layer,feature_rep_end, X, y

In [11]:
class CNN_GRU_Model: # Model from Zhang et al.
    def __init__(self, input_layer, feature_rep_end):
        self.input_layer = input_layer
        self.feature_rep_end = feature_rep_end

    def build_model(self):

        conv_layer = Conv1D(filters=100, kernel_size=4, activation='relu')(self.feature_rep_end)
        max_pool = MaxPooling1D(pool_size=4)(conv_layer)
        gru = GRU(100, return_sequences=True)(max_pool)
        global_pool = GlobalMaxPooling1D()(gru)
        dense1 = Dense(2, activation='softmax',kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01))(global_pool)
        model = Model(inputs=self.input_layer, outputs=dense1)
        return model

In [12]:
def train_eval_model(input_layer, feature_rep_end, X_train, y_train, X_val, y_val, batch_size=128, epochs=30, patience=3):
    model_class = CNN_GRU_Model(input_layer, feature_rep_end)
    model = model_class.build_model()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True, verbose=1)
    
    # Train the model
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), verbose=1, callbacks=[early_stopping])
    
    # Evaluate the model on the training data
    y_pred = model.predict(X_train)
    y_pred = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_train, axis=1)
    report = classification_report(y_true, y_pred)
    print(report)

    return model

def eval(model, X_val, y_val):
    
    y_pred = model.predict(X_val)
    y_pred = np.argmax(y_pred, axis=1)
    y_val = np.argmax(y_val, axis=1)

    precision = precision_score(y_val, y_pred, average='binary')
    recall = recall_score(y_val, y_pred, average='binary')
    f1_macro_score = f1_score(y_val, y_pred, average='macro')
    f1_weighted_score = f1_score(y_val, y_pred, average='weighted')
    report = classification_report(y_val, y_pred)

    print("Precision for Hate Class:", precision)
    print("Recall for Hate Class:", recall)
    print("F1 Macro", f1_macro_score)
    print("F1 Weighted", f1_weighted_score)
    print(round(precision,2), "/",round(recall,2), "/", round(f1_macro_score,2), "/", round(f1_weighted_score,2))
    print(report)
    return round(precision,2), round(recall,2), round(f1_macro_score,2), round(f1_weighted_score,2), report

In [13]:
datasets_train = {
    "filtered": pd.read_csv("datasets\model_training\ensemble\combined_clean_labeled_train.csv")
    
}
datasets_test = {
    "filtered": pd.read_csv("datasets\model_training\ensemble\combined_clean_labeled_test.csv")
}

In [14]:
dataset_name = "qian"
selected_dataset = datasets_train["filtered"][datasets_train["filtered"]["source"] != dataset_name]
# selected_dataset = datasets_train["filtered"]

In [16]:
input_layer,feature_rep_end, X_train, y_train = feature_rep(selected_dataset)
# X_val, y_val = extractXy(datasets_test["filtered"])
X_val, y_val = extractXy(datasets_test["filtered"][datasets_test["filtered"]["source"] != dataset_name])
model = train_eval_model(input_layer,feature_rep_end, X_train, y_train,X_val, y_val, batch_size=128, epochs=10, patience=3)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     16229
           1       0.79      0.71      0.75      4739

    accuracy                           0.89     20968
   macro avg       0.85      0.83      0.84     20968
weighted avg       0.89      0.89      0.89     20968



In [17]:
X_val, y_val = extractXy(datasets_test["filtered"][datasets_test["filtered"]["source"] == dataset_name])
a,b, c, d, e = eval(model, X_val, y_val)

Precision for Hate Class: 0.6161290322580645
Recall for Hate Class: 0.6366666666666667
F1 Macro 0.7550699475810491
F1 Weighted 0.8238449140589303
0.62 / 0.64 / 0.76 / 0.82
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      2961
           1       0.62      0.64      0.63       900

    accuracy                           0.82      3861
   macro avg       0.75      0.76      0.76      3861
weighted avg       0.82      0.82      0.82      3861

