In [1]:
import pandas as pd
import numpy as np
import random
import re
import time
import sys

import nltk
from nltk.tokenize import word_tokenize

import gensim.downloader as api
import gensim
from gensim.models import Word2Vec, KeyedVectors

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
import tensorflow_text as text

from keras.models import Sequential, Model
from keras.layers import LSTM, Embedding, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten, InputLayer, Input, Dropout, Concatenate, GRU



import joblib

import os

from imblearn.over_sampling import RandomOverSampler

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
df = pd.read_csv("datasets/model_training/combined.csv")

In [3]:
df

Unnamed: 0,text,class
0,Hurray saving us in so many ways,1
1,Why would young fighting age men be the vast m...,1
2,Illegals Dump their Kids at the border like Ro...,1
3,NY Times Nearly All White States Pose an Array...,0
4,Orban in Brussels European leaders are ignorin...,0
...,...,...
1503141,Antifa The Far Left Black Bloc Organization Gr...,1
1503142,It was done to a white kid Trust me they won t...,1
1503143,This is the List of Moral Alignment Chaotic Go...,1
1503144,Universal truth jew speak has to be the most r...,1


In [5]:
def feature_rep():
    tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
    tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3"

    input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing', trainable=False)
    encoder_inputs = preprocessing_layer(input_layer)
    
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False, name='BERT_encoder')
    feature_rep_end = encoder(encoder_inputs)['sequence_output']
    
    return input_layer,feature_rep_end

class CNN_GRU_Model: # Model from Zhang et al.
    def __init__(self, input_layer, feature_rep_end):
        self.input_layer = input_layer
        self.feature_rep_end = feature_rep_end

    def build_model(self):

        conv_layer = Conv1D(filters=100, kernel_size=4, activation='relu')(self.feature_rep_end)
        max_pool = MaxPooling1D(pool_size=4)(conv_layer)
        gru = GRU(100, return_sequences=True)(max_pool)
        global_pool = GlobalMaxPooling1D()(gru)
        dense1 = Dense(2, activation='softmax',kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01))(global_pool)
        model = Model(inputs=self.input_layer, outputs=dense1)
        return model

In [7]:
def load_model(model_file):
    input_layer,feature_rep_end = feature_rep()
    model_class = CNN_GRU_Model(input_layer,feature_rep_end)
    model = model_class.build_model()
    model.load_weights(model_file)
    return model

In [8]:
model1 = load_model('weights/hateval2.h5')
model2 = load_model('weights/davidson.h5')
model3 = load_model('weights/qian.h5')
model4 = load_model('weights/jigsaw.h5')
model5 = load_model('weights/ethos.h5')

In [11]:
smaller_df = df.sample(n=10000)


In [40]:
print(smaller_df[smaller_df["class"] == 0])
print(smaller_df[smaller_df["class"] == 1])

                                                      text  class
1190973  It s like you can t do anything without someon...      0
931711   Go bleach yourself black pussy doll and then c...      0
630078   I d be the best prostitute and madam EVER Hey ...      0
392365   Removing User Contributions You DO NOT remove ...      0
650048                            Sgp still in bitches dms      0
...                                                    ...    ...
1172762  All I know is that my disorder ruined my fucki...      0
1042016  Fat bastard we won forget you Vets will kick y...      0
678011   not one single bone in my body gives a fuck ab...      0
1135556                 Wow does ur mom know how gay u are      0
445433   Hey I saw the note you left on Essence s Talk ...      0

[4897 rows x 2 columns]
                                                      text  class
178642   According to the supporters of immigration pol...      1
766348   oh shut up faggot get a disease overused o

In [12]:
model1_pred = model1.predict(smaller_df["text"].astype("str"))
model2_pred = model2.predict(smaller_df["text"].astype("str"))
model3_pred = model2.predict(smaller_df["text"].astype("str"))
model4_pred = model2.predict(smaller_df["text"].astype("str"))
model5_pred = model2.predict(smaller_df["text"].astype("str"))





In [17]:
model1_pred = model1_pred[:, 1].reshape(-1, 1)
model2_pred = model2_pred[:, 1].reshape(-1, 1)
model3_pred = model3_pred[:, 1].reshape(-1, 1)
model4_pred = model4_pred[:, 1].reshape(-1, 1)
model5_pred = model5_pred[:, 1].reshape(-1, 1)

# Stack predictions as new features
stacked_predictions = np.hstack((model1_pred, model2_pred, model3_pred, model4_pred, model5_pred))

# Example of the stacked predictions shape
print(stacked_predictions.shape)


(10000, 5)


In [33]:
meta_model_nn = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(5,)),  # Change input shape based on the number of base models
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # For binary classification
])
meta_model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
meta_model_nn.fit(stacked_predictions, smaller_df["class"], epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ca9fe4da20>

In [34]:
app_df = pd.read_csv("datasets/2021-2022/2021-2022_multi_dataset8.csv")

In [35]:
pred = meta_model_nn.predict(app_df[['hateval2', 'davidson','qian','jigsaw','ethos']])



In [37]:
app_df["ensemble"] = pred

In [39]:
app_df.to_csv("datasets/2021-2022/2021-2022_multi_dataset9.csv",header=True, index=False)

## Testing

In [18]:
X_train, X_test, y_train, y_test = train_test_split(stacked_predictions, smaller_df["class"], test_size=0.2)

In [19]:
meta_model = LogisticRegression()
meta_model.fit(X_train, y_train)

# Evaluate the meta-model
meta_preds = meta_model.predict(X_test)
accuracy = accuracy_score(y_test, meta_preds)

print(f'Meta-model accuracy: {accuracy:.4f}')

Meta-model accuracy: 0.7100


In [24]:
meta_model_nn = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(5,)),  # Change input shape based on the number of base models
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # For binary classification
])


In [25]:
meta_model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [26]:
meta_model_nn.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ca9e1a50c0>

In [30]:
meta_preds = meta_model_nn.predict(X_test)
meta_preds = (meta_preds > 0.5).astype(int).flatten()



In [32]:
print(classification_report(y_test, meta_preds))

              precision    recall  f1-score   support

           0       0.69      0.79      0.73      1008
           1       0.75      0.64      0.69       992

    accuracy                           0.71      2000
   macro avg       0.72      0.71      0.71      2000
weighted avg       0.72      0.71      0.71      2000

