In [1]:
import pandas as pd
import numpy as np
import random
import re
import time
import sys

import nltk
from nltk.tokenize import word_tokenize

import gensim.downloader as api
import gensim
from gensim.models import Word2Vec, KeyedVectors

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
import tensorflow_text as text

from keras.models import Sequential, Model
from keras.layers import LSTM, Embedding, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten, InputLayer, Input, Dropout, Concatenate, GRU
from keras.callbacks import EarlyStopping 


import joblib

import os

from imblearn.over_sampling import RandomOverSampler

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
datasets_train = {
    "filtered": pd.read_csv("datasets\model_training\ensemble\combined_clean_labeled_train.csv")
    
}
datasets_test = {
    "filtered": pd.read_csv("datasets\model_training\ensemble\combined_clean_labeled_test.csv")
}

In [3]:
def transform(value):
    return np.array([1, 0]) if value == 0 else np.array([0, 1])

In [4]:
def get_all(df):
    X_train = df[["davidson","hateval","ethos","jigsaw","qian"]].to_numpy()
    y = df["class"].to_numpy()
    y = y.reshape(-1, 1)
    
    return X_train, y
    
def get_some(df,dataset_name):
    datasets_name = ["davidson","hateval","ethos","jigsaw","qian"]
    selected = []
    for name in datasets_name:
        if name != dataset_name:
            selected.append(name)
            
    selected_dataset = df[(df["source"] != dataset_name) & (df["source"].notnull())]
    X_train = selected_dataset[selected].to_numpy()
    y = selected_dataset["class"].to_numpy()
    y = y.reshape(-1, 1)

    return X_train, y

In [5]:
def threshold(array, threshold):
    if array[1] > threshold:
        return 1
    else:
        return 0

In [6]:
def get_all_tfidf(df):
    
    # Initialize the TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    
    # Fit and transform the "text" column
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])
    
    # Convert the TF-IDF matrix to a DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)
    return tfidf_tensor, tfidf_vectorizer
    
def get_some_tfidf(df, dataset_name):
    datasets_name = ["davidson","hateval","ethos","jigsaw","qian"]
    selected = []
    for name in datasets_name:
        if name != dataset_name:
            selected.append(name)
            
    selected_dataset = df[(df["source"] != dataset_name) & (df["source"].notnull())]
    
    # Initialize the TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    
    # Fit and transform the "text" column
    tfidf_matrix = tfidf_vectorizer.fit_transform(selected_dataset['text'])
    
    # Convert the TF-IDF matrix to a DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)
    return tfidf_tensor, tfidf_vectorizer

In [7]:
def get_features(df):

            
    X_train = df[["davidson","hateval","ethos","jigsaw","qian"]].to_numpy()
    y = df["class"].to_numpy()
    y = y.reshape(-1, 1)
    return X_train, y

# Gets all ensemble features from the selected dataset except for the provided dataset
def get_some_features(df, dataset_name):
    datasets_name = ["davidson","hateval","ethos","jigsaw","qian"]
    selected = []
    for name in datasets_name:
        if name != dataset_name:
            selected.append(name)
    
    X_train = df[selected].to_numpy()
    y = df["class"].to_numpy()
    y = y.reshape(-1, 1)
    return X_train, y

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [9]:
class SimpleFeedForwardNN(nn.Module):
    def __init__(self, input_size, output_size):
        super(SimpleFeedForwardNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 5000)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(5000, 1000)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(1000, 500)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(500, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.relu3(out)
        out = self.fc4(out)
        out = self.softmax(out)
        return out


In [29]:
dataset_name = "davidson"
all_option = 1 # 1 = All datasets, all features, 2 = All datasets, missing 1 feature. 3 = all but 1 dataset, missing 1 feature
option = 0 # 0 = ALL, 1 = only 1, 4 feature, 2 = all features 1 dataset,3 = 4 feature all but 1 dataset

if all_option == 1:
    tfidf_tensor, tfidf_vectorizer = get_all_tfidf(datasets_train["filtered"])
    X_train, y_train = get_all(datasets_train["filtered"])
elif all_option == 2:
    tfidf_tensor, tfidf_vectorizer = get_some_tfidf(datasets_train["filtered"], dataset_name)
    X_train, y_train = get_some(datasets_train["filtered"], dataset_name)
else:
    tfidf_tensor, tfidf_vectorizer = get_some_tfidf(datasets_train["filtered"][datasets_train["filtered"]["source"] != dataset_name], dataset_name)
    X_train, y_train = get_some(datasets_train["filtered"], dataset_name)

if option == 0:
    val_tfidf_tensor = tfidf_vectorizer.transform(datasets_test["filtered"]["text"])
    val_tfidf_tensor = torch.tensor(val_tfidf_tensor.toarray(), dtype=torch.float32)
    X_val, y_val = get_all(datasets_test["filtered"])
elif option == 1:
    target_df = datasets_test["filtered"][datasets_test["filtered"]["source"] == dataset_name]
    # target_df = target_df.dropna(subset=['text'])
    # target_df = target_df[target_df['text'] != '']
    
    val_tfidf_tensor = tfidf_vectorizer.transform(target_df['text'])
    val_tfidf_tensor = torch.tensor(val_tfidf_tensor.toarray(), dtype=torch.float32)
    X_val, y_val = get_some_features(target_df,dataset_name)
elif option == 3:
    target_df = datasets_test["filtered"][datasets_test["filtered"]["source"] != dataset_name]
    # target_df = target_df.dropna(subset=['text'])
    # target_df = target_df[target_df['text'] != '']
    
    val_tfidf_tensor = tfidf_vectorizer.transform(target_df['text'])
    val_tfidf_tensor = torch.tensor(val_tfidf_tensor.toarray(), dtype=torch.float32)
    X_val, y_val = get_some_features(target_df,dataset_name)
else:
    target_df = datasets_test["filtered"][datasets_test["filtered"]["source"] == dataset_name]
    # target_df = target_df.dropna(subset=['text'])
    # target_df = target_df[target_df['text'] != '']
    
    val_tfidf_tensor = tfidf_vectorizer.transform(target_df['text'])
    val_tfidf_tensor = torch.tensor(val_tfidf_tensor.toarray(), dtype=torch.float32)
    X_val, y_val = get_features(target_df)


X_val, y_val = torch.from_numpy(X_val), torch.from_numpy(y_val)
X_val = X_val.type(torch.float)
y_val = y_val.type(torch.float)
output_size = X_val.shape[1]
print(X_val.shape)
print(val_tfidf_tensor.shape)
# Concat tfidf from text
X_val = torch.cat((val_tfidf_tensor, X_val), dim=1)

# Create a DataLoader
val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=True)


torch.Size([12842, 5])
torch.Size([12842, 43829])


In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

num_epochs = 100
learning_rate = 0.0001
n = tfidf_tensor.shape[1]
patience = 10


# Get ensemble data
X_train, y_train = torch.from_numpy(X_train), torch.from_numpy(y_train)
X_train = X_train.type(torch.float)
y_train = y_train.type(torch.float)
output_size = X_train.shape[1]

# Concat tfidf from text
X_train = torch.cat((tfidf_tensor, X_train), dim=1)

# Create a DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=2048, shuffle=True)
# val_loader = train_loader

# Instantiate the model, define the loss function and the optimizer
model = SimpleFeedForwardNN(n, output_size)
model.to(device)
criterion = nn.BCEWithLogitsLoss()  # For regression. Use nn.CrossEntropyLoss() for classification.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

best_val_loss = float('inf')
early_stop_counter = 0
best_model_state = None

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        tfidf_input = inputs[:, :n]
        extra_input = inputs[:, n:]
        
        outputs = model(tfidf_input)
        ensemble_output = torch.sum(outputs * extra_input, dim=1, keepdim=True)

        loss = criterion(ensemble_output, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            
            tfidf_input = inputs[:, :n]
            extra_input = inputs[:, n:]
            
            outputs = model(tfidf_input)
            ensemble_output = torch.sum(outputs * extra_input, dim=1, keepdim=True)
            
            loss = criterion(ensemble_output, targets)
            val_loss += loss.item()

    val_loss /= len(val_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}')

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        best_model_state = model.state_dict()  # Save the best model state
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

# Load the best model state (if early stopping was triggered)
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print('Loaded best model state from epoch with lowest validation loss.')

Using device: cuda
Epoch [1/100], Loss: 0.7468, Val Loss: 0.7165
Epoch [2/100], Loss: 0.6869, Val Loss: 0.7098
Epoch [3/100], Loss: 0.7088, Val Loss: 0.6956
Epoch [4/100], Loss: 0.7051, Val Loss: 0.6929
Epoch [5/100], Loss: 0.6602, Val Loss: 0.6919
Epoch [6/100], Loss: 0.7201, Val Loss: 0.6917
Epoch [7/100], Loss: 0.7057, Val Loss: 0.6900
Epoch [8/100], Loss: 0.6930, Val Loss: 0.6877
Epoch [9/100], Loss: 0.6632, Val Loss: 0.6837
Epoch [10/100], Loss: 0.6702, Val Loss: 0.6808
Epoch [11/100], Loss: 0.6762, Val Loss: 0.6802
Epoch [12/100], Loss: 0.6765, Val Loss: 0.6803
Epoch [13/100], Loss: 0.6981, Val Loss: 0.6806
Epoch [14/100], Loss: 0.7365, Val Loss: 0.6808
Epoch [15/100], Loss: 0.6664, Val Loss: 0.6805
Epoch [16/100], Loss: 0.6855, Val Loss: 0.6818
Epoch [17/100], Loss: 0.6358, Val Loss: 0.6804
Epoch [18/100], Loss: 0.6971, Val Loss: 0.6816
Epoch [19/100], Loss: 0.7216, Val Loss: 0.6824
Epoch [20/100], Loss: 0.6875, Val Loss: 0.6830
Epoch [21/100], Loss: 0.6833, Val Loss: 0.6825
Ear

In [36]:
dataset_name = "qian"

model.eval()

option = 2 # 0 = ALL, 1 = only 1, 4 feature, 2 = all features 1 dataset,3 = 4 feature all but 1 dataset

if option == 0:
    val_tfidf_tensor = tfidf_vectorizer.transform(datasets_test["filtered"])
    val_tfidf_tensor = torch.tensor(val_tfidf_tensor.toarray(), dtype=torch.float32)
    X_val, y_val = get_all(datasets_test["filtered"])
elif option == 1:
    target_df = datasets_test["filtered"][datasets_test["filtered"]["source"] == dataset_name]
    # target_df = target_df.dropna(subset=['text'])
    # target_df = target_df[target_df['text'] != '']
    
    val_tfidf_tensor = tfidf_vectorizer.transform(target_df['text'])
    val_tfidf_tensor = torch.tensor(val_tfidf_tensor.toarray(), dtype=torch.float32)
    X_val, y_val = get_some_features(target_df,dataset_name)
elif option == 3:
    target_df = datasets_test["filtered"][datasets_test["filtered"]["source"] != dataset_name]
    # target_df = target_df.dropna(subset=['text'])
    # target_df = target_df[target_df['text'] != '']
    
    val_tfidf_tensor = tfidf_vectorizer.transform(target_df['text'])
    val_tfidf_tensor = torch.tensor(val_tfidf_tensor.toarray(), dtype=torch.float32)
    X_val, y_val = get_some_features(target_df,dataset_name)
else:
    target_df = datasets_test["filtered"][datasets_test["filtered"]["source"] == dataset_name]
    # target_df = target_df.dropna(subset=['text'])
    # target_df = target_df[target_df['text'] != '']
    
    val_tfidf_tensor = tfidf_vectorizer.transform(target_df['text'])
    val_tfidf_tensor = torch.tensor(val_tfidf_tensor.toarray(), dtype=torch.float32)
    X_val, y_val = get_features(target_df)


X_val, y_val = torch.from_numpy(X_val), torch.from_numpy(y_val)
X_val = X_val.type(torch.float)
y_val = y_val.type(torch.float)
output_size = X_val.shape[1]

# Concat tfidf from text
X_val = torch.cat((val_tfidf_tensor, X_val), dim=1)

# Create a DataLoader
val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=True)


y_true = []
y_pred = []
with torch.no_grad():
    for inputs, targets in val_loader:  # Use your validation/test loader here
        inputs, targets = inputs.to(device), targets.to(device)
        tfidf_input = inputs[:, :n]
        extra_input = inputs[:, n:]

        # Forward pass
        outputs = model(tfidf_input)
        ensemble_output = torch.sum(outputs * extra_input, dim=1, keepdim=True)

        # Convert probabilities to predicted class (0 or 1)
        predicted = (ensemble_output > 0.5).float()
        y_true.extend(targets.tolist())
        y_pred.extend(predicted.tolist())

y_val = y_true

precision = precision_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
f1_macro_score = f1_score(y_val, y_pred, average='macro')
f1_weighted_score = f1_score(y_val, y_pred, average='weighted')
report = classification_report(y_val, y_pred)

print("Precision for Hate Class:", precision)
print("Recall for Hate Class:", recall)
print("F1 Macro", f1_macro_score)
print("F1 Weighted", f1_weighted_score)
print(round(precision,2), "/",round(recall,2), "/", round(f1_macro_score,2), "/", round(f1_weighted_score,2))
print(report)

Precision for Hate Class: 0.7954545454545454
Recall for Hate Class: 0.7388888888888889
F1 Macro 0.849152053716737
F1 Weighted 0.8934697038660049
0.8 / 0.74 / 0.85 / 0.89
              precision    recall  f1-score   support

         0.0       0.92      0.94      0.93      2961
         1.0       0.80      0.74      0.77       900

    accuracy                           0.89      3861
   macro avg       0.86      0.84      0.85      3861
weighted avg       0.89      0.89      0.89      3861



In [46]:
dataset_name

'jigsaw'

In [60]:
train_df.to_csv("datasets\model_training\ensemble\combined_filtered_train.csv")
test_df.to_csv("datasets\model_training\ensemble\combined_filtered_test.csv")

In [56]:
train_df[train_df["source"].isna()]

Unnamed: 0,class,text,davidson,hateval,ethos,jigsaw,qian,source
21015,0,Been seeing the video a lot and it s just exce...,0.003238,0.032303,0.001687,0.000595,0.001298,
21016,0,If they can t win to control it they want it d...,0.010647,0.949048,0.066492,0.231610,0.001391,
21017,0,Why do companies hate people who want to give ...,0.052604,0.575365,0.021553,0.001060,0.001223,
21018,0,Even psychologists are like fuck that,0.013859,0.161847,0.062706,0.065887,0.001808,
21019,0,you do realize bullets have to land somewhere ...,0.003990,0.007488,0.000253,0.000681,0.001224,
...,...,...,...,...,...,...,...,...
23467,0,Treasury Dept Official Leaked Trump Associates...,0.001599,0.005591,0.001130,0.000624,0.001304,
23468,0,Shep Smith Fanning the Flames 2423 via,0.001721,0.017017,0.006173,0.000609,0.001272,
23469,0,I think they should be going after like you sa...,0.326074,0.446462,0.023663,0.007875,0.001610,
23470,0,These are the people who think they re going t...,0.692558,0.986265,0.015629,0.694234,0.001611,


In [84]:
target_df

Unnamed: 0.1,Unnamed: 0,class,text,davidson,hateval,ethos,jigsaw,qian,source
115,612,0,I am the real vampire nigger,0.873000,0.018652,0.004240,0.993853,0.995353,hateval
116,613,0,Check your slutty fat girl friends instagram p...,0.033234,0.989482,0.223525,0.995148,0.995851,hateval
117,614,0,Instagram needs to stop being a bitch,0.001843,0.978749,0.005122,0.036563,0.003552,hateval
118,615,0,pussy whipped,0.003237,0.762364,0.098111,0.855310,0.936555,hateval
119,616,0,bitch don t do this RT,0.014555,0.965728,0.000753,0.001563,0.001636,hateval
...,...,...,...,...,...,...,...,...,...
3568,12518,1,On the flip a family member is getting into an...,0.007541,0.862036,0.000576,0.000611,0.844060,hateval
3569,12519,1,religion is gay loser,0.861051,0.074809,0.664744,0.994715,0.004697,hateval
3570,12520,1,If women want retarded kids let them The rest ...,0.885251,0.971841,0.090143,0.852065,0.997261,hateval
3652,12661,1,Yeah I m retarded,0.122651,0.076663,0.010756,0.001251,0.996766,hateval


In [83]:
test = datasets_train["filtered"][datasets_train["filtered"]["source"] != dataset_name]
test

Unnamed: 0.1,Unnamed: 0,class,text,davidson,hateval,ethos,jigsaw,qian,source
0,0,0,Lmao chill spook You think you Better than nig...,0.808160,0.360156,0.077775,0.997260,0.994904,davidson
1,1,0,chris brown is trash,0.003686,0.040048,0.017826,0.910892,0.003160,davidson
2,2,0,Having sex with a guy wont keep him around for...,0.002583,0.927659,0.012692,0.990970,0.941269,davidson
3,3,0,You see me shining on yo bitch ass,0.002031,0.842289,0.009289,0.089469,0.664199,davidson
4,4,0,Bum bitch won t even my type,0.001948,0.980811,0.064114,0.169962,0.240064,davidson
...,...,...,...,...,...,...,...,...,...
28176,30094,1,Fuck that nigger demon Yahweh Craw back up it ...,0.963582,0.045365,0.161895,0.996736,0.997359,qian
28177,30095,1,brings up kike shill porn spaming throws kike ...,0.804230,0.070466,0.260310,0.786530,0.995143,qian
28178,30096,1,Hillary Klintoon lost to a half nigger for the...,0.751409,0.032321,0.113964,0.964383,0.986766,qian
28179,30097,1,why would any bitch accuse a 9yo boy of sexual...,0.857041,0.853251,0.861583,0.990994,0.128043,qian


In [None]:
def process_df(df):
    new_df = df.copy()
    source = [None] * len(new_df)
    for key in datasets_train:
        if key != "combined":
            merged_df = pd.merge(new_df, datasets_train[key], on='text', how='left', indicator=True)
            source = [
                key if _merge == 'both' else src
                for _merge, src in zip(merged_df['_merge'], source)
            ]
    
    new_df["source"] = source
    
    new_df = new_df.dropna(subset=['text'])
    
    new_df = new_df[new_df['text'] != '']
    return new_df

train_df = process_df(datasets_train["combined"])
test_df = process_df(datasets_test["combined"])