In [1]:
import pandas as pd
import numpy as np
import random
import re
import time
import sys

import nltk
from nltk.tokenize import word_tokenize

import gensim.downloader as api
import gensim
from gensim.models import Word2Vec, KeyedVectors

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
import tensorflow_text as text

from keras.models import Sequential, Model
from keras.layers import LSTM, Embedding, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten, InputLayer, Input, Dropout, Concatenate, GRU



import joblib

import os

from imblearn.over_sampling import RandomOverSampler

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Load the source corpora used in this notebook. Every dataset name maps to
# two CSV files under datasets/model_training/: "<name>_p.csv" (-> datasets)
# and "<name>_r.csv" (-> datasets_r).
#
# Forward slashes are used on purpose: the previous backslash-separated
# literals relied on invalid escape sequences such as "\m" and "\d" and only
# resolved on Windows. "/" works on every platform pandas supports.
_SOURCE_NAMES = ("davidson", "hateval", "ethos", "jigsaw", "qian")

datasets = {
    name: pd.read_csv(f"datasets/model_training/{name}_p.csv")
    for name in _SOURCE_NAMES
}

datasets_r = {
    name: pd.read_csv(f"datasets/model_training/{name}_r.csv")
    for name in _SOURCE_NAMES
}

In [38]:
# Build a capped, stratified train/test split for one source dataset.
#
# All sampling below is seeded so that re-running the cell yields the same
# split (and therefore the same CSVs written by the following cell).
SEED = 42
MAX_HATE = 3000    # upper bound on class-1 rows kept
MAX_NON = 10000    # upper bound on class-0 rows kept
TEST_FRAC = 0.3    # share of each class held out for testing

dataset_name = "qian"
selected_df = datasets[dataset_name]

# Filter each class once instead of re-filtering on every use.
hate_all = selected_df[selected_df["class"] == 1]
non_all = selected_df[selected_df["class"] == 0]

# Share of class-1 rows among the rows labelled 0 or 1, projected onto the
# full frame length to obtain the target counts per class.
ratio = len(hate_all) / (len(non_all) + len(hate_all))
hate_num = round(len(selected_df) * ratio)
non_num = len(selected_df) - hate_num

print(hate_num)
if len(hate_all) < MAX_HATE:
    # Too few class-1 rows to sample from: keep them all and let class 0
    # fill the remainder up to MAX_NON rows in total.
    hate_df = hate_all
    non_num = MAX_NON - len(hate_df)
else:
    hate_df = hate_all.sample(n=min(hate_num, MAX_HATE), random_state=SEED)

if len(non_all) < MAX_NON:
    non_df = non_all
else:
    non_df = non_all.sample(n=min(non_num, MAX_NON), random_state=SEED)

print("Total")
print(len(non_df))
print(len(hate_df))

# Hold out TEST_FRAC of each class separately so both splits keep the same
# class proportions.
hate_df_test = hate_df.sample(frac=TEST_FRAC, random_state=SEED)
non_df_test = non_df.sample(frac=TEST_FRAC, random_state=SEED)

print("Test")
print(len(non_df_test))
print(len(hate_df_test))

# The training part is whatever was not drawn into the test part.
hate_df_train = hate_df.loc[~hate_df.index.isin(hate_df_test.index)]
non_df_train = non_df.loc[~non_df.index.isin(non_df_test.index)]

print("Train")
print(len(non_df_train))
print(len(hate_df_train))

# Class-0 rows first, then class-1 rows; the index is rebuilt from 0.
train_df = pd.concat([non_df_train,hate_df_train], ignore_index=True)
test_df = pd.concat([non_df_test,hate_df_test], ignore_index=True)

print(train_df)
print(test_df)

19859
Total
10000
3000
Test
3000
900
Train
7000
2100
      class                                               text
0         0                           The correct term is Gunt
1         0  This sub evolved because people here connected...
2         0                              Good morning Brit Fam
3         0  I was gonna try to think of some witty and pit...
4         0  Can t believe I just did this All you stoners ...
...     ...                                                ...
9095      1  Fuck that nigger demon Yahweh Craw back up it ...
9096      1  brings up kike shill porn spaming throws kike ...
9097      1  Hillary Klintoon lost to a half nigger for the...
9098      1  why would any bitch accuse a 9yo boy of sexual...
9099      1      the left s new favorite N word is Nigger Nazi

[9100 rows x 2 columns]
      class                                               text
0         0  Clbuttic Mistake I am embarrbutted I had to se...
1         0  Yeah the GOP shills are goi

In [39]:
# Write the train/test split of the currently selected dataset to CSV
# (header row included, index column omitted).
train_path = f"datasets/model_training/ensemble/{dataset_name}_train.csv"
test_path = f"datasets/model_training/ensemble/{dataset_name}_test.csv"

train_df.to_csv(train_path, header=True, index=False)
test_df.to_csv(test_path, header=True, index=False)

In [44]:
# Reload the per-dataset train/test splits from
# datasets/model_training/ensemble/ ("<name>_train.csv" / "<name>_test.csv").
#
# Forward slashes replace the previous backslash-separated literals, which
# relied on invalid escape sequences ("\m", "\e", ...) and only resolved on
# Windows.
_SOURCE_NAMES = ("davidson", "hateval", "ethos", "jigsaw", "qian")

datasets_train = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_train.csv")
    for name in _SOURCE_NAMES
}
datasets_test = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_test.csv")
    for name in _SOURCE_NAMES
}

In [45]:
# Stack the per-dataset splits, always in the same dataset order, into one
# training frame and one test frame; the index is rebuilt from 0.
concat_order = ["davidson", "hateval", "ethos", "jigsaw", "qian"]

train_parts = [datasets_train[name] for name in concat_order]
test_parts = [datasets_test[name] for name in concat_order]

combined_df_train = pd.concat(train_parts, ignore_index=True)
combined_df_test = pd.concat(test_parts, ignore_index=True)

In [48]:
# Persist the combined splits (header row included, index column omitted).
# Plain string literals are used: the paths contain no placeholders, so the
# former f-prefix had no effect.
combined_df_train.to_csv("datasets/model_training/ensemble/combined_train.csv", header=True, index=False)
combined_df_test.to_csv("datasets/model_training/ensemble/combined_test.csv", header=True, index=False)

Unnamed: 0,class,text
0,0,It s a Bob Schieffer holiday now he s proud an...
1,0,I m so mad cause hoes think it s funny
2,0,Nee dat is na langdurig gepest worden Zoals ik...
3,0,I be thinkin Errbody wanna give me some pussy
4,0,Fucking pussy
...,...,...
12894,1,Pussies If you wanted to sell this fake ass ho...
12895,1,Liberals are NAZIs I don t remember National S...
12896,1,And leading the protests are niggers with unma...
12897,1,that s retarded


In [4]:
# Load the "<name>_ensemble_train.csv" / "<name>_ensemble_test.csv" files
# from datasets/model_training/ensemble/. This rebinds datasets_train and
# datasets_test.
# NOTE(review): judging by the frame displayed further down, these files
# appear to carry one numeric column per source dataset next to "class" and
# "text" -- confirm against the step that produces them.
#
# Forward slashes replace the previous backslash-separated literals, which
# relied on invalid escape sequences ("\m", "\e", ...) and only resolved on
# Windows.
_SOURCE_NAMES = ("davidson", "hateval", "ethos", "jigsaw", "qian")

datasets_train = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_ensemble_train.csv")
    for name in _SOURCE_NAMES
}
datasets_test = {
    name: pd.read_csv(f"datasets/model_training/ensemble/{name}_ensemble_test.csv")
    for name in _SOURCE_NAMES
}

In [7]:
# Tag every row of the hateval training frame with its dataset of origin.
# NOTE(review): the loop over datasets_train in the next cell assigns the same
# "source" column for every key, hateval included, so this line looks
# redundant -- confirm and consider removing it.
datasets_train["hateval"]["source"] = "hateval"

In [23]:
# Add a "source" column to every train and test frame holding the name of
# the dataset it belongs to (the frames are modified in place).
for split in (datasets_train, datasets_test):
    for name, frame in split.items():
        frame["source"] = name

In [25]:
# Display the hateval test frame, e.g. to check the newly added "source" column.
datasets_test["hateval"]

Unnamed: 0,class,text,davidson,hateval,ethos,jigsaw,qian,source
0,0,How many Somali refugees has the US admitted s...,0.670545,0.552829,0.203835,0.931424,0.001695,hateval
1,0,When your ex leaves her shoes at your place,0.001872,0.005214,0.001271,0.000638,0.001323,hateval
2,0,A Father s Belief A Sicilian immigrant who gre...,0.010373,0.011578,0.005818,0.107636,0.002418,hateval
3,0,I got offered once by a neighbor to go to a ch...,0.359146,0.046469,0.017413,0.008567,0.002281,hateval
4,0,Pregnancy Periods Breast Cancer Being Walked O...,0.441231,0.440847,0.049182,0.654126,0.002361,hateval
...,...,...,...,...,...,...,...,...
2695,1,Everyone knows women are too dramatic and hyst...,0.002097,0.286621,0.005693,0.015961,0.001486,hateval
2696,1,Deport them all Massive Anti Islamic Immigrati...,0.136424,0.982873,0.317929,0.042358,0.001493,hateval
2697,1,She s an uppity racist bitch and those women a...,0.214864,0.996455,0.166578,0.977386,0.177586,hateval
2698,1,what did we learn by Greenland leaving the EU ...,0.900417,0.946434,0.082905,0.005601,0.002899,hateval


In [26]:
def clean_df(df):
    """Return the rows of ``df`` whose ``text`` value is usable.

    A row is dropped when its ``text`` is missing (NaN/None), is the empty
    string, or consists only of whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the offending rows removed. ``df`` itself is left
        untouched and the surviving rows keep their original index labels.

    Raises
    ------
    KeyError
        If ``df`` has no ``text`` column.
    """
    new_df = df.dropna(subset=['text'])

    # Strip before comparing so that whitespace-only texts are treated as
    # empty as well; astype(str) keeps the check safe for non-string cells.
    has_content = new_df['text'].astype(str).str.strip() != ''
    return new_df[has_content]

In [27]:
# Containers for the cleaned frames, keyed by dataset name.
new_train_dict, new_test_dict = dict(), dict()

In [28]:
# Clean every dataset's train and test frame; the training dict drives the
# iteration and the matching test frame is looked up under the same name.
for name, train_frame in datasets_train.items():
    new_train_dict[name] = clean_df(train_frame)
    new_test_dict[name] = clean_df(datasets_test[name])

In [29]:
# Display the cleaned training frames (a dict of DataFrames keyed by dataset name).
new_train_dict

{'davidson':       class                                               text  davidson  \
 0         0  Lmao chill spook You think you Better than nig...  0.808160   
 1         0                               chris brown is trash  0.003686   
 2         0  Having sex with a guy wont keep him around for...  0.002583   
 3         0                 You see me shining on yo bitch ass  0.002031   
 4         0                       Bum bitch won t even my type  0.001948   
 ...     ...                                                ...       ...   
 6995      1  they re working on a bill to prevent retards f...  0.776276   
 6996      1                this guy is the biggest faggot omfg  0.956059   
 6997      1  which one of these names is more offensive kik...  0.846653   
 6998      1                                 you re all niggers  0.974532   
 6999      1  you re such a retard i hope you get type 2 dia...  0.969451   
 
        hateval     ethos    jigsaw      qian    source  
 0  

In [30]:
# Concatenate the cleaned frames, always in the same dataset order, into one
# training frame and one test frame; the index is rebuilt from 0.
dataset_order = ["davidson", "hateval", "ethos", "jigsaw", "qian"]

cleaned_train_parts = [new_train_dict[name] for name in dataset_order]
cleaned_test_parts = [new_test_dict[name] for name in dataset_order]

df_combined_train = pd.concat(cleaned_train_parts, ignore_index=True)
df_combined_test = pd.concat(cleaned_test_parts, ignore_index=True)

In [31]:
# Display the combined, cleaned training frame.
df_combined_train

Unnamed: 0,class,text,davidson,hateval,ethos,jigsaw,qian,source
0,0,Lmao chill spook You think you Better than nig...,0.808160,0.360156,0.077775,0.997260,0.994904,davidson
1,0,chris brown is trash,0.003686,0.040048,0.017826,0.910892,0.003160,davidson
2,0,Having sex with a guy wont keep him around for...,0.002583,0.927659,0.012692,0.990970,0.941269,davidson
3,0,You see me shining on yo bitch ass,0.002031,0.842289,0.009289,0.089469,0.664199,davidson
4,0,Bum bitch won t even my type,0.001948,0.980811,0.064114,0.169962,0.240064,davidson
...,...,...,...,...,...,...,...,...
29985,1,Fuck that nigger demon Yahweh Craw back up it ...,0.963582,0.045365,0.161894,0.996736,0.997359,qian
29986,1,brings up kike shill porn spaming throws kike ...,0.804231,0.070465,0.260310,0.786531,0.995143,qian
29987,1,Hillary Klintoon lost to a half nigger for the...,0.751408,0.032322,0.113964,0.964383,0.986766,qian
29988,1,why would any bitch accuse a 9yo boy of sexual...,0.857041,0.853251,0.861584,0.990994,0.128043,qian


In [32]:
# Display the combined, cleaned test frame.
df_combined_test

Unnamed: 0,class,text,davidson,hateval,ethos,jigsaw,qian,source
0,0,It s a Bob Schieffer holiday now he s proud an...,0.304560,0.022032,0.003484,0.000951,0.002650,davidson
1,0,I m so mad cause hoes think it s funny,0.020686,0.943099,0.010962,0.818127,0.010625,davidson
2,0,Nee dat is na langdurig gepest worden Zoals ik...,0.005698,0.790985,0.005720,0.184293,0.001270,davidson
3,0,I be thinkin Errbody wanna give me some pussy,0.004477,0.916321,0.001047,0.898266,0.052212,davidson
4,0,Fucking pussy,0.002247,0.882538,0.068750,0.215724,0.987768,davidson
...,...,...,...,...,...,...,...,...
12837,1,Pussies If you wanted to sell this fake ass ho...,0.215794,0.361784,0.005726,0.874478,0.994767,qian
12838,1,Liberals are NAZIs I don t remember National S...,0.959181,0.948489,0.725115,0.997236,0.996324,qian
12839,1,And leading the protests are niggers with unma...,0.970064,0.964449,0.768071,0.995252,0.997240,qian
12840,1,that s retarded,0.489199,0.670454,0.083748,0.008944,0.997286,qian


In [34]:
# Persist the combined, cleaned and source-tagged splits (header row
# included, index column omitted).
#
# Forward slashes replace the previous backslash-separated literals, which
# relied on invalid escape sequences ("\m", "\e", "\c") and only resolved on
# Windows; index=False is the documented boolean form of the former
# index=None.
df_combined_train.to_csv("datasets/model_training/ensemble/combined_clean_labeled_train.csv", index=False, header=True)
df_combined_test.to_csv("datasets/model_training/ensemble/combined_clean_labeled_test.csv", index=False, header=True)