In [53]:
import pandas as pd
import numpy as np
import random
import re
import time
import sys

import nltk
from nltk.tokenize import word_tokenize

import gensim.downloader as api
import gensim
from gensim.models import Word2Vec, KeyedVectors

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

import joblib

import os

from imblearn.over_sampling import RandomOverSampler

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [54]:
# Load every raw training dataset into a dict keyed by dataset name.
# Forward slashes are used so the paths work on every OS and contain no
# invalid backslash escape sequences (the original "\m", "\d", ... only
# resolved on Windows and trigger invalid-escape warnings in Python).
datasets = {
    name: pd.read_csv(f"datasets/model_training/{name}.csv")
    for name in ("davidson", "hateval", "ethos", "jigsaw", "qian")
}

In [55]:
def pre_process(data):
    """Return ``data`` converted to lower case."""
    return data.lower()


def sep_rem(data):
    """Replace separators and punctuation with spaces.

    Every character that is not an ASCII letter, a digit, ``#``, ``@`` or a
    space is turned into a single space; the result is stripped of leading
    and trailing whitespace.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9#@ ]")
    return disallowed.sub(" ", data).strip()


def remove_hashtag(data):
    """Replace each ``#`` followed by non-whitespace characters with a space.

    The returned string is stripped of leading and trailing whitespace.
    """
    without_tags = re.sub(r"#\S+", " ", data)
    stripped = without_tags.strip()
    return stripped


def remove_mentions(data):
    """Replace each ``@`` followed by non-whitespace characters with a space.

    The returned string is stripped of leading and trailing whitespace.
    """
    mention = re.compile(r"@\S+")
    return mention.sub(" ", data).strip()


def remove_NCR(data):
    """Replace character references with a space and strip the result.

    Three forms are matched: decimal references (``&#123;``), hexadecimal
    references (``&#x1F;``) and ``&`` followed by ASCII letters. The last form
    does not consume a trailing ``;``, so that character stays in the output.
    """
    reference = re.compile(r"&#[0-9]+;|&#x[0-9a-fA-F]+;|&[a-zA-Z]+")
    cleaned = reference.sub(" ", data)
    return cleaned.strip()


def remove_RT(data):
    """Remove standalone retweet markers (``RT`` / ``rt``) from ``data``.

    A marker is removed only when it sits at the start of the string or
    directly after whitespace, and is itself followed by whitespace. The
    whitespace *before* the marker is kept (it is checked with a lookbehind
    rather than consumed), so the neighbouring words are never glued
    together, and consecutive markers are all removed.

    Returns the string stripped of leading and trailing whitespace.
    """
    # The lookbehind leaves the preceding whitespace in place; only the marker
    # and the whitespace that follows it are dropped.
    data = re.sub(r"(?:^|(?<=\s))(?:RT|rt)\s+", "", data)
    return data.strip()


def remove_links(data):
    """Replace every ``http://`` or ``https://`` URL in ``data`` with a space.

    The whole URL up to the next whitespace is removed, including characters
    such as ``-``, ``?``, ``=``, ``%`` and ``#`` that the earlier character
    class stopped at (leaving URL fragments behind). The scheme is matched
    case-insensitively.

    Returns the string stripped of leading and trailing whitespace.
    """
    data = re.sub(r"https?://\S+", " ", data, flags=re.IGNORECASE)
    return data.strip()


def remove_spaces(data):
    """Collapse runs of space characters into one space and strip the ends."""
    collapsed = re.compile(r" +").sub(" ", data)
    return collapsed.strip()


def process_data(df):
    """Run the text-cleaning pipeline over ``df['text']``.

    Each cleaning step is applied element-wise, in order, to the ``text``
    column. The result is stored in a new ``clean`` column on ``df`` itself
    (the frame is modified in place) and the same frame is returned.

    Note: ``pre_process`` (lower-casing) is not part of the pipeline.
    """
    steps = (
        remove_links,
        remove_NCR,
        remove_hashtag,
        remove_mentions,
        sep_rem,
        remove_RT,
        remove_spaces,
    )
    cleaned = df['text']
    for step in steps:
        cleaned = cleaned.apply(step)
    df["clean"] = cleaned
    return df

In [56]:
# Add the "clean" column to every dataset (process_data mutates each frame).
for dataset_name in ("davidson", "hateval", "ethos", "jigsaw", "qian"):
    process_data(datasets[dataset_name])

# Show the last processed frame as the cell output.
datasets["qian"]


Unnamed: 0,text,class,clean
0,A subsection of retarded Hungarians? Ohh boy. ...,1,A subsection of retarded Hungarians Ohh boy br...
1,Hiii. Just got off work. 444 is mainly the typ...,0,Hiii Just got off work 444 is mainly the typa ...
2,wow i guess soyboys are the same in every country,0,wow i guess soyboys are the same in every country
3,Owen Benjamin's soyboy song goes for every cou...,0,Owen Benjamin s soyboy song goes for every cou...
4,"> ""y'all hear sumn?"" by all means I live in a...",0,y all hear sumn by all means I live in a small...
...,...,...,...
54595,fucking retard,1,fucking retard
54596,Twitter BANNED me and reported me to the FBI f...,0,Twitter BANNED me and reported me to the FBI f...
54597,"I was routinely suspended for saying fuck, and...",1,I was routinely suspended for saying fuck and ...
54598,They got me for dropping the cunt bomb... a lot.,1,They got me for dropping the cunt bomb a lot


In [61]:
def clean(df):
    """Swap the raw text for the cleaned text, modifying ``df`` in place.

    The ``text`` column is dropped and the ``clean`` column is renamed to
    ``text``. Nothing is returned.
    """
    df.drop(columns=['text'], inplace=True)
    df.rename(columns={'clean': 'text'}, inplace=True)

In [62]:
# Replace the raw "text" column with the cleaned one in every dataset.
for dataset_name in ("davidson", "hateval", "ethos", "jigsaw", "qian"):
    clean(datasets[dataset_name])

In [64]:
# Write each processed dataset to "<name>_p.csv" without the index column.
for dataset_name in ("jigsaw", "hateval", "davidson", "ethos", "qian"):
    datasets[dataset_name].to_csv(
        f"datasets/model_training/{dataset_name}_p.csv",
        header=True,
        index=False,
    )

In [45]:
def resamp(df, random_state=42):
    """Oversample ``df`` on its ``class`` column with ``RandomOverSampler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``class`` column. It may hold either the raw ``text``
        plus a ``clean`` column (straight out of ``process_data``) or only a
        ``text`` column that already contains the cleaned text (after
        ``clean`` has run).
    random_state : int, default 42
        Seed passed to ``RandomOverSampler``.

    Returns
    -------
    pandas.DataFrame
        The resampled frame with ``class`` as the last column. When a
        ``clean`` column is present it replaces ``text``; otherwise ``text``
        is kept as is.
    """
    features = df.drop(columns='class')
    target = df['class']

    oversampler = RandomOverSampler(random_state=random_state)
    resampled, target_resampled = oversampler.fit_resample(features, target)
    resampled["class"] = target_resampled

    # Only swap columns when a separate cleaned column exists. Dropping
    # 'text' unconditionally throws the text away when the notebook is run
    # top to bottom, because clean() has already renamed 'clean' to 'text'.
    if 'clean' in resampled.columns:
        resampled = resampled.drop(columns='text', errors='ignore')
        resampled = resampled.rename(columns={'clean': 'text'})
    return resampled

In [46]:
# Oversample every dataset; evaluation order matches the name order below.
jigsaw_r, hateval_r, davidson_r, ethos_r, qian_r = (
    resamp(datasets[dataset_name])
    for dataset_name in ("jigsaw", "hateval", "davidson", "ethos", "qian")
)

In [47]:
# Write each resampled dataset to "<name>_r.csv" without the index column.
for dataset_name, resampled_frame in (
    ("jigsaw", jigsaw_r),
    ("hateval", hateval_r),
    ("davidson", davidson_r),
    ("ethos", ethos_r),
    ("qian", qian_r),
):
    resampled_frame.to_csv(
        f"datasets/model_training/{dataset_name}_r.csv",
        header=True,
        index=False,
    )

In [48]:
# Size of the largest resampled set, then its ratio to each smaller set.
jigsaw_rows = len(jigsaw_r)
print(jigsaw_rows)
for smaller_frame in (hateval_r, davidson_r, ethos_r, qian_r):
    print(jigsaw_rows / len(smaller_frame))

316332
30.31742380678551
6.772834325354344
189.42035928143713
4.552718689732592


In [49]:
def _repeat_to_match(frame, target_rows):
    """Concatenate whole copies of ``frame`` without exceeding ``target_rows``.

    The number of copies is ``target_rows // len(frame)`` (at least one), so
    the factor follows the data instead of being a hand-copied constant.
    An empty frame is returned as an (empty) copy.
    """
    if len(frame) == 0:
        return frame.copy()
    copies = max(1, target_rows // len(frame))
    return pd.concat([frame] * copies, ignore_index=True)


# Repeat each smaller resampled set so its row count approaches jigsaw_r's.
stacked_hateval = _repeat_to_match(hateval_r, len(jigsaw_r))
stacked_davidson = _repeat_to_match(davidson_r, len(jigsaw_r))
stacked_ethos = _repeat_to_match(ethos_r, len(jigsaw_r))
stacked_qian = _repeat_to_match(qian_r, len(jigsaw_r))
stacked_qian

Unnamed: 0,text,class
0,A subsection of retarded Hungarians Ohh boy br...,1
1,Hiii Just got off work 444 is mainly the typa ...,0
2,wow i guess soyboys are the same in every country,0
3,Owen Benjamin s soyboy song goes for every cou...,0
4,y all hear sumn by all means I live in a small...,0
...,...,...
277923,Antifa The Far Left Black Bloc Organization Gr...,1
277924,It was done to a white kid Trust me they won t...,1
277925,This is the List of Moral Alignment Chaotic Go...,1
277926,Universal truth jew speak has to be the most r...,1


In [50]:
# Combine the repeated sets and the jigsaw set into one frame with a fresh index.
stacked_df = pd.concat(
    [
        stacked_hateval,
        jigsaw_r,
        stacked_davidson,
        stacked_ethos,
        stacked_qian,
    ],
    ignore_index=True,
)

In [51]:
# Display the combined frame as this cell's output.
stacked_df

Unnamed: 0,text,class
0,Hurray saving us in so many ways,1
1,Why would young fighting age men be the vast m...,1
2,Illegals Dump their Kids at the border like Ro...,1
3,NY Times Nearly All White States Pose an Array...,0
4,Orban in Brussels European leaders are ignorin...,0
...,...,...
1503141,Antifa The Far Left Black Bloc Organization Gr...,1
1503142,It was done to a white kid Trust me they won t...,1
1503143,This is the List of Moral Alignment Chaotic Go...,1
1503144,Universal truth jew speak has to be the most r...,1


In [52]:
# Write the combined training set to disk without the index column.
stacked_df.to_csv(
    "datasets/model_training/combined.csv",
    header=True,
    index=False,
)