<a href="https://colab.research.google.com/github/rowpep/cross-domain-hatespeech/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DATA PREPROCESSING


---

This notebook prepares the datasets in the following steps:

*   This code downloads datasets from the internet and formats them into Pandas DataFrames with columns: Class - Text - Source - Post_ID
*   Then the classes are made binary, where hate speech = 1 and non-hate = 0
*   Few-shot data sets are created.
*   Raw datasets are saved for BERT input
*   All texts are tokenised with spacy and then saved.


In [None]:
#LIBRARIES

import pandas as pd
import urllib
from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split
import json
import spacy
nlp = spacy.load("en_core_web_sm")
from google.colab import drive



Downloading datasets and standardising the formats



In [None]:
#DAVIDSON ET AL

# Labelled CSV from the Davidson et al. repository.
davidson_et_al_url = "https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/refs/heads/master/data/labeled_data.csv"

# The first CSV column becomes the row index; it is reused below as the post id.
raw_davidson_et_al_df = pd.read_csv(davidson_et_al_url, index_col=0)

# Drop the columns that are not used later, tag the source, expose the index as a
# string Post_ID and rename to the shared Class / Text / Source / Post_ID schema.
davidson_et_al = (
    raw_davidson_et_al_df
    .drop(columns=['count', 'hate_speech', 'offensive_language', 'neither'])
    .assign(
        Source='Davidson et al',
        Post_ID=lambda frame: frame.index.astype(str),
    )
    .rename(columns={"tweet": "Text", "class": "Class"})
)

# Nested-dict view of the standardised frame: {column -> {index -> value}}.
davidson_dict = davidson_et_al.to_dict(orient='dict')

In [None]:
#HATEXPLAIN

hatexplain_url = "https://raw.githubusercontent.com/hate-alert/HateXplain/refs/heads/master/Data/dataset.json"

# After the transpose each row is one post (post_id / annotators / post_tokens ...);
# the 'rationales' column is not used in this notebook.
raw_hatexplain_df = pd.read_json(hatexplain_url)
hatexplain_df = raw_hatexplain_df.transpose().drop(columns=['rationales'])

# Keep only the rows whose post_id ends with "twitter".
# .copy() is applied to the *filtered frame* (previously it was applied to the
# boolean mask, which had no effect), so the column assignments that follow act
# on an independent DataFrame rather than on a slice of the transposed frame.
hatexplain_df = hatexplain_df[hatexplain_df["post_id"].str.endswith("twitter")].copy()

def majority_label(annotations):
  """Return the label that occurs most often among a post's annotations.

  Args:
    annotations: iterable of mappings, each with a 'label' entry.

  Returns:
    The most frequent 'label' value. When several labels share the highest
    count, the one that appears first in `annotations` is returned.

  Raises:
    IndexError: if `annotations` is empty.
  """
  tally = Counter()
  for annotation in annotations:
    tally[annotation['label']] += 1
  top_label, _votes = tally.most_common(1)[0]
  return top_label

# In one chain: derive the majority label from the annotators, tag the source,
# join each token list into a single space-separated string, then drop the raw
# annotations and rename/reorder to the shared Class / Text / Source / Post_ID schema.
hatexplain_df = (
    hatexplain_df
    .assign(
        majority_label=lambda frame: frame['annotators'].apply(majority_label),
        Source='HateXplain',
        post_tokens=lambda frame: frame['post_tokens'].apply(' '.join),
    )
    .drop(columns=['annotators'])
    .rename(columns={"majority_label": "Class", "post_tokens": "Text", "post_id": "Post_ID"})
    .loc[:, ['Class', 'Text', 'Source', 'Post_ID']]
)

In [None]:
#WIKIPEDIA DETOX
#code taken from: https://github.com/ewulczyn/wiki-detox/blob/master/src/figshare/Wikipedia%20Talk%20Data%20-%20Getting%20Started.ipynb

# Download URLs for the two Wikipedia Detox files that are fetched below and
# saved locally as attack_annotated_comments.tsv and attack_annotations.tsv.
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634'
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637'


def download_file(url, fname):
    """Download `url` and store it at the local path `fname`.

    Args:
        url: address of the resource to fetch.
        fname: local file name the downloaded content is written to.
    """
    # The notebook only does `import urllib`, which does not by itself guarantee
    # that the `urllib.request` submodule is loaded. Import it explicitly so this
    # helper does not depend on another library having imported it first.
    import urllib.request

    urllib.request.urlretrieve(url, fname)


# Fetch both TSV files into the working directory.
download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

# Comments use their first column as index; annotations keep a default index.
comments = pd.read_csv('attack_annotated_comments.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('attack_annotations.tsv', sep='\t')

# A comment is labelled an attack when the mean of its annotators' `attack`
# values is above 0.5 (i.e. the majority of annotators marked it as an attack).
labels = annotations.groupby('rev_id')['attack'].mean().gt(0.5)

# Join labels and comments (aligned on the index).
comments['attack'] = labels

# Replace the newline and tab placeholder tokens with a space.
comments['comment'] = comments['comment'].apply(
    lambda text: text.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " ")
)


raw_wikipedia_df = comments

# Drop the metadata columns, tag the source, expose the index as a string
# Post_ID and rename/reorder to the shared Class / Text / Source / Post_ID schema.
wikipedia_df = (
    raw_wikipedia_df
    .drop(columns=['year', 'logged_in', 'sample', 'ns', 'split'])
    .assign(
        Source='Wikipedia',
        Post_ID=lambda frame: frame.index.astype(str),
    )
    .rename(columns={"attack": "Class", "comment": "Text"})
    .loc[:, ['Class', 'Text', 'Source', 'Post_ID']]
)

In [None]:
#REDDIT SLUR CORPUS

reddit_url = 'https://raw.githubusercontent.com/networkdynamics/slur-corpus/refs/heads/main/kurrek.2020.slur-corpus.csv'

raw_reddit_df = pd.read_csv(reddit_url)

# Drop the columns that are not used later, tag the source and rename/reorder to
# the shared Class / Text / Source / Post_ID schema. The final step keeps only
# the rows where the text is a string, as some of them are None.
reddit_df = (
    raw_reddit_df
    .drop(columns=['link_id', 'parent_id', 'score', 'subreddit', 'slur', 'disagreement', 'author'])
    .assign(Source='Reddit')
    .rename(columns={"gold_label": "Class", "body": "Text", "id": "Post_ID"})
    .loc[:, ['Class', 'Text', 'Source', 'Post_ID']]
    .loc[lambda frame: frame['Text'].apply(lambda value: isinstance(value, str))]
)

In [None]:
#GAB HATE CORPUS

!pip install osfclient
!osf -p edua3 list
!osf -p edua3 fetch osfstorage/Data/GabHateCorpus_annotations.tsv


In [None]:
# Read the tab-separated annotation file fetched in the previous cell.
raw_gab_df = pd.read_csv("GabHateCorpus_annotations.tsv", sep="\t")

# Remove the label columns that are not used later in this notebook.
gab_df = raw_gab_df.drop(
    ['HD', 'CV', 'VO', 'REL', 'RAE', 'SXO', 'GEN', 'IDL', 'NAT', 'POL', 'MPH', 'EX', 'IM'],
    axis='columns',
)


# Ties are broken toward the positive class (i.e., if there is no clear majority,
# the post is labelled as hate speech).

def majority_vote(hate_labels):
  """Collapse the binary `Hate` annotations of one post into a single 0/1 label.

  Counts how often 1 and 0 occur in `hate_labels` and returns 1 when there are
  at least as many 1s as 0s, otherwise 0. For example:
    1 hate / 2 non-hate -> 0
    2 hate / 1 non-hate -> 1
    1 hate / 1 non-hate -> 1 (tie goes to the hate class)

  Only the values 1 and 0 are counted; if neither occurs the result is also 1,
  because the comparison is then 0 >= 0.

  Args:
    hate_labels: pandas Series with the annotations of a single post.

  Returns:
    1 or 0.
  """
  tally = hate_labels.value_counts()
  hate_votes = tally.get(1, 0)
  non_hate_votes = tally.get(0, 0)
  return 1 if hate_votes >= non_hate_votes else 0


# Give every annotation row the majority label of its post, keep the first row
# per post, drop the per-annotator columns, tag the source and rename/reorder to
# the shared Class / Text / Source / Post_ID schema.
gab_df = (
    gab_df
    .assign(majority_vote=lambda frame: frame.groupby('ID')['Hate'].transform(majority_vote))
    .drop_duplicates(subset='ID')
    .drop(columns=['Annotator', 'Hate'])
    .assign(Source='Gab')
    .rename(columns={"ID": "Post_ID", "majority_vote": "Class"})
    .loc[:, ['Class', 'Text', 'Source', 'Post_ID']]
)

In [None]:
#THE DATASETS

# davidson_et_al
# hatexplain_df
# wikipedia_df
# reddit_df
# gab_df


Making the classes binary. Hate speech = 1 and non-hate = 0


In [None]:
#Davidson et al
#0 = hatespeech, 1 = offensive language, 2 = neither

# New frame with an extra Binary_Class column: only class 0 becomes 1,
# classes 1 and 2 become 0. The original davidson_et_al frame is not modified.
bin_davidson_et_al = davidson_et_al.assign(
    Binary_Class=davidson_et_al['Class'].map({0: 1, 1: 0, 2: 0})
)

In [None]:
#HateXplain

# New frame with an extra Binary_Class column: "hatespeech" becomes 1,
# "normal" and "offensive" become 0. hatexplain_df itself is not modified.
bin_hatexplain_df = hatexplain_df.assign(
    Binary_Class=hatexplain_df['Class'].map({"normal": 0, "offensive": 0, "hatespeech": 1})
)

In [None]:
#Wikipedia

# New frame with an extra Binary_Class column: True becomes 1, False becomes 0.
# wikipedia_df itself is not modified.
bin_wikipedia_df = wikipedia_df.assign(
    Binary_Class=wikipedia_df['Class'].map({False: 0, True: 1})
)

In [None]:
#Reddit

# Rows without a Class are removed first; then Binary_Class is added:
# "DEG" becomes 1, every other listed label becomes 0.
# reddit_df itself is not modified.
bin_reddit_df = (
    reddit_df
    .dropna(subset=['Class'])
    .assign(
        Binary_Class=lambda frame: frame['Class'].map(
            {"DEG": 1, "NDG": 0, "HOM": 0, "APR": 0, "CMP": 0}
        )
    )
)

In [None]:
#Gab

# Class is already 0/1 for Gab (output of majority_vote), so Binary_Class is a
# plain copy of it. gab_df itself is not modified.
bin_gab_df = gab_df.assign(Binary_Class=gab_df['Class'])

In [None]:
#BINARY DATASETS

# bin_davidson_et_al
# bin_hatexplain_df
# bin_wikipedia_df
# bin_reddit_df
# bin_gab_df

Creating lists of texts and binary class labels for each dataset.

In [None]:
# For every binary dataset, pull the Text and Binary_Class columns out as plain
# Python lists (first the texts, then the labels).
davidson_text_list, davidson_class_list = (
    bin_davidson_et_al[column].to_list() for column in ('Text', 'Binary_Class')
)

hatexplain_text_list, hatexplain_class_list = (
    bin_hatexplain_df[column].to_list() for column in ('Text', 'Binary_Class')
)

wikipedia_text_list, wikipedia_class_list = (
    bin_wikipedia_df[column].to_list() for column in ('Text', 'Binary_Class')
)

reddit_text_list, reddit_class_list = (
    bin_reddit_df[column].to_list() for column in ('Text', 'Binary_Class')
)

gab_text_list, gab_class_list = (
    bin_gab_df[column].to_list() for column in ('Text', 'Binary_Class')
)

Creating few-shot samples and new datasets.

In [None]:
#Reddit few shot

# Stratified split of the Reddit data with a fixed seed: the 10% "train" part is
# the few-shot sample (reddit_fewshot_*), the 90% "test" part is kept as fs_reddit_*.
(
    reddit_fewshot_text,
    fs_reddit_text_list,
    reddit_fewshot_label,
    fs_reddit_class_list,
) = train_test_split(
    reddit_text_list,
    reddit_class_list,
    test_size=0.90,
    stratify=reddit_class_list,
    random_state=42,
)

# Davidson + Reddit: few-shot Reddit examples first, then the full Davidson set.
davidson_fs_reddit_text = [*reddit_fewshot_text, *davidson_text_list]
davidson_fs_reddit_labels = [*reddit_fewshot_label, *davidson_class_list]

# HateXplain + Reddit: few-shot Reddit examples first, then the full HateXplain set.
hatexplain_fs_reddit_text = [*reddit_fewshot_text, *hatexplain_text_list]
hatexplain_fs_reddit_labels = [*reddit_fewshot_label, *hatexplain_class_list]

In [None]:
#Gab few shot

# Stratified split of the Gab data with a fixed seed: the 10% "train" part is
# the few-shot sample (gab_fewshot_*), the 90% "test" part is kept as fs_gab_*.
(
    gab_fewshot_text,
    fs_gab_text_list,
    gab_fewshot_label,
    fs_gab_class_list,
) = train_test_split(
    gab_text_list,
    gab_class_list,
    test_size=0.90,
    stratify=gab_class_list,
    random_state=42,
)

# Davidson + Gab: few-shot Gab examples first, then the full Davidson set.
davidson_fs_gab_text = [*gab_fewshot_text, *davidson_text_list]
davidson_fs_gab_labels = [*gab_fewshot_label, *davidson_class_list]

# HateXplain + Gab: few-shot Gab examples first, then the full HateXplain set.
hatexplain_fs_gab_text = [*gab_fewshot_text, *hatexplain_text_list]
hatexplain_fs_gab_labels = [*gab_fewshot_label, *hatexplain_class_list]

In [None]:
#Wikipedia few shot

# Stratified split of the Wikipedia data with a fixed seed: the 10% "train" part
# is the few-shot sample (wikipedia_fewshot_*), the 90% "test" part is kept as
# fs_wikipedia_*.
(
    wikipedia_fewshot_text,
    fs_wikipedia_text_list,
    wikipedia_fewshot_label,
    fs_wikipedia_class_list,
) = train_test_split(
    wikipedia_text_list,
    wikipedia_class_list,
    test_size=0.90,
    stratify=wikipedia_class_list,
    random_state=42,
)

# Davidson + Wikipedia: few-shot Wikipedia examples first, then the full Davidson set.
davidson_fs_wikipedia_text = [*wikipedia_fewshot_text, *davidson_text_list]
davidson_fs_wikipedia_labels = [*wikipedia_fewshot_label, *davidson_class_list]

# HateXplain + Wikipedia: few-shot Wikipedia examples first, then the full HateXplain set.
hatexplain_fs_wikipedia_text = [*wikipedia_fewshot_text, *hatexplain_text_list]
hatexplain_fs_wikipedia_labels = [*wikipedia_fewshot_label, *hatexplain_class_list]

In [None]:
#ALL OF THE DATASETS

# #Zero-shot sets

# davidson_class_list
# davidson_text_list
# hatexplain_class_list
# hatexplain_text_list
# reddit_class_list
# reddit_text_list
# gab_class_list
# gab_text_list
# wikipedia_class_list
# wikipedia_text_list

# #Few shot sets

# #Reddit sets

# fs_reddit_class_list
# fs_reddit_text_list
# hatexplain_fs_reddit_labels
# hatexplain_fs_reddit_text
# davidson_fs_reddit_labels
# davidson_fs_reddit_text

# #Gab sets

# fs_gab_text_list
# fs_gab_class_list
# hatexplain_fs_gab_labels
# hatexplain_fs_gab_text
# davidson_fs_gab_labels
# davidson_fs_gab_text

# #Wikipedia sets

# fs_wikipedia_text_list
# fs_wikipedia_class_list
# hatexplain_fs_wikipedia_labels
# hatexplain_fs_wikipedia_text
# davidson_fs_wikipedia_labels
# davidson_fs_wikipedia_text

Saving the raw datasets for use as BERT inputs.

Save files as .json.



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the following cell writes its JSON
# files under /content/drive/MyDrive/DissData/bert-input/.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Save the raw (untokenised) texts and their labels, because BERT needs raw data.
# Each entry below is written to <bert_input_dir>/bert_<name>.json, in the order listed.

bert_input_dir = '/content/drive/MyDrive/DissData/bert-input'

bert_outputs = {
    # Zero-shot sets
    'davidson_class_list': davidson_class_list,
    'davidson_text_list': davidson_text_list,
    'hatexplain_class_list': hatexplain_class_list,
    'hatexplain_text_list': hatexplain_text_list,
    'reddit_class_list': reddit_class_list,
    'reddit_text_list': reddit_text_list,
    'gab_class_list': gab_class_list,
    'gab_text_list': gab_text_list,
    'wikipedia_class_list': wikipedia_class_list,
    'wikipedia_text_list': wikipedia_text_list,

    # Few shot sets

    # Reddit sets
    'fs_reddit_class_list': fs_reddit_class_list,
    'fs_reddit_text_list': fs_reddit_text_list,
    'hatexplain_fs_reddit_labels': hatexplain_fs_reddit_labels,
    'hatexplain_fs_reddit_text': hatexplain_fs_reddit_text,
    'davidson_fs_reddit_labels': davidson_fs_reddit_labels,
    'davidson_fs_reddit_text': davidson_fs_reddit_text,

    # Gab sets
    'fs_gab_class_list': fs_gab_class_list,
    'fs_gab_text_list': fs_gab_text_list,
    'hatexplain_fs_gab_labels': hatexplain_fs_gab_labels,
    'hatexplain_fs_gab_text': hatexplain_fs_gab_text,
    'davidson_fs_gab_labels': davidson_fs_gab_labels,
    'davidson_fs_gab_text': davidson_fs_gab_text,

    # Wikipedia sets
    'fs_wikipedia_class_list': fs_wikipedia_class_list,
    'fs_wikipedia_text_list': fs_wikipedia_text_list,
    'hatexplain_fs_wikipedia_labels': hatexplain_fs_wikipedia_labels,
    'hatexplain_fs_wikipedia_text': hatexplain_fs_wikipedia_text,
    'davidson_fs_wikipedia_labels': davidson_fs_wikipedia_labels,
    'davidson_fs_wikipedia_text': davidson_fs_wikipedia_text,
}

for name, payload in bert_outputs.items():
  with open(bert_input_dir + '/bert_' + name + '.json', 'w') as f:
    json.dump(payload, f)

spaCy Tokenisation of the datasets.

In [None]:
def cleanwithspacy(text):
  """Tokenise `text` with the notebook-level spaCy pipeline `nlp` and filter the tokens.

  Args:
    text: the string to tokenise.

  Returns:
    List with the text of every token that is not a stop word, not
    punctuation, not URL-like and not empty after stripping whitespace,
    in document order.
  """
  kept = []
  for tok in nlp(text):
    if tok.is_stop or tok.is_punct or tok.like_url:
      continue
    if tok.text.strip():
      kept.append(tok.text)
  return kept

In [None]:
# Tokenise every distinct collection of texts once.
#
# Previously each combined set (e.g. davidson_fs_reddit_text) was tokenised from
# scratch, which ran spaCy over the full Davidson and HateXplain data four times
# each and over every few-shot sample twice. The combined sets were built as
# `<few-shot sample> + <source dataset>`, so their tokenised versions are simply
# the same concatenation of the already-tokenised parts. The resulting lists have
# the same contents as before.

# Full (zero-shot) datasets
davidson_text_list_spcy = [cleanwithspacy(text) for text in davidson_text_list]

hatexplain_text_list_spcy = [cleanwithspacy(text) for text in hatexplain_text_list]

reddit_text_list_spcy = [cleanwithspacy(text) for text in reddit_text_list]

gab_text_list_spcy = [cleanwithspacy(text) for text in gab_text_list]

wikipedia_text_list_spcy = [cleanwithspacy(text) for text in wikipedia_text_list]

# Few-shot samples (10% part of each split) and the remaining 90% parts
reddit_fewshot_text_spcy = [cleanwithspacy(text) for text in reddit_fewshot_text]

fs_reddit_text_list_spcy = [cleanwithspacy(text) for text in fs_reddit_text_list]

gab_fewshot_text_spcy = [cleanwithspacy(text) for text in gab_fewshot_text]

fs_gab_text_list_spcy = [cleanwithspacy(text) for text in fs_gab_text_list]

wikipedia_fewshot_text_spcy = [cleanwithspacy(text) for text in wikipedia_fewshot_text]

fs_wikipedia_text_list_spcy = [cleanwithspacy(text) for text in fs_wikipedia_text_list]

# Combined sets: few-shot sample first, then the source dataset (same order as
# the raw concatenations in the few-shot cells).
hatexplain_fs_reddit_text_spcy = reddit_fewshot_text_spcy + hatexplain_text_list_spcy

davidson_fs_reddit_text_spcy = reddit_fewshot_text_spcy + davidson_text_list_spcy

hatexplain_fs_gab_text_spcy = gab_fewshot_text_spcy + hatexplain_text_list_spcy

davidson_fs_gab_text_spcy = gab_fewshot_text_spcy + davidson_text_list_spcy

hatexplain_fs_wikipedia_text_spcy = wikipedia_fewshot_text_spcy + hatexplain_text_list_spcy

davidson_fs_wikipedia_text_spcy = wikipedia_fewshot_text_spcy + davidson_text_list_spcy

In [None]:
#List of all the datasets, now tokenised with spacy

# #Zero-shot sets

# davidson_class_list
# davidson_text_list_spcy
# hatexplain_class_list
# hatexplain_text_list_spcy
# reddit_class_list
# reddit_text_list_spcy
# gab_class_list
# gab_text_list_spcy
# wikipedia_class_list
# wikipedia_text_list_spcy

# #Few shot sets

# #Reddit sets

# fs_reddit_class_list
# fs_reddit_text_list_spcy
# hatexplain_fs_reddit_labels
# hatexplain_fs_reddit_text_spcy
# davidson_fs_reddit_labels
# davidson_fs_reddit_text_spcy

# #Gab sets

# fs_gab_text_list_spcy
# fs_gab_class_list
# hatexplain_fs_gab_labels
# hatexplain_fs_gab_text_spcy
# davidson_fs_gab_labels
# davidson_fs_gab_text_spcy

# #Wikipedia sets

# fs_wikipedia_text_list_spcy
# fs_wikipedia_class_list
# hatexplain_fs_wikipedia_labels
# hatexplain_fs_wikipedia_text_spcy
# davidson_fs_wikipedia_labels
# davidson_fs_wikipedia_text_spcy

Save the new datasets that will be used for all models except BERT.

Save them as .json files.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the following cell writes its JSON
# files under /content/drive/MyDrive/DissData/.
# NOTE(review): the same mount call already appears earlier in this notebook —
# presumably repeated so this section can be run on its own; confirm.
drive.mount('/content/drive')

In [None]:
# Save the spaCy-tokenised texts and their labels (used by all models except BERT).
#
# Each entry maps an output file name to the object written to it. The few-shot
# text files now receive the tokenised `_spcy` lists: previously they were
# written from the raw string lists, so they duplicated the BERT inputs and the
# tokenised few-shot lists computed above were never saved. File names are left
# unchanged so that code loading these files keeps working.

spacy_output_dir = '/content/drive/MyDrive/DissData'

spacy_outputs = {
    # zero shot sets
    'davidson_class_list.json': davidson_class_list,
    'davidson_text_list_spcy.json': davidson_text_list_spcy,
    'hatexplain_class_list.json': hatexplain_class_list,
    'hatexplain_text_list_spcy.json': hatexplain_text_list_spcy,
    'reddit_class_list.json': reddit_class_list,
    'reddit_text_list_spcy.json': reddit_text_list_spcy,
    'gab_class_list.json': gab_class_list,
    'gab_text_list_spcy.json': gab_text_list_spcy,
    'wikipedia_class_list.json': wikipedia_class_list,
    'wikipedia_text_list_spcy.json': wikipedia_text_list_spcy,

    # Few shot sets

    # Reddit sets
    'fs_reddit_class_list.json': fs_reddit_class_list,
    'fs_reddit_text_list.json': fs_reddit_text_list_spcy,
    'hatexplain_fs_reddit_labels.json': hatexplain_fs_reddit_labels,
    'hatexplain_fs_reddit_text.json': hatexplain_fs_reddit_text_spcy,
    'davidson_fs_reddit_labels.json': davidson_fs_reddit_labels,
    'davidson_fs_reddit_text.json': davidson_fs_reddit_text_spcy,

    # Gab sets
    'fs_gab_class_list.json': fs_gab_class_list,
    'fs_gab_text_list.json': fs_gab_text_list_spcy,
    'hatexplain_fs_gab_labels.json': hatexplain_fs_gab_labels,
    'hatexplain_fs_gab_text.json': hatexplain_fs_gab_text_spcy,
    'davidson_fs_gab_labels.json': davidson_fs_gab_labels,
    'davidson_fs_gab_text.json': davidson_fs_gab_text_spcy,

    # Wikipedia sets
    'fs_wikipedia_class_list.json': fs_wikipedia_class_list,
    'fs_wikipedia_text_list.json': fs_wikipedia_text_list_spcy,
    'hatexplain_fs_wikipedia_labels.json': hatexplain_fs_wikipedia_labels,
    'hatexplain_fs_wikipedia_text.json': hatexplain_fs_wikipedia_text_spcy,
    'davidson_fs_wikipedia_labels.json': davidson_fs_wikipedia_labels,
    'davidson_fs_wikipedia_text.json': davidson_fs_wikipedia_text_spcy,
}

for file_name, payload in spacy_outputs.items():
  with open(spacy_output_dir + '/' + file_name, 'w') as f:
    json.dump(payload, f)
