In [40]:
import numpy as np
import pandas as pd
import re
import emoji

df = pd.read_csv('./datasets/english/davidson.csv')

In [41]:
df = df[['tweet', 'class']]

df = df.rename(columns={'tweet': 'text', 'class': 'label'})


label_map = {0: 'hate', 1: 'offensive', 2: 'normal'}
df['label'] = df['label'].map(label_map)


def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'@\w+', '', text) #Removes @mentions, like @elonmusk
    text = re.sub(r'http\S+|www\S+', '', text) #Removes URLs, e.g. https://example.com or www.google.com
    text = re.sub(r'[^a-z\s]', '', text) #Removes everything except alphabets and spaces
    text = re.sub(r'\s+', ' ', text).strip() #Collapses multiple spaces into one
    text = emoji.replace_emoji(text, replace='') #Removes emojis explicitly, using the emoji library
    return text


df['clean_text'] = df['text'].apply(clean_text)


df = df[df['clean_text'].str.strip() != '']


df.to_csv("./datasets/english_preprocessed/davidson_preprocessed.csv", index=False)

df.head()

Unnamed: 0,text,label,clean_text
0,!!! RT @mayasolovely: As a woman you shouldn't...,normal,rt as a woman you shouldnt complain about clea...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,offensive,rt boy dats coldtyga dwn bad for cuffin dat ho...
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,offensive,rt dawg rt you ever fuck a bitch and she start...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,offensive,rt she look like a tranny
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,offensive,rt the shit you hear about me might be true or...


In [42]:
print(df.shape)
print(df['label'].value_counts())

(24781, 3)
label
offensive    19189
normal        4162
hate          1430
Name: count, dtype: int64


In [None]:
import json
import pandas as pd
import re, emoji
from collections import Counter


with open("./datasets/english/hatexplaindataset.json", 'r') as f:
    data = json.load(f)

records = []
for post_id, post_data in data.items():
    tokens = post_data.get('post_tokens', [])
    text = " ".join(tokens)


    labels = [ann.get('label') for ann in post_data.get('annotators', []) if 'label' in ann]
    label = Counter(labels).most_common(1)[0][0] if labels else None
    records.append({
        "text": text,
        "label": label
    })

df = pd.DataFrame(records)
print("before dropping null")
print(df.shape)

df = df.dropna(subset=['label'])
print("after dropping null")
print(df.shape)

label_map = {
    'hatespeech': 'hate',
    'offensive': 'offensive',
    'normal': 'normal'
}
df['label'] = df['label'].map(label_map)


def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = emoji.replace_emoji(text, replace='')
    return text

df['clean_text'] = df['text'].apply(clean_text)
df = df[df['clean_text'].str.strip() != '']


df = df[['text', 'label', 'clean_text']]

before dropping null
(20148, 2)
after dropping null
(20148, 2)


In [44]:
df.to_csv("./datasets/english_preprocessed/english_hatexplain_preprocessed.csv", index=False)
print(df.head())

                                                text   label  \
0  i dont think im getting my baby them white 9 h...  normal   
1  we cannot continue calling ourselves feminists...  normal   
2                      nawt yall niggers ignoring me  normal   
3  <user> i am bit confused coz chinese ppl can n...    hate   
4  this bitch in whataburger eating a burger with...    hate   

                                          clean_text  
0  i dont think im getting my baby them white he ...  
1  we cannot continue calling ourselves feminists...  
2                      nawt yall niggers ignoring me  
3  user i am bit confused coz chinese ppl can not...  
4  this bitch in whataburger eating a burger with...  


In [45]:
print(df.shape)
print(df['label'].value_counts())

(20148, 3)
label
normal       8153
hate         6234
offensive    5761
Name: count, dtype: int64


In [46]:
import pandas as pd
import re, emoji


df = pd.read_csv("./datasets/english/Dynamically_Generated_Hate_Dataset_v0.2.3.csv")
print(df.shape)
print(df['label'].value_counts())


df = df[['text', 'label']].copy()


def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['text'].apply(clean_text)


label_map = {'hate': 'hate', 'nothate': 'normal'}
df['label'] = df['label'].str.lower().map(label_map)


df = df.dropna(subset=['label', 'clean_text'])
df = df[df['clean_text'].str.strip() != '']


df = df[['text', 'label', 'clean_text']]


df.to_csv("./datasets/english_preprocessed/Preprocessed_Dynamically_Generated_Hate_Dataset_v0.csv", index=False)

print(df.head())


(41144, 13)
label
hate       22175
nothate    18969
Name: count, dtype: int64
                                                text label  \
0                                dalits are lowlives  hate   
1             gay people are a burden to our society  hate   
2                              Arabs are not welcome  hate   
3  I'm not saying we should actually eliminate he...  hate   
4                       bananas are for black people  hate   

                                          clean_text  
0                                dalits are lowlives  
1             gay people are a burden to our society  
2                              arabs are not welcome  
3  im not saying we should actually eliminate hee...  
4                       bananas are for black people  


In [None]:

df_davidson = pd.read_csv("./datasets/english_preprocessed/davidson_preprocessed.csv")
df_hatexplain = pd.read_csv("./datasets/english_preprocessed/english_hatexplain_preprocessed.csv")
df_dynamic = pd.read_csv("./datasets/english_preprocessed/Preprocessed_Dynamically_Generated_Hate_Dataset_v0.csv")

print("Davidson dataset shape:", df_davidson.shape)
print("HateXplain dataset shape:", df_hatexplain.shape)
print("Dynamic dataset shape:", df_dynamic.shape)

all_english_data = pd.concat([df_davidson, df_hatexplain, df_dynamic], ignore_index=True)

print("\n" + "="*50)
print("Merged Dataset Info:")
print("="*50)
print(f"Total shape: {all_english_data.shape}")
print(f"\nLabel distribution:")
print(all_english_data['label'].value_counts())
print(f"\nLabel percentages:")
print(all_english_data['label'].value_counts(normalize=True) * 100)

all_english_data.to_csv("./datasets/english_preprocessed/all_english_data.csv", index=False)
print(f"\n✅ Merged dataset saved as 'all_english_data.csv'")
print(f"\nFirst few rows:")
print(all_english_data.head())


Davidson dataset shape: (24781, 3)
HateXplain dataset shape: (20148, 3)
Dynamic dataset shape: (41144, 3)

Merged Dataset Info:
Total shape: (86073, 3)

Label distribution:
label
normal       31284
hate         29839
offensive    24950
Name: count, dtype: int64

Label percentages:
label
normal       36.345892
hate         34.667085
offensive    28.987023
Name: proportion, dtype: float64

✅ Merged dataset saved as 'all_english_data.csv'

First few rows:
                                                text      label  \
0  !!! RT @mayasolovely: As a woman you shouldn't...     normal   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  offensive   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  offensive   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  offensive   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  offensive   

                                          clean_text  
0  rt as a woman you shouldnt complain about clea...  
1  rt boy dats coldtyga dwn ba