# Dataset Collation

This notebook assembles the Latent Hatred dataset into a form suitable for analysis. We preprocess the data and create stratified samples of it.

In [None]:
!pip install fuzzywuzzy
!pip install python-Levenshtein
import pandas as pd
import re
import json
from fuzzywuzzy import fuzz



In [None]:
# Stage 1 keeps labels and post text in two separate TSVs. They are stitched
# together column-wise, so rows are paired by index.
# NOTE(review): assumes both files list the posts in the same order - confirm.
df1 = pd.read_csv('implicit_hate_v1_stg1.tsv', sep='\t')
df2 = pd.read_csv('implicit_hate_v1_stg1_posts.tsv', sep='\t')
result_df = pd.concat([df1[['ID', 'class']], df2['post']], axis=1)

# Stage 2 and stage 3 annotation files
df3 = pd.read_csv('implicit_hate_v1_stg2.tsv', sep='\t')
df4 = pd.read_csv('implicit_hate_v1_stg3.tsv', sep='\t')

# Left join: every stage-1 row is kept even when df3 has no row for its ID
result = result_df.merge(df3, how='left', on='ID')

# Collapse df4 to one row per ID, gathering that ID's 'target' values into a list
df4_grouped = df4.groupby('ID')['target'].apply(list).reset_index()

# Attach the per-ID target lists, then discard the extra implicit class column
df_merged = (
    result
    .merge(df4_grouped, how='left', on='ID')
    .drop(columns=['extra_implicit_class'])
)

In [None]:
# Preview the first rows of the merged frame
df_merged.head()

Unnamed: 0,ID,class,post,implicit_class,target
0,399886440588247041,implicit_hate,""" : jewish harvard professor noel ignatiev w...",white_grievance,"[Jews, Jewish people, Jewish people., Jews, Je..."
1,929901925100937216,not_hate,b.higher education is a part of european cult...,,
2,728678509497954304,not_hate,"has a problem with "" the whites "" "" and "" "" ...",,
3,625688620444180481,not_hate,is yasir qadhi a hate preacher for calling ch...,,
4,441089979322597376,not_hate,"rt "" : how three million germans mass murder...",,


In [None]:
def preprocess(text):
    """Normalise a string so that superficially different spellings compare equal.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is deleted (letters, digits and underscores survive), and
    each run of whitespace is collapsed to a single space with nothing leading
    or trailing.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    # split() with no argument drops leading/trailing whitespace and treats any
    # whitespace run as one separator
    return ' '.join(without_punctuation.split())

def fuzzy_deduplicate(items, threshold=80):
    """Collapse near-duplicate entries of a list, keeping first occurrences.

    Args:
        items: a list of values to deduplicate. Anything that is not a list
            (e.g. NaN) is handed back untouched.
        threshold: minimum ``fuzz.ratio`` score at which an entry counts as a
            duplicate of one that was already kept.

    Returns:
        A new list of the preprocessed, stringified, non-missing entries in
        their original order, omitting any entry that scores at or above
        ``threshold`` against an earlier kept entry; or ``items`` itself when
        it is not a list.
    """
    if not isinstance(items, list):
        return items

    # Missing entries are dropped before stringifying and normalising the rest
    cleaned = [preprocess(str(entry)) for entry in items if pd.notna(entry)]

    kept = []
    for candidate in cleaned:
        for existing in kept:
            if fuzz.ratio(candidate, existing) >= threshold:
                # Close enough to something already kept: skip this candidate
                break
        else:
            kept.append(candidate)

    return kept

# Work on a copy so df_merged keeps the raw text and target lists
processed_df = df_merged.copy()

# Normalise the post text
processed_df['post'] = processed_df['post'].apply(preprocess)

# Deduplicated targets: a float NaN (no targets for this row) becomes an empty
# list, anything else goes through fuzzy_deduplicate
processed_df['target_new'] = processed_df['target'].apply(
    lambda targets: fuzzy_deduplicate(targets, 80)
    if not isinstance(targets, float) or not pd.isna(targets)
    else []
)

# Two-way label: both hate subclasses map to 'hate', everything else to 'not_hate'
processed_df['binary_class'] = (
    processed_df['class']
    .isin(['explicit_hate', 'implicit_hate'])
    .map({True: 'hate', False: 'not_hate'})
)

processed_df.head()

Unnamed: 0,ID,class,post,implicit_class,target,target_new,binary_class
0,399886440588247041,implicit_hate,jewish harvard professor noel ignatiev wants t...,white_grievance,"[Jews, Jewish people, Jewish people., Jews, Je...","[jews, jewish people]",hate
1,929901925100937216,not_hate,bhigher education is a part of european cultur...,,,[],not_hate
2,728678509497954304,not_hate,has a problem with the whites and the christia...,,,[],not_hate
3,625688620444180481,not_hate,is yasir qadhi a hate preacher for calling chr...,,,[],not_hate
4,441089979322597376,not_hate,rt how three million germans mass murdered aft...,,,[],not_hate


In [None]:
# Row count per binary label, before the profanity filter and sampling
processed_df.value_counts('binary_class')

Unnamed: 0_level_0,count
binary_class,Unnamed: 1_level_1
not_hate,13291
hate,8189


In [None]:
df = processed_df.copy()

# Regex fragments for self-censored profanity (you can add more to this list).
# Raw strings are used so that `\*` reaches the regex engine as an escaped
# literal asterisk without relying on Python's deprecated handling of
# unrecognised string escapes. The runtime values are unchanged.
profanity_list = [
    r'f\*ck', r'f@ck', r'f!ck', r'fu\*k', r'fu@k', r'fu!k', r'fuc\*', r'fuc@', r'fuc!',
    r'b\*tch', r'b!tch', r'b@tch', r'b1tch', r'bi\*ch', r'bi!ch', r'bi@ch',
    r'a\*shole', r'a@shole', r'a!shole', r'a\*s', r'a@ss', r'a!ss',
    r'c\*nt', r'c@nt', r'c!nt',
    r'sh\*t', r'sh!t', r'sh@t', r's\*it', r's!it', r's@it',
    r'd\*ck', r'd!ck', r'd@ck', r'd1ck',
    r'pu\*sy', r'pu@sy', r'pu!sy', r'puss\*', r'puss@', r'puss!',
    r'n\*gger', r'n!gger', r'n@gger', r'n1gger',
    r'n\*gga', r'n!gga', r'n@gga', r'n1gga',
    r'c\*cksucker', r'c@cksucker', r'c!cksucker',
    r'motherf\*cker', r'motherf@cker', r'motherf!cker', r'motherfu\*ker',
    r'c\*ck', r'c@ck', r'c!ck',
    r'wh\*re', r'wh@re', r'wh!re',
    r'sl\*t', r'sl@t', r'sl!t',
    r'fag\*ot', r'fag@ot', r'fag!ot',
    r'dumb\*ss', r'dumb@ss', r'dumb!ss',
    r'b\*stard', r'b@stard', r'b!stard',
    r'tw\*t', r'tw!t', r'tw@t',
    r'pr\*ck', r'pr!ck', r'pr@ck'
]

# Combine the list into a single regular expression pattern
profanity_pattern = '|'.join(profanity_list)

# The masking characters (*, @, !) are punctuation, and preprocess() has already
# removed punctuation from processed_df['post']. Searching that column could only
# ever hit the digit variants (b1tch, d1ck, ...), so the search runs on the
# original text in df_merged instead; df shares its index, so the mask lines up.
raw_posts = df_merged.loc[df.index, 'post']
has_profanity = raw_posts.str.contains(
    profanity_pattern, flags=re.IGNORECASE, regex=True, na=False
)

# Filter out rows containing any profane words
df_clean = df[~has_profanity]

In [None]:
# Preview the first rows that remain after the profanity filter
df_clean.head()

Unnamed: 0,ID,class,post,implicit_class,target,target_new,binary_class
0,399886440588247041,implicit_hate,jewish harvard professor noel ignatiev wants t...,white_grievance,"[Jews, Jewish people, Jewish people., Jews, Je...","[jews, jewish people]",hate
1,929901925100937216,not_hate,bhigher education is a part of european cultur...,,,[],not_hate
2,728678509497954304,not_hate,has a problem with the whites and the christia...,,,[],not_hate
3,625688620444180481,not_hate,is yasir qadhi a hate preacher for calling chr...,,,[],not_hate
4,441089979322597376,not_hate,rt how three million germans mass murdered aft...,,,[],not_hate


In [None]:
n_samples = 500  # rows drawn per hate subclass; not_hate draws 2 * n_samples

# Each stratum is sampled without replacement using the same fixed seed.
# not_hate is selected on the binary label, the two hate strata on the
# original fine-grained 'class' column.
not_hate_df = (
    df_clean
    .loc[df_clean['binary_class'] == 'not_hate']
    .sample(n=2 * n_samples, random_state=42)
)
implicit_hate_df = (
    df_clean
    .loc[df_clean['class'] == 'implicit_hate']
    .sample(n=n_samples, random_state=42)
)
explicit_hate_df = (
    df_clean
    .loc[df_clean['class'] == 'explicit_hate']
    .sample(n=n_samples, random_state=42)
)

# Stack the strata (not_hate, then implicit, then explicit) and renumber rows from 0
full_df = pd.concat(
    [not_hate_df, implicit_hate_df, explicit_hate_df],
    ignore_index=True,
)

In [None]:
# Persist the sample as a tab-separated file whose name records n_samples.
# NOTE(review): `index` is left at its default, so the row index is presumably
# written as an unnamed first column and will reappear as an extra column when
# the file is read back - confirm whether index=False is wanted.
full_df.to_csv(f'processed_full_{n_samples}.tsv', sep='\t')

In [None]:
# full_df = pd.read_csv(f'processed_full_{n_samples}.tsv', sep='\t')

In [None]:
# Preview the first rows of the combined sample
full_df.head()

Unnamed: 0,ID,class,post,implicit_class,target,target_new,binary_class
0,893818124356726784,not_hate,the new centry foundatin is a white supremacis...,,,[],not_hate
1,810629155188539392,not_hate,i dont want to be controversial but im white 12,,,[],not_hate
2,5393720593,not_hate,afire stands for americans for immigration ref...,,,[],not_hate
3,915685988483108864,not_hate,its 75 white which is very high when the 22 po...,,,[],not_hate
4,752622004885389312,not_hate,obama claims he doesnt know the motive of a se...,,,[],not_hate


In [None]:
# Row count per binary label in the combined sample
full_df.value_counts('binary_class')

Unnamed: 0_level_0,count
binary_class,Unnamed: 1_level_1
hate,1000
not_hate,1000
