# Hatefulness and Toxicity Analysis

This notebook builds a pipeline for toxicity and hatefulness analysis, specifically for Reddit data. As Reddit data tends to be more toxic and hateful than content on other social media platforms, this notebook focuses on analysing the existing Singaporean subreddit data provided in class.

The aim of this notebook is to provide a format that users can follow to reproduce our results and to apply our methodology of analysis.

## SETUP

In [None]:
# Standard Library Imports
import ast
import datetime
import html
import io
import json
import math
import os
import random
import re
import string
import time
from collections import Counter

import base64
import dash
import dash_bootstrap_components as dbc
from dash import Dash, dcc, html, dash_table, Input, Output, State, callback
from dash.dash_table.Format import Group
from dash.dependencies import Input, Output, State
from dash_bootstrap_templates import load_figure_template

# Gensim Imports
import gensim
import gensim.corpora as corpora
import gensim.utils as gu
import ldamallet

# Hugging Face & Transformer Imports
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModel

# Matplotlib Imports
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import FuncFormatter, MaxNLocator

# NLTK Imports
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Plotly Imports
import plotly.express as px
import plotly.graph_objects as go
import plotly.graph_objs as go  # Duplicate alias but keeping it here if both are required
from plotly.subplots import make_subplots

# Pandas and Numpy Imports
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

# Scipy and Statsmodels Imports
from scipy.stats import f_oneway  # ANOVA test
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Scikit-Learn Imports
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve

# Visualization Imports
import seaborn as sns
from wordcloud import WordCloud

# Text Analysis Imports
import emoji
from langdetect import detect
from textblob import TextBlob

# Torch Imports (for models on local system or device)
import torch

# ONNX Runtime (for deploying models using ONNX)
import onnxruntime as rt

# NLTK Downloads
# Each call fetches one NLTK resource at runtime; presumably these back the
# word_tokenize / stopwords / WordNetLemmatizer imports above -- TODO confirm.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


## Data Preprocessing:

In [None]:
# Load the two raw Reddit thread dumps (the CSV files must sit in a ./data subfolder).
# Both files are read with identical parser settings.
csv_options = {'lineterminator': '\n', 'encoding': 'utf8'}
data2020_df = pd.read_csv('./data/Reddit-Threads_2020-2021.csv', **csv_options)
data2022_df = pd.read_csv('./data/Reddit-Threads_2022-2023.csv', **csv_options)

# Row counts noted in the original comments: 2663782 (2020-2021), then 1840541 (2022-2023)
for raw_df in (data2020_df, data2022_df):
  print(len(raw_df))

In [None]:
def clean_data(df):
  """Return a cleaned frame whose ``text`` column is ready for analysis.

  Steps, in order:
    1. Drop rows whose ``text`` is null.
    2. Strip emoji out of ``text`` (the row itself is kept).
    3. Strip ASCII punctuation (``string.punctuation``) out of ``text``.
    4. Drop rows whose remaining ``text`` still contains a non-ASCII character.
    5. Lower-case ``text`` and renumber the index from 0.

  Args:
    df: DataFrame with a ``text`` column holding strings (nulls allowed).

  Returns:
    A new DataFrame containing the surviving rows with a fresh 0..n-1 index.
    Text made up solely of emoji and/or punctuation is kept as an empty string.
    An input with no non-null text yields an empty frame with the same columns.

  Side effects:
    Sets the global pandas option ``mode.copy_on_write`` to True; it stays
    enabled after the call.
  """
  # Global switch kept from the original implementation -- presumably so the
  # column assignments below on a filtered frame do not raise
  # SettingWithCopyWarning (TODO confirm nothing else relies on it).
  pd.options.mode.copy_on_write = True

  # Remove rows with no text at all
  df = df[df['text'].notnull()]

  # Remove emoji characters from the text (rows are kept)
  df['text'] = df['text'].apply(lambda raw: emoji.replace_emoji(raw, ''))

  # Remove punctuation; the translation table is built once, not once per row
  punctuation_table = str.maketrans('', '', string.punctuation)
  df['text'] = df['text'].str.translate(punctuation_table)

  # Remove every row that still has a non-ASCII character. The explicit bool
  # cast keeps the mask boolean even when no rows are left, so an empty frame
  # is filtered correctly instead of being treated as a column selection.
  ascii_only = df['text'].map(str.isascii).astype(bool)
  df = df[ascii_only]

  # Set all text to lower case
  df['text'] = df['text'].str.lower()

  return df.reset_index(drop=True)

In [None]:
# Draw a random sample of 500,000 rows, split between the two periods in
# proportion to each dataset's share of the total row count.
length_2020, length_2022 = len(data2020_df), len(data2022_df)
share_2020 = length_2020 / (length_2020 + length_2022)
size2020 = int(500000 * share_2020)
size2022 = 500000 - size2020  # remainder, so the two sizes always sum to 500,000

# A fixed random_state keeps the draw reproducible across runs
sample_2020 = data2020_df.sample(n=size2020, random_state=42)
sample_2022 = data2022_df.sample(n=size2022, random_state=42)

# Stack the two samples and renumber the rows from 0
combined_df = pd.concat([sample_2020, sample_2022], axis=0, ignore_index=True)

print(len(combined_df))  #500,000

In [None]:
# Get a glimpse of the data: show the first rows of the combined sample
combined_df.head()

In [None]:
# Run the cleaning pipeline on the combined sample
cleaned_df = clean_data(combined_df)

# Rows that survived cleaning (the original comment recorded 439642)
n_cleaned = len(cleaned_df)
print(n_cleaned)

# Summary statistics of the cleaned frame
summary_stats = cleaned_df.describe()
print(summary_stats)

In [None]:
# Save the cleaned data to CSV (only needs to run once).
# The file is written relative to the current working directory, without the index column.
cleaned_df.to_csv('Reddit_cleaned.csv', index=False)

## Data Setup:

In [None]:
# Work on an independent copy: the cells below edit reddit_df in place (dtype
# casts, column renames, timestamp -> date), and a plain assignment would apply
# those edits to cleaned_df as well. With a copy, re-running this cell restores
# a known starting state for the Data Setup section.
reddit_df = cleaned_df.copy()

In [None]:
# Convert columns to the datatypes used by the rest of the analysis.
# Free-text and identifier columns are cast to strings.
string_columns = ['text', 'username', 'link', 'link_id', 'parent_id', 'id', 'subreddit_id']
for column_name in string_columns:
  reddit_df[column_name] = reddit_df[column_name].astype(str)

# Parse the timestamp column into pandas datetimes
reddit_df['timestamp'] = pd.to_datetime(reddit_df['timestamp'])

# At this point the column name still ends with '\r' (column names are
# stripped in the following cell); the values are round-tripped through a list.
moderation_values = reddit_df['moderation\r'].tolist()
reddit_df['moderation\r'] = moderation_values

In [None]:
# Trim surrounding whitespace (including any trailing \r) from every column name
reddit_df.columns = [column_name.strip() for column_name in reddit_df.columns]

# Trim surrounding whitespace / \r from the values of the 'Topic' column.
# NOTE(review): 'Topic' is not among the columns converted in the previous
# cell -- confirm the loaded data actually contains it, otherwise this raises KeyError.
topic_values = reddit_df['Topic']
reddit_df['Topic'] = topic_values.str.strip()

reddit_df.head()

In [None]:
# Keep only the calendar date of each timestamp (the time of day is dropped).
# The column must still hold datetimes here, since the .dt accessor is used.
comment_dates = reddit_df['timestamp'].dt.date
reddit_df['timestamp'] = comment_dates
reddit_df['timestamp'].head()

## Visualizations:

1. Number of Comments across Time

In [None]:
# Daily comment volume: number of non-null comment ids for each date
num_of_comments_per_day_df = reddit_df.groupby('timestamp')['id'].count()

# Not the last expression of the cell, so this preview is evaluated but not displayed
num_of_comments_per_day_df.head()

# Line chart of the daily counts over time
fig, ax = plt.subplots(figsize=(10, 6))
dates = num_of_comments_per_day_df.index
counts = num_of_comments_per_day_df.values
ax.plot(dates, counts)
ax.set_xlabel('Date')
ax.set_ylabel('Number of Comments')
ax.set_title('Number of Comments per Day')
plt.xticks(rotation=45)  # slant the date labels so they do not overlap
fig.tight_layout()
plt.show()