# Hatefulness and Toxicity Analysis

This notebook is meant to create a pipeline for toxicity and hatefulness analysis, specifically for reddit data. As reddit data tends to be more toxic and hateful than other social media pages, we will focus this notebook on analysing existing Singaporean subreddit data provided in class.

The aim of this notebook is to provide a format that users can follow to recreate our results and to follow our analysis methodology.

## SETUP

In [1]:
# Standard Library Imports
import ast
import datetime
import html
import io
import json
import math
import os
import random
import re
import string
import time
from collections import Counter

import base64
import dash
import dash_bootstrap_components as dbc
from dash import Dash, dcc, html, dash_table, Input, Output, State, callback
from dash.dash_table.Format import Group
from dash.dependencies import Input, Output, State
from dash_bootstrap_templates import load_figure_template

# Gensim Imports
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import gensim.utils as gu
import ldamallet

# Hugging Face & Transformer Imports
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModel

# Matplotlib Imports
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import FuncFormatter, MaxNLocator

# NLTK Imports
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Plotly Imports
import plotly.express as px
import plotly.graph_objects as go
import plotly.graph_objs as go  # Duplicate alias but keeping it here if both are required
from plotly.subplots import make_subplots

# Pandas and Numpy Imports
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

# Scipy and Statsmodels Imports
from scipy.stats import f_oneway  # ANOVA test
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Scikit-Learn Imports
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve

# Visualization Imports
import seaborn as sns
from wordcloud import WordCloud

# Text Analysis Imports
import emoji
from langdetect import detect
from textblob import TextBlob

# Torch Imports (for models on local system or device)
import torch

# ONNX Runtime (for deploying models using ONNX)
import onnxruntime as rt

# NLTK Downloads
# Each resource backs a call made later in this notebook:
#   punkt                       -> word_tokenize
#   stopwords                   -> stopwords.words('english')
#   wordnet / omw-1.4           -> WordNetLemmatizer
#   averaged_perceptron_tagger  -> nltk.pos_tag (used in the seeded-LDA preprocessing)
# NOTE(review): recent NLTK releases look these up under different resource ids
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') -- confirm against the
# installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rhyde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rhyde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rhyde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rhyde\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Data Preprocessing:

In [None]:
# Load both raw Reddit exports. The files are expected in a ./data sub-folder;
# adjust the paths below if they live somewhere else.
data2020_df, data2022_df = (
    pd.read_csv(csv_path, lineterminator='\n', encoding='utf8')
    for csv_path in (
        './data/Reddit-Threads_2020-2021.csv',
        './data/Reddit-Threads_2022-2023.csv',
    )
)

# Row counts recorded by the authors: 2663782 (2020-2021) and 1840541 (2022-2023)
for raw_df in (data2020_df, data2022_df):
    print(len(raw_df))

In [None]:
def clean_data(df):
  """Return a cleaned copy of ``df`` based on its 'text' column.

  Steps, in order:
    1. drop rows whose 'text' is null;
    2. strip emoji characters out of the text (the row itself is kept);
    3. strip every character listed in ``string.punctuation``;
    4. drop rows whose remaining text still contains a non-ASCII character;
    5. lower-case the text;
    6. renumber the index from 0.

  Side effect: switches on pandas' process-wide copy-on-write mode.
  """
  # Global pandas option -- it stays enabled for the rest of the session.
  pd.options.mode.copy_on_write = True

  # Build the punctuation-deletion table once instead of once per row.
  punctuation_table = str.maketrans('', '', string.punctuation)

  def _scrub(raw_text):
    # Emoji are removed first, punctuation second (same order as before).
    return emoji.replace_emoji(raw_text, '').translate(punctuation_table)

  has_text = df['text'].notnull()
  df = df[has_text]

  df['text'] = df['text'].apply(_scrub)

  # Keep only rows made up entirely of ASCII characters.
  ascii_only = df['text'].apply(str.isascii)
  df = df[ascii_only]

  df['text'] = df['text'].str.lower()

  return df.reset_index(drop=True)

In [None]:
# Draw a 500,000-row random sample, split between the two periods in
# proportion to how many rows each raw file contains.
length_2020, length_2022 = len(data2020_df), len(data2022_df)
size2020 = int(500000 * (length_2020/(length_2020 + length_2022)))
size2022 = 500000 - size2020  # remainder, so the two parts always sum to 500,000

# Fixed seed keeps the sample reproducible between runs.
sample_2020 = data2020_df.sample(n=size2020, random_state=42)
sample_2022 = data2022_df.sample(n=size2022, random_state=42)

# Stack the two samples and renumber the rows from 0.
combined_df = pd.concat([sample_2020, sample_2022], axis=0, ignore_index=True)

print(len(combined_df))  #500,000

In [None]:
#Get a glimpse of the data
# (first rows of the combined 2020-2023 sample, before any cleaning)
combined_df.head()

In [None]:
#Clean Data:
# clean_data() drops null / non-ASCII rows and normalises the text column.
cleaned_df = clean_data(combined_df)

# Row count after cleaning, followed by summary statistics of the cleaned frame.
print(len(cleaned_df)) #original 439642
print(cleaned_df.describe())

In [None]:
#Save data to csv (Only need to run once)
# Written to the current working directory, without the index column.
cleaned_df.to_csv('Reddit_cleaned.csv', index=False)

## Data Setup:

In [None]:
# NOTE: this is an alias, not a copy -- reddit_df and cleaned_df are the same object,
# so the in-place column edits in the following cells also change cleaned_df.
reddit_df = cleaned_df

In [None]:
# Normalise the column headers first. The CSVs are read with lineterminator='\n',
# so a CRLF-terminated file leaves a trailing '\r' on the last header
# ('moderation\r'). Stripping here makes this cell work for both LF and CRLF
# files instead of depending on that exact header spelling.
reddit_df.columns = reddit_df.columns.str.strip()

# convert data to appropriate datatypes
reddit_df['timestamp'] = pd.to_datetime(reddit_df['timestamp'])
for _str_col in ('text', 'username', 'link', 'link_id', 'parent_id', 'id', 'subreddit_id'):
    reddit_df[_str_col] = reddit_df[_str_col].astype(str)

# The 'moderation' column is left as-is: the former
# `col = col.tolist()` round-trip did not change its contents.

In [None]:
# Remove \r from column names
reddit_df.columns = reddit_df.columns.str.strip()

# Strip \r and other surrounding whitespace from the 'Topic' labels.
# 'Topic' is produced by the topic-modelling step further down, so on a fresh
# run it does not exist yet at this point; guard the access so the cell does
# not raise KeyError and still cleans the column when it is present.
if 'Topic' in reddit_df.columns:
    reddit_df['Topic'] = reddit_df['Topic'].str.strip()

reddit_df.head()

In [None]:
# extract date from datetime stamp
# NOTE: not re-runnable as-is -- after the first run the column holds plain
# date objects, so a second run of `.dt.date` fails; re-run the dtype cell first.
reddit_df['timestamp'] = reddit_df['timestamp'].dt.date
reddit_df['timestamp'].head()

## Sentiment Scoring - LionGuard

We split this into 1 small test batch, and 1 larger one for running the full dataset.

### Small Batch

In [None]:
# First five comments only -- a quick smoke test before scoring the full dataset.
small_reddit_df = reddit_df.iloc[:5]

small_reddit_df

In [None]:
# Fetch the LionGuard configuration file from the Hugging Face hub and parse it.
# `repo_path` and `config` are module-level names used by the scoring functions below.
repo_path = "govtech/lionguard-v1"
config_path = hf_hub_download(repo_id=repo_path, filename="config.json")
with open(config_path, 'r') as config_file:
    config = json.loads(config_file.read())

In [None]:
def get_embeddings(device, data):
    """Embed a list of texts with the model named in ``config['embedding']``.

    Args:
        device: torch device (or device string) the model and inputs are moved to.
        data: sliceable sequence of strings to embed.

    Returns:
        A numpy array with one L2-normalised embedding per input text. Each
        embedding is the hidden state at position 0 of the model's first output
        (presumably the [CLS] token -- confirm against the model card).

    Note: the tokenizer and model are loaded on every call.
    """
    embedding_cfg = config['embedding']

    # Load the model and tokenizer, and switch the model to inference mode
    tokenizer = AutoTokenizer.from_pretrained(embedding_cfg['tokenizer'])
    model = AutoModel.from_pretrained(embedding_cfg['model'])
    model.eval()
    model.to(device)

    # Walk over the data in fixed-size batches
    batch_size = embedding_cfg['batch_size']
    num_batches = int(np.ceil(len(data)/batch_size))
    collected = []
    for start in range(0, num_batches * batch_size, batch_size):
        chunk = data[start:start + batch_size]
        encoded = tokenizer(chunk, max_length=embedding_cfg['max_length'], padding=True, truncation=True, return_tensors='pt')
        encoded.to(device)
        with torch.no_grad():
            hidden = model(**encoded)
            first_token = hidden[0][:, 0]
        # Unit-length rows, then back to CPU / numpy for the ONNX classifiers
        first_token = torch.nn.functional.normalize(first_token, p=2, dim=1)
        collected.extend(first_token.cpu().numpy())

    return np.array(collected)

In [None]:
def predict2(batch_text):
    """Score texts with the LionGuard 'hateful' and 'toxic' classifiers only.

    Args:
        batch_text: list of strings to score.

    Returns:
        dict with, for each category in ('hateful', 'toxic'):
          * '<category> Score': {'scores': ..., 'predictions': {'high_recall': [0/1, ...]}}
          * '<category> HR': the same high-recall 0/1 list (same list object).
    """
    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

    # Embeddings -> float32 matrix, the input handed to the ONNX sessions
    X_input = np.array(pd.DataFrame(get_embeddings(device, batch_text)), dtype=np.float32)

    results = {}
    for category in ('hateful', 'toxic'):
        clf_cfg = config['classifier'][category]

        # Download the classifier from HuggingFace hub and run it
        local_model_fp = hf_hub_download(repo_id=repo_path, filename=clf_cfg['model_fp'])
        session = rt.InferenceSession(local_model_fp)
        input_name = session.get_inputs()[0].name
        outputs = session.run(None, {input_name: X_input})

        # Calibrated models: keep entry 1 of each output item (the unsafe-class
        # score); otherwise flatten the second output.
        if clf_cfg['calibrated']:
            scores = [item[1] for item in outputs[1]]
        else:
            scores = outputs[1].flatten()

        # Binary flags at the recommended high-recall threshold
        cutoff = clf_cfg['threshold']['high_recall']
        high_recall_flags = [1 if score >= cutoff else 0 for score in scores]

        results[f'{category} Score'] = {
            'scores': scores,
            'predictions': {'high_recall': high_recall_flags},
        }
        results[f'{category} HR'] = high_recall_flags

    return results

In [None]:
def predict(batch_text):
    """Score texts with every classifier listed in ``config['classifier']``.

    Args:
        batch_text: list of strings to score.

    Returns:
        dict keyed by category name; each value is
        {'scores': ..., 'predictions': {'high_recall': [...], 'balanced': [...],
        'high_precision': [...]}} where each prediction list holds 0/1 flags
        obtained by comparing the scores with that category's threshold.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

    # Embeddings -> float32 matrix, the input handed to the ONNX sessions
    X_input = np.array(pd.DataFrame(get_embeddings(device, batch_text)), dtype=np.float32)

    results = {}
    for category, details in config['classifier'].items():
        # Download the classifier from HuggingFace hub and run it
        local_model_fp = hf_hub_download(repo_id=repo_path, filename=details['model_fp'])
        session = rt.InferenceSession(local_model_fp)
        input_name = session.get_inputs()[0].name
        outputs = session.run(None, {input_name: X_input})

        # Calibrated models: keep entry 1 of each output item (the unsafe-class
        # score); otherwise flatten the second output.
        if details['calibrated']:
            scores = [item[1] for item in outputs[1]]
        else:
            scores = outputs[1].flatten()

        # One 0/1 list per recommended operating point
        thresholds = details['threshold']
        results[category] = {
            'scores': scores,
            'predictions': {
                level: [1 if score >= thresholds[level] else 0 for score in scores]
                for level in ('high_recall', 'balanced', 'high_precision')
            },
        }

    return results

In [None]:
# Pull ids and texts of the small batch out as plain lists
batch_id = small_reddit_df['id'].tolist()
batch_text = small_reddit_df['text'].tolist()

# Score the batch with every LionGuard classifier
results = predict(batch_text)

# One output row per comment: id, text, then four columns per category
# (score + high-recall / balanced / high-precision flags).
# IMPT! THE INNER LOOP PRODUCES 32 COLUMNS! COMMENT OUT IF NOT NEEDED!
output_data = []
for row_idx, (comment_id, comment_text) in enumerate(zip(batch_id, batch_text)):
    output_row = {'id': comment_id, 'Text': comment_text}
    for category, category_result in results.items():
        flags = category_result['predictions']
        output_row[f'{category} Score'] = category_result['scores'][row_idx]
        output_row[f'{category} HR'] = flags['high_recall'][row_idx]
        output_row[f'{category} B'] = flags['balanced'][row_idx]
        output_row[f'{category} HP'] = flags['high_precision'][row_idx]
    output_data.append(output_row)

small_results_df = pd.DataFrame(output_data)

# Show every column when printing (global display option)
pd.set_option('display.max_columns', None)

# get results table
print(small_results_df)

In [None]:
# See column names
print(small_results_df.columns)

# get id, hateful and toxic scores only
# ('id' is kept so the scores can be merged back onto the original comments)
condensed_small_results_df = small_results_df[['id', 'hateful Score', 'toxic Score']]

condensed_small_results_df

In [None]:
# merge the 2 dataframes on 'id'
# Inner join: only comments present in both frames are kept.
small_hateful_and_toxic_results_df = pd.merge(small_reddit_df, condensed_small_results_df, on='id', how='inner')

print(small_hateful_and_toxic_results_df)

In [None]:
# expand resolution to see full text
# (global pandas display option; already set once in the setup cell)
pd.set_option('display.max_colwidth', None)

small_hateful_and_toxic_results_df[['text', 'hateful Score', 'toxic Score']]

### Running Lion Guard for a full dataset:

Note: Running this can take 6 - 10 hours on Google Colab's free GPU, so it is best run on a local GPU device.

In [None]:
# (rows, columns) of the full cleaned dataset that is about to be scored
reddit_df.shape

In [None]:
# Ensure text is clean and all entries are strings
# (missing values become empty strings before tokenisation)
reddit_df['text'] = reddit_df['text'].fillna('').astype(str)
# Plain lists, in row order, so text i and id i refer to the same comment
batch_text = reddit_df['text'].tolist()
batch_id = reddit_df['id'].tolist()

In [None]:
# Score the full dataset with the 'hateful' and 'toxic' classifiers only
results = predict2(batch_text)

# Pull the four result sequences out once instead of indexing `results` per row
hateful_scores = results['hateful Score']['scores']
hateful_flags = results['hateful HR']
toxic_scores = results['toxic Score']['scores']
toxic_flags = results['toxic HR']

# One output row per comment: id, text, score + high-recall flag per category
output_data = [
    {
        'id': batch_id[row_idx],
        'Text': batch_text[row_idx],
        'hateful Score': hateful_scores[row_idx],
        'hateful HR': hateful_flags[row_idx],
        'toxic Score': toxic_scores[row_idx],
        'toxic HR': toxic_flags[row_idx],
    }
    for row_idx in range(len(batch_text))
]

# Create a DataFrame from the results
results_df = pd.DataFrame(output_data)

In [None]:
# See column names
# (expected from the cell above: id, Text, hateful Score, hateful HR, toxic Score, toxic HR)
print(results_df.columns)

In [None]:
# get id, hateful and toxic scores only
# (drops the duplicated 'Text' column; 'id' is the merge key used in the next cell)
condensed_results_df = results_df[['id', 'hateful Score', 'hateful HR',  'toxic Score', 'toxic HR']]

condensed_results_df

In [None]:
# merge the 2 dataframes on 'id'
# Inner join: only comments present in both frames are kept.
# NOTE(review): assumes 'id' is unique in both frames -- duplicates would multiply rows.
hateful_and_toxic_results_df = pd.merge(reddit_df, condensed_results_df, on='id', how='inner')

hateful_and_toxic_results_df

In [None]:
# Check number of rows
# (should match reddit_df.shape[0] if every comment was scored exactly once)
hateful_and_toxic_results_df.shape

## Topic Modelling:

### Unsupervised

In [None]:
# Null-free working copy for the unsupervised topic model.
# The frame is taken straight from reddit_df: binding it to the name
# `clean_data` would overwrite the clean_data() function defined earlier and
# break any re-run of the cleaning cell. `.copy()` makes `df` an independent
# frame, so adding columns to it below does not act on a slice of reddit_df.
df = reddit_df.dropna().copy()

In [None]:
# English stopwords as a set, for fast membership tests
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with English stopwords removed.

    The text is tokenised with NLTK's ``word_tokenize``; tokens whose
    lower-cased form is in ``stop_words`` are dropped and the remaining
    tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Apply the function to the 'text' column; the result goes into 'cleaned_comments'
df['cleaned_comments'] = df['text'].apply(remove_stopwords)

In [None]:
# NOTE: LatentDirichletAllocation is only imported here, not in the setup cell;
# CountVectorizer is already imported at the top of the notebook.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: ignore terms in more than 80% of comments or in fewer than 2
vectorizer = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
data_vectorized = vectorizer.fit_transform(df['cleaned_comments'])

# 10-topic LDA with a fixed seed so the topics are reproducible
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(data_vectorized)

In [None]:
# Function to display the top words for each topic
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: object exposing ``components_``, one row of word weights per topic.
        feature_names: sequence mapping a column index to its word.
        no_top_words: how many words to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending, so reverse it and keep the leading entries
        top_ids = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_id] for term_id in top_ids]
        print(" ".join(top_terms))

# Number of words to display per topic
no_top_words = 10

# Get the feature names (words)
# (index i here corresponds to column i of the count matrix the LDA was fitted on)
feature_names = vectorizer.get_feature_names_out()

# Display the topics
display_topics(lda, feature_names, no_top_words)

### Seeded LDA

In [None]:
def load_data(path):
    """Read a UTF-8 CSV whose rows are terminated by ``\\n`` into a DataFrame.

    Args:
        path: location of the CSV file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    return pd.read_csv(path, lineterminator='\n', encoding='utf8')

In [None]:
def preprocess_gensim(text):
    """Normalise ``text`` with gensim's ``simple_preprocess``.

    Returns the resulting tokens joined by single spaces. Anything that is
    not a ``str`` (e.g. NaN or None) yields an empty string.
    """
    if not isinstance(text, str):
        return ''  # non-string input
    return ' '.join(gu.simple_preprocess(text))

In [None]:
def preprocessing(df):
    """Build the token lists used for topic modelling.

    A 'clean_text' column is added to ``df`` **in place** and transformed step
    by step: lower-casing, replacing non-letters with spaces, gensim
    ``simple_preprocess``, NLTK tokenisation, stopword / non-alphabetic token
    removal, WordNet lemmatisation and finally keeping only tokens that NLTK
    tags as 'NN' when tagged on their own. A progress line is printed after
    each stage.

    Args:
        df: DataFrame with a string 'text' column.

    Returns:
        A new DataFrame (index reset) containing only the rows whose
        'clean_text' holds more than one token.
    """
    # Loop-invariant helpers. Previously the stopword list was rebuilt and a
    # new lemmatizer created for every single word, which dominated the run
    # time on large frames.
    english_stopwords = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Each word is tagged in isolation, so its tag depends only on the word:
    # remember the answer per distinct word instead of re-tagging repeats.
    noun_cache = {}

    def _is_noun(word):
        if word not in noun_cache:
            noun_cache[word] = nltk.pos_tag([word])[0][1] == 'NN'
        return noun_cache[word]

    df['clean_text'] = df['text'].str.lower()
    print("cleaned_lower")
    df['clean_text'] = df['clean_text'].str.replace(r'[^a-zA-Z\s]', ' ',regex=True)
    df['clean_text'] = df['clean_text'].str.replace(r'\s{2,}', ' ',regex=True)
    print("cleaned_regex")
    df['clean_text'] = df['clean_text'].apply(preprocess_gensim)
    print("cleaned_preprocessed")
    df['clean_text'] = df['clean_text'].apply(word_tokenize)
    print("cleaned_tokenized")
    df['clean_text'] = df['clean_text'].apply(
        lambda tokens: [word for word in tokens if word not in english_stopwords and word.isalpha()]
    )
    print("cleaned_stopwords")
    df['clean_text'] = df['clean_text'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])
    print("cleaned_Lemmatized")
    df['clean_text'] = df['clean_text'].apply(lambda tokens: [word for word in tokens if _is_noun(word)])
    print("cleaned_tagged")

    # Documents with fewer than two remaining tokens are dropped.
    df = df[df['clean_text'].map(len) > 1].reset_index(drop=True)
    return df

In [None]:
def create_dictionary(reddit_df):
    """Build the gensim dictionary and bag-of-words corpus for topic modelling.

    Args:
        reddit_df: DataFrame whose 'clean_text' column holds one token list per row.

    Returns:
        Tuple ``(texts, id2word, corpus)``: the 'clean_text' column itself, the
        ``corpora.Dictionary`` built from it, and a list with one
        ``doc2bow`` result per row.
    """
    token_lists = reddit_df['clean_text']
    vocabulary = corpora.Dictionary(token_lists)
    bow_corpus = list(map(vocabulary.doc2bow, token_lists))
    return token_lists, vocabulary, bow_corpus

In [None]:
def load_mallet(system,folder_path):
    """Point MALLET_HOME at a MALLET install and return its launcher path.

    Args:
        system: 'windows', 'mac' or 'linux' (case-insensitive).
        folder_path: root folder of the MALLET installation.

    Returns:
        ``folder_path`` followed by the platform-specific launcher location
        (``\\bin\\mallet.bat`` on Windows, ``/bin/mallet`` otherwise).

    Raises:
        ValueError: if ``system`` is not one of the supported names. Previously
            an unknown value failed with UnboundLocalError at the return.
    """
    launcher_suffix = {
        'windows': "\\bin\\mallet.bat",
        'mac': "/bin/mallet",
        'linux': "/bin/mallet",  # same shell launcher as macOS
    }
    system_key = system.lower() if isinstance(system, str) else system
    if system_key not in launcher_suffix:
        raise ValueError(
            f"Unsupported system {system!r}; expected one of {sorted(launcher_suffix)}"
        )

    # Only touch the environment once the input is known to be valid.
    os.environ['MALLET_HOME'] = folder_path
    return folder_path + launcher_suffix[system_key]

In [None]:
# Define the function for topic modeling
def topic_modelling(model, corpus, texts, data, seed_topics, verbose=False):
    """Attach each document's dominant topic to ``data``.

    Args:
        model: topic model supporting ``model[corpus]`` (one list of
            ``(topic_num, proportion)`` pairs per document) and
            ``model.show_topic(topic_num)``.
        corpus: bag-of-words corpus passed to the model.
        texts: token lists, indexable by document position ``0..n-1``.
        data: DataFrame with one row per document, in the same order as
            ``corpus`` and with a 0..n-1 index.
        seed_topics: dict mapping topic number -> topic label; numbers missing
            from it are labelled 'Unknown'.
        verbose: when True, print every document's sorted topic distribution
            (the former unconditional debug output). Defaults to False
            because it prints one line per document.

    Returns:
        ``data`` with 'Topic Number' and 'Topic' columns appended (the helper
        columns 'clean_text', 'Perc_Contribution' and 'Topic_Keywords' are
        dropped). Documents with empty text -- or for which the model returns
        no topics -- get topic number 10 / 'Others'.
    """
    import warnings  # local import: not part of the notebook's import cell

    result_columns = ['Topic Number', 'Topic', 'Perc_Contribution', 'Topic_Keywords']
    others_record = [10, 'Others', 1.000, '']
    keyword_cache = {}  # topic_num -> keyword string; show_topic is the same for every document
    records = []        # collected as plain rows, turned into a DataFrame once at the end

    # Get main topic in each document
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        if verbose:
            print(f"Document {i}, Topics: {row}")

        # Empty text (or no topics at all) is classified as 'Others'. A record is
        # always appended so that row i of the result stays aligned with row i of `data`.
        if not texts[i] or not row:
            records.append(others_record)
            continue

        # Dominant topic = highest contribution after sorting
        topic_num, prop_topic = row[0]
        if topic_num not in keyword_cache:
            keyword_cache[topic_num] = ", ".join(word for word, _ in model.show_topic(topic_num))

        # Map the topic number to a predefined topic using seed_topics
        topic_name = seed_topics.get(int(topic_num), 'Unknown')
        records.append([int(topic_num), topic_name, round(prop_topic, 4), keyword_cache[topic_num]])

    output_df = pd.DataFrame(records, columns=result_columns)

    # The concat below aligns on the index, so a length mismatch means topics
    # would be attached to the wrong rows.
    if len(output_df) != len(data):
        warnings.warn(
            f"topic_modelling: {len(output_df)} documents were modelled but `data` has "
            f"{len(data)} rows; topics will not line up with the original rows.",
            stacklevel=2,
        )

    # Concatenate the original data with the topic modeling results
    output_df = pd.concat([data, output_df], axis=1)

    # Remove helper columns that are not needed downstream
    output_df = output_df.drop(['clean_text', 'Perc_Contribution', 'Topic_Keywords'], axis=1, errors='ignore')

    return output_df

In [None]:
# Define your seed topics
# Maps a model topic number to a human-readable label (consumed by topic_modelling()).
# NOTE(review): nine labels (0-8) are defined while the MALLET model below is
# created with num_topics=10, so topic 9 will be labelled 'Unknown' -- confirm
# this is intended.
seed_topics = {
    0: "Political",
    1: "Covid-19",
    2: "Race & Religion",
    3: "Transport",
    4: "Relationships",
    5: "Crime",
    6: "Housing",
    7: "Education",
    8: "Work"
}

# Define the seed words for each topic
# NOTE(review): in the cells visible in this notebook these words are only used
# to compute coherence scores; they are not passed to the MALLET model. Several
# entries are phrases, contain punctuation or upper-case letters
# ("general election", "covid-19", "CECA") and so may not exist as tokens after
# preprocessing -- verify.
seed_words = {
    "Political": ["ge", "general election", "affair", "mp", "politician", "politics"],
    "Covid-19": ["covid-19", "infection", "vaccine", "lockdown", "circuit breaker", "mask", "cough"],
    "Race & Religion": ["chinese", "malay", "indian", "angmoh", "culture", "christian", "buddhist", "muslim", "racist", "CECA"],
    "Transport": ["breakdown", "train", "mrt", "lrt", "bus", "simplygo"],
    "Relationships": ["relationships", "husband", "wife", "bf", "gf", "breakup", "cheat", "affair", "lover", "divorce", "love"],
    "Crime": ["crime", "case", "police", "murder", "kill", "death", "scam"],
    "Housing": ["hdb", "price", "bto", "resale"],
    "Education": ["student", "psle", "study", "alevel", "olevel", "exam", "school"],
    "Work": ["ot", "salary", "unemployed", "boss", "job", "laoban", "colleague"]
}

In [None]:
# From here on reddit_df refers to the scored frame (comments + hateful/toxic columns).
reddit_df = hateful_and_toxic_results_df
# preprocessing() adds 'clean_text' to reddit_df in place and returns a filtered,
# re-indexed frame, so reddit_df_processed can have fewer rows than reddit_df.
reddit_df_processed = preprocessing(reddit_df) # Can take up to a few hours to run

In [None]:
# Token lists, gensim dictionary and bag-of-words corpus, all built from the *filtered* frame
texts, id2word, corpus = create_dictionary(reddit_df_processed)

- Download MALLET from here: https://mallet.cs.umass.edu/download.php

In [None]:
# The three " " strings below are placeholders: fill them in before running this cell.
mallet_path = r" " # Insert Path to Mallet here

os.environ['MALLET_HOME'] = " " # Eg. r'C:\Users\mallet-2.0.8\mallet-2.0.8'
# Appends the MALLET bin folder to PATH for this Python process
os.environ['PATH'] = os.environ['PATH'] + os.pathsep + " " # Eg r'C:\Users\mallet-2.0.8\mallet-2.0.8\bin'

In [None]:
def create_mallet(mallet_path, num_topics, id2word, corpus):
    """Create an ``ldamallet.LdaMallet`` model for the given corpus.

    Args:
        mallet_path: path to the MALLET launcher.
        num_topics: number of topics to fit.
        id2word: dictionary mapping token ids to words.
        corpus: bag-of-words corpus.

    Returns:
        The constructed ``LdaMallet`` object.
    """
    mallet_kwargs = dict(
        mallet_path=mallet_path,
        corpus=corpus,
        num_topics=num_topics,
        id2word=id2word,
    )
    return ldamallet.LdaMallet(**mallet_kwargs)

In [None]:
# NOTE(review): 10 topics are requested while seed_topics only labels topics 0-8 -- confirm.
mallet = create_mallet(mallet_path=mallet_path, num_topics=10, id2word=id2word, corpus=corpus)

In [None]:
# `corpus` and `texts` were built from reddit_df_processed (the filtered, re-indexed
# frame), so that same frame must be passed as `data`. Passing the unfiltered
# reddit_df would pair row i of the original data with the topic of the i-th
# *surviving* document, i.e. attach topics to the wrong comments.
output_df = topic_modelling(model=mallet,corpus=corpus,texts=texts,data=reddit_df_processed, seed_topics = seed_topics)

In [None]:
# Persist the labelled comments, then preview the first rows.
# (`head` must be called; a bare `output_df.head` only shows the bound method.)
output_df.to_csv('topic_model_results.csv', index=False)
output_df.head()

In [None]:
# Coherence Scores:

## c_v method
# Create the CoherenceModel for the Mallet model
coherence_model_mallet = CoherenceModel(model=mallet, texts=texts, dictionary=id2word, coherence='c_v')
# Compute the coherence score
coherence_score = coherence_model_mallet.get_coherence()
print(f'Coherence Score: {coherence_score}')


## u_mass method
# Bag-of-words corpus rebuilt from the tokenised texts
corpus = [id2word.doc2bow(text) for text in texts]
# The seed-word lists are scored as if they were topics
seeded_topics = list(seed_words.values())
# Create the CoherenceModel with corpus and dictionary for 'u_mass'
cm = CoherenceModel(topics=seeded_topics, corpus=corpus, dictionary=id2word, coherence='u_mass')
coherence = cm.get_coherence()  # get coherence value
# Printed explicitly: only the last expression of a cell is displayed, so a bare
# `coherence` here was silently discarded.
print(f'u_mass Coherence (seed words): {coherence}')

## c_uci method
cm = CoherenceModel(topics=seeded_topics, texts=texts, dictionary=id2word, coherence='c_uci')
cm.get_coherence()

## Visualizations:

### Daily Average Hatefulness and Toxicity Scores

In [None]:
# Load the topic-model output once; reddit_df starts as an independent copy of the same data.
results_df = pd.read_csv('../../data/topic_model_results.csv',  lineterminator='\n', encoding='utf8')
reddit_df = results_df.copy()

In [None]:
# Clean up the headers and the last column: reading with lineterminator='\n'
# can leave a trailing '\r' on the final header and on its values.
results_df.columns = results_df.columns.str.strip()
results_df['Topic'] = results_df['Topic'].str.strip()

# Switch the score / topic columns to snake_case names
column_renames = {
    'hateful Score': 'hateful_score',
    'hateful HR': 'hateful_prediction',
    'toxic Score': 'toxic_score',
    'toxic HR': 'toxic_prediction',
    'Topic Number': 'topic_number',
    'Topic': 'topic'
}
results_df.rename(columns=column_renames, inplace=True)

# 'moderation' is stored as the text of a Python dict; parse it back into a dict
results_df['moderation'] = results_df['moderation'].apply(ast.literal_eval)

# Boolean flags derived from the moderation dict
moderation_flags = {
    'is_collapsed': lambda info: info.get('collapsed') == True,
    'is_deleted': lambda info: info.get('collapsed_reason_code') == 'DELETED',
    'is_controversial': lambda info: info.get('controversiality') == 1,
}
for flag_name, flag_test in moderation_flags.items():
    results_df[flag_name] = results_df['moderation'].apply(flag_test)

results_df['is_controversial'].head()



In [None]:
# Parse the timestamp once (day/month/year hour:minute); both derived columns use it
parsed_timestamp = pd.to_datetime(results_df['timestamp'], format='%d/%m/%Y %H:%M')

# Identifier / free-text columns as plain strings
for text_col in ('text', 'username', 'link', 'link_id', 'parent_id', 'id', 'subreddit_id'):
    results_df[text_col] = results_df[text_col].astype(str)

results_df['moderation'] = results_df['moderation'].tolist()

# Compact numeric types for scores / flags, categories for the topic columns
target_dtypes = {
    'hateful_score': 'float32',
    'hateful_prediction': 'int16',
    'toxic_score': 'float32',
    'toxic_prediction': 'int16',
    'topic_number': 'category',
    'topic': 'category',
}
for column_name, dtype_name in target_dtypes.items():
    results_df[column_name] = results_df[column_name].astype(dtype_name)

# Monthly period, used to track scores and moderation actions over time
results_df['year_month'] = parsed_timestamp.dt.to_period('M')

# Keep only the calendar date in 'timestamp'
results_df['timestamp'] = parsed_timestamp.dt.date

results_df['timestamp'].head()


In [None]:
# Group by day and calculate the average score
# ('timestamp' holds calendar dates at this point, so each group is one day)
average_hatefulness_and_toxicity_per_day_df = results_df.groupby(results_df['timestamp']).agg(
    average_hateful_score=('hateful_score', 'mean'), 
    average_toxic_score=('toxic_score', 'mean')
).reset_index()
# NOTE: only the last expression of a cell is displayed, so this head() shows nothing
average_hatefulness_and_toxicity_per_day_df.head()

average_hatefulness_and_toxicity_per_day_df.tail()

In [None]:
# Ensure 'timestamp' is a datetime object
average_hatefulness_and_toxicity_per_day_df['timestamp'] = pd.to_datetime(average_hatefulness_and_toxicity_per_day_df['timestamp'])

# Look up the daily averages for one specific date (30 April 2023)
average_hatefulness_and_toxicity_per_day_df[average_hatefulness_and_toxicity_per_day_df['timestamp'].dt.date == pd.to_datetime('2023-04-30').date()]

In [None]:
# Daily average hateful and toxic scores drawn as two lines on a shared time axis
daily_scores = average_hatefulness_and_toxicity_per_day_df
line_specs = (
    ('average_hateful_score', 'Hateful Score', 'red'),
    ('average_toxic_score', 'Toxic Score', 'green'),
)
for score_column, line_label, line_colour in line_specs:
    plt.plot(daily_scores['timestamp'], daily_scores[score_column], label=line_label, color=line_colour, alpha=0.7)

# Axis labels, title and tilted date ticks
plt.xlabel('Date')
plt.ylabel('Average Scores')
plt.title('Daily Average Hatefulness and Toxic Scores')
plt.xticks(rotation=45)

# Legend tells the two lines apart
plt.legend()

plt.tight_layout()
plt.show()

### Number of Comments per day

In [None]:
# count number of comments per day
# (despite the _df suffix this is a Series: index = date, value = number of ids)
num_of_comments_per_day_df = results_df.groupby('timestamp')['id'].count()
num_of_comments_per_day_df.head()

In [None]:
# plot number of comments against time
plt.figure(figsize=(10,6))
# x = dates (the Series index), y = comment counts
plt.plot(num_of_comments_per_day_df.index, num_of_comments_per_day_df.values)
plt.xlabel('Date')
plt.ylabel('Number of Comments')
plt.title('Number of Comments per Day')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Toxicity Score Distribution by subreddit

In [None]:
# Distinct subreddit names, in order of first appearance.
# NOTE(review): no visible cell creates 'subreddit_name'; presumably it comes with the loaded CSV -- confirm.
subreddits_of_interest = results_df['subreddit_name'].unique()
subreddits_of_interest

In [None]:
# Seaborn theme for the following plots
sns.set(style="whitegrid")

# One box per subreddit, coloured with a fixed palette. whis=200 stretches the
# whiskers far enough that no point is drawn as an outlier.
plt.figure(figsize=(14, 8))
palette = {'r/singapore': "#4c72b0", 'r/singaporeraw': "#55a868", 'r/singaporehappenings': "#c44e52"}
ax = sns.boxplot(data=results_df, x='subreddit_name', y='toxic_score', order=subreddits_of_interest, palette=palette, whis = 200)

# Label the median, quartiles and extremes of every box
for x_pos, subreddit in enumerate(subreddits_of_interest):
    scores = results_df.loc[results_df['subreddit_name'] == subreddit, 'toxic_score']
    box_colour = palette[subreddit]
    median = scores.median()
    q1 = scores.quantile(0.25)
    q3 = scores.quantile(0.75)
    min_val = scores.min()
    max_val = scores.max()

    # Median: white bold text on a badge in the box colour
    ax.text(x_pos, median, f'Median\n{median:.2f}', ha='center', va='center', color='white', fontweight='bold',
            bbox=dict(facecolor=box_colour, edgecolor='none', boxstyle='round,pad=0.3'))
    # Quartiles: right-aligned text in the box colour
    for y_val, caption, v_align in ((q1, '25th percentile', 'top'), (q3, '75th percentile', 'bottom')):
        ax.text(x_pos, y_val, f'{caption}\n{y_val:.2f}', ha='right', va=v_align, color=box_colour, fontsize=13)
    # Extremes: centred black text
    for y_val, caption, v_align in ((min_val, 'Min', 'top'), (max_val, 'Max', 'bottom')):
        ax.text(x_pos, y_val, f'{caption}\n{y_val:.2f}', ha='center', va=v_align, color='black', fontsize=12)

# Title and axis labels
plt.title('Toxic Score Distribution by Subreddit', fontsize=18, fontweight='bold')
plt.xlabel('Subreddit', fontsize=14)
plt.ylabel('Toxic Score', fontsize=14)

plt.tight_layout()
plt.show()

### Average Toxic Score per Month by Subreddit

In [None]:
# 'year_month' as a timestamp (first day of the month) so it plots on a date axis
results_df['year_month'] = pd.to_datetime(results_df['year_month'].astype(str))

# Mean toxic score for every (month, subreddit) pair
monthly_avg_toxic = (
    results_df
    .groupby(['year_month', 'subreddit_name'])['toxic_score']
    .mean()
    .reset_index(name='average_toxic_score')
)

# Fixed colour per subreddit, reused by the later monthly plots
custom_palette = {
    'r/singapore': '#4c72b0',
    'r/singaporeraw': '#55a868',
    'r/singaporehappenings': '#c44e52'
}

# One line per subreddit
plt.figure(figsize=(14, 8))
sns.lineplot(
    data=monthly_avg_toxic,
    x='year_month',
    y='average_toxic_score',
    hue='subreddit_name',
    marker='o',
    palette=custom_palette,
)

# Labels, tilted ticks, legend placed outside the axes, gridlines
plt.xlabel('Year-Month')
plt.ylabel('Average Toxic Score')
plt.title('Average Toxic Score per Month by Subreddit')
plt.xticks(rotation=45)
plt.legend(title='Subreddit', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

plt.tight_layout()
plt.show()

### Monthly Comment Count by Subreddit

In [None]:
# Aggregate to count the number of entries per subreddit per month
monthly_comment_count = results_df.groupby(['year_month', 'subreddit_name']).size().reset_index(name='comment_count')

# Plot using seaborn to handle grouping by subreddit
# (custom_palette is defined in the "Average Toxic Score per Month" cell above)
plt.figure(figsize=(14, 8))
sns.lineplot(data=monthly_comment_count, x='year_month', y='comment_count', hue='subreddit_name', marker='o', palette=custom_palette)

# Formatting the plot
plt.xlabel('Year-Month')
plt.ylabel('Number of Comments')
plt.title('Monthly Comment Count by Subreddit')
plt.xticks(rotation=45)  # Rotate for readability
plt.legend(title='Subreddit', bbox_to_anchor=(1.05, 1), loc='upper left')  # Place legend outside the plot
# Add gridlines
plt.grid(True)
plt.tight_layout()
plt.show()

### Number of Comments grouped by year

In [None]:
# count number of comments by year and month
# No visible cell creates 'year' / 'month' columns on reddit_df, so grouping by
# them fails on a fresh run. Use them when they exist; otherwise derive them
# from the calendar dates held in results_df['timestamp'].
if {'year', 'month'}.issubset(reddit_df.columns):
    monthly_source_df = reddit_df
else:
    comment_dates = pd.to_datetime(results_df['timestamp'])
    monthly_source_df = results_df.assign(year=comment_dates.dt.year, month=comment_dates.dt.month)

num_of_comments_per_month_by_year_df = monthly_source_df.groupby(['year', 'month'])['id'].count().reset_index()

# Plotting
plt.figure(figsize=(12, 8))

# Loop through each year and plot number of comments per month
for year in num_of_comments_per_month_by_year_df['year'].unique():
    data_by_year = num_of_comments_per_month_by_year_df[num_of_comments_per_month_by_year_df['year'] == year]
    plt.plot(data_by_year['month'], data_by_year['id'], marker='o', label=str(year))

# Customizing the plot
plt.xlabel('Month')
plt.ylabel('Number of Comments')
plt.title('Number of Comments per Month Grouped by Year')
plt.xticks(ticks=range(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.legend(title='Year')
plt.grid(True)
plt.tight_layout()

# Show the plot
plt.show()

### Hateful and Toxic Score by timestamp boxplot

In [None]:
# List of dates to filter for
dates_to_filter = [
    datetime.date(2021, 12, 7),
    datetime.date(2020, 7, 10),
    datetime.date(2023, 7, 17),
    datetime.date(2021, 10, 20)
]

# Filter DataFrame for these specific dates.
# `.copy()` gives an independent frame: the next cell overwrites its
# 'timestamp' column, which on a bare boolean-mask slice triggers
# SettingWithCopyWarning.
big_events_vs_average_day_df = results_df[results_df['timestamp'].isin(dates_to_filter)].copy()

big_events_vs_average_day_df.head()

In [None]:
# Convert to datetime if it's not already
big_events_vs_average_day_df['timestamp'] = pd.to_datetime(big_events_vs_average_day_df['timestamp'], errors='coerce')

# Now, convert it to string format
big_events_vs_average_day_df['timestamp'] = big_events_vs_average_day_df['timestamp'].dt.strftime('%Y-%m-%d')

# Define a mapping for renaming timestamps
# NOTE(review): these keys are 2024-01-0x dates, but the frame was filtered to
# 2021-12-07, 2020-07-10, 2023-07-17 and 2021-10-20, so none of them match and
# the replace() below leaves the labels unchanged. The keys look like
# placeholders -- replace them with the four real dates and the intended event names.
timestamp_mapping = {
    '2024-01-01': 'Event A',
    '2024-01-02': 'Event B',
    '2024-01-03': 'Event C',
    '2024-01-04': 'Event D'
}

# Rename the timestamps in the DataFrame using the mapping
big_events_vs_average_day_df['timestamp'] = big_events_vs_average_day_df['timestamp'].replace(timestamp_mapping)

# Ensure that the timestamp column is treated as a categorical variable
big_events_vs_average_day_df['timestamp'] = big_events_vs_average_day_df['timestamp'].astype('category')

# Melt the DataFrame to have a long format suitable for seaborn
# (one row per comment and score type, so `hue` can split hateful vs toxic)
melted_df = big_events_vs_average_day_df.melt(
    id_vars=['timestamp'],
    value_vars=['hateful_score', 'toxic_score'],
    var_name='score_type',
    value_name='score_value'
)

# Create a color palette
palette = {
    'hateful_score': 'red',
    'toxic_score': 'green'
}

# Create a boxplot with custom colors
plt.figure(figsize=(12, 6))
sns.boxplot(data=melted_df, x='timestamp', y='score_value', hue='score_type', palette=palette)

# Customize the plot
plt.title('Boxplot of Hateful Scores and Toxic Scores by Timestamp')
plt.xlabel('Timestamp')
plt.ylabel('Score Value')
plt.legend(title='Score Type')
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()


### Distribution of Toxicity and Hatefulness Scores by Topic

In [None]:
# Mean, median and standard deviation of both scores for every topic
topic_stats = results_df.groupby('topic')[['toxic_score', 'hateful_score']].agg(['mean', 'median', 'std']).reset_index()
topic_stats

In [None]:
# Two side-by-side panels: toxicity on the left, hatefulness on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

panel_specs = (
    ('toxic_score', 'Distribution of Toxicity Scores by Topic'),
    ('hateful_score', 'Distribution of Hatefulness Scores by Topic'),
)
for panel_ax, (score_column, panel_title) in zip(axes, panel_specs):
    sns.boxplot(data=results_df, x='topic', y=score_column, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', rotation=45)  # topic names are long; tilt them

# Tighten the layout, then set the horizontal gap between the panels
plt.tight_layout()
plt.subplots_adjust(wspace=0.2)

plt.show()

### Distribution of Toxic Score by content category

### Wordcloud based on content category

In [None]:
# NOTE: alias, not a copy -- columns added to df below also appear on reddit_df.
df = reddit_df
# get the title of the post that the comment is in (to sense what the post is talking about)
def get_title(link):
    """Return the sixth '/'-separated piece of ``link`` (index 5), the post title slug.

    Raises IndexError when the link has fewer than six pieces.
    """
    return link.split('/')[5]

def get_post(link):
    """Return a 'subreddit,post_id,title' key identifying the post of a comment link.

    The link is split on '/'; elements 2, 4 and 5 are taken as subreddit,
    post id and title slug. The post id is included because different posts
    can share the same title. Raises IndexError when the link is too short.
    """
    segments = link.split('/')
    return ','.join([segments[2], segments[4], segments[5]])

# Derive the post title and the unique post key from every comment link
df['title'] = df['link'].map(get_title)
df['post'] = df['link'].map(get_post)
df

In [None]:
# Number of comments per post
counts = df['post'].value_counts()

# Show the 10 posts with the most comments
print(counts.head(10))

In [None]:
"""
Post: singapore general elections 2020 polling results """


# Comments that belong to the GE2020 polling-results thread
ge2020_poll_results = df.loc[
    df['post'] == 'singapore,homxdq,singapore_general_elections_2020_polling_results'
]
print(ge2020_poll_results['text'])

#### Results
From the posts, the two large topics that emerge are:

1. **Singapore General Elections 2020**
   - Posts (based on the top 20):
     - 'singapore_general_elections_2020_polling_results'
     - 'ge2020_nomination_day_mega_thread'
     - 'ge_2020_political_debate_megathread'
   - Total number of comments (based on the top 20 posts): 1154
2. **COVID-19, especially dining restrictions**
   - Posts (based on the top 20):
     - 'no_dining_in_social_gatherings_capped_at_2_people...'
     - 'no_dining_in_social_group_sizes_cut_to_2_from...'
     - 'pm_lee_to_address_nation_on_covid19_situation_and...'
     - 'stabilisation_phase_extended_to_nov_21_more_time...'
     - 'covid19_diningin_group_size_limit_at_regular_fb...'
     - 'pm_lee_announces_new_stricter_restrictions_to...'
     - 'those_unvaccinated_against_covid19_will_no_longer...'
     - 'live_pm_lee_addresses_nation_on_covid19_situation...'
     - 'covid19_task_force_evaluating_timing_and_scope_of...'
   - Total number of comments (based on the top 20 posts): 934

In [None]:
# NOTE: `df_2` is another name for `reddit_df` (no copy is made), so column
# assignments made through `df_2` also modify `reddit_df`.
df_2 = reddit_df
df_2

In [None]:
# Split the comments by subreddit using the link prefix and print one sample link each.
# The trailing '/' is required: without it the prefix '/r/singapore' would also match
# links such as '/r/singaporehappenings/...', mixing that subreddit into `sg`.
sg = df_2[df_2['link'].str.startswith('/r/singapore/')].reset_index(drop=True)
print(sg['link'][0])
sgraw = df_2[df_2['link'].str.startswith('/r/SingaporeRaw/')].reset_index(drop=True)
print(sgraw['link'][0])
sghap = df_2[df_2['link'].str.startswith('/r/singaporehappenings/')].reset_index(drop=True)
print(sghap['link'][0])

In [None]:
def get_title(link):
    """Return the title slug of the post that a comment link belongs to.

    The link is split on '/' and the element at index 5 is returned, e.g.
    '/r/singapore/comments/homxdq/some_title/abc123/' -> 'some_title'.
    Raises IndexError when the link has fewer than six '/'-separated parts.
    """
    return link.split('/')[5]

def get_post(link):
    """Return a 'subreddit,post_id,title' key identifying the post of a comment link.

    The link is split on '/'; elements 2, 4 and 5 are taken as subreddit,
    post id and title slug. The post id is included because different posts
    can share the same title. Raises IndexError when the link is too short.
    """
    segments = link.split('/')
    return ','.join([segments[2], segments[4], segments[5]])

# Derive the post title and the unique post key from every comment link
df_2['title'] = df_2['link'].map(get_title)
df_2['post'] = df_2['link'].map(get_post)
df_2

In [None]:
# number of comments per post
counts = df_2['post'].value_counts()

# getting the top 20 posts with the most number of comments
print(counts[0:20])

In [None]:
# Build one DataFrame per content category by selecting the comments whose
# 'post' key ('subreddit,post_id,title') is in a hand-picked list of threads.
# Each selection is an explicit .copy() because later cells assign columns on
# these frames; writing into a filtered slice of df_2 would raise
# SettingWithCopyWarning.

ge2020_posts = df_2[df_2['post'].isin([
    'singapore,homxdq,singapore_general_elections_2020_polling_results',
    'singapore,hie65n,ge2020_nomination_day_mega_thread',
    'singapore,hj8h5p,ge_2020_political_debate_megathread',
])].copy()

covid_posts = df_2[df_2['post'].isin([
    'singapore,nc0vwe,no_dining_in_social_gatherings_capped_at_2_people',
    'singapore,onx8xr,no_dining_in_social_group_sizes_cut_to_2_from',
    'singapore,qbyerz,stabilisation_phase_extended_to_nov_21_more_time',
    'singapore,puh01k,covid19_diningin_group_size_limit_at_regular_fb',
    'singapore,fu4ch0,pm_lee_announces_new_stricter_restrictions_to',
    'singapore,q4e96u,those_unvaccinated_against_covid19_will_no_longer',
    'singapore,tlvwx9,live_pm_lee_addresses_nation_on_covid19_situation',
    'singapore,o104h0,covid19_task_force_evaluating_timing_and_scope_of',
    'singapore,nosjic,megathread_pm_lee_delivers_national_address_on',
])].copy()

nationalday_posts = df_2[df_2['post'].isin([
    'singapore,wjxxdd,megathread_national_day_parade_2022',
    'singapore,wtc8jy,megathread_national_day_rally_2022',
])].copy()

lifestyle_stress_posts = df_2[df_2['post'].isin([
    'singapore,hs2ynr,this_is_basically_the_entirety_of_an_average',
    # The original key ended with a stray space, so it could never equal a
    # value in df_2['post'] and this thread was silently dropped.
    'singapore,q3204h,whats_the_point_of_bringing_a_life_into_singapore',
])].copy()

violent_posts = df_2[df_2['post'].isin([
    'singapore,on8f8m,river_valley_high_school_student_killed_on_campus',
    'singapore,nj3o0j,someone_is_attacked_on_mrt',
])].copy()

misc_posts = df_2[df_2['post'].isin([
    'singapore,on2hbu,using_only_emojis_which_town_are_you_from',
])].copy()


In [None]:
# Summary statistics (mean, mode, median, quartiles) of every score column
# for every content category.
dfs = {
    'ge2020_posts': ge2020_posts,
    'covid_posts': covid_posts,
    'nationalday_posts': nationalday_posts,
    'lifestyle_stress_posts': lifestyle_stress_posts,
    'violent_posts': violent_posts,
    'misc_posts': misc_posts
}

score_columns = ['hateful Score', 'hateful HR', 'toxic Score', 'toxic HR']


def _first_mode(values):
    """Return the first value of `values.mode()`, or NaN when there is no mode.

    `Series.mode()` is empty for an empty or all-NaN column; indexing it with
    `[0]` (as the previous version did) raises in that case.
    """
    mode_values = values.mode()
    return mode_values.iloc[0] if not mode_values.empty else np.nan


# One record per (category, score column). A dedicated loop variable is used
# so the notebook-level `df` is not overwritten by this cell.
summary_rows = []
for category, category_df in dfs.items():
    for score_type in score_columns:
        values = category_df[score_type]
        summary_rows.append({
            'Content Category': category,
            'Score Type': score_type,
            'Mean': values.mean(),
            'Mode': _first_mode(values),
            'Median': values.median(),
            '25th Percentile': values.quantile(0.25),
            '75th Percentile': values.quantile(0.75),
        })

# Explicit column list keeps the column order stable even if there are no rows
new_df = pd.DataFrame(
    summary_rows,
    columns=[
        'Content Category',
        'Score Type',
        'Mean',
        'Mode',
        'Median',
        '25th Percentile',
        '75th Percentile',
    ],
)

# Display the summary table
new_df

In [None]:
# Box plots of the hateful and toxic score distributions per content category.
dfs = {
    'ge2020_posts': ge2020_posts,
    'covid_posts': covid_posts,
    'nationalday_posts': nationalday_posts,
    'lifestyle_stress_posts': lifestyle_stress_posts,
    'violent_posts': violent_posts,
    'misc_posts': misc_posts
}

# Label every row with its category and stack the frames. `assign` returns a
# new frame, so the per-category frames (filtered selections of the full
# dataset) are not written to; this avoids chained-assignment warnings and
# keeps the cell free of side effects on its inputs.
combined_df = pd.concat(
    [
        category_df.assign(**{'Content Category': category})
        for category, category_df in dfs.items()
    ],
    ignore_index=True,
)

# Long format for seaborn: one row per (category, score type, score)
long_df = combined_df.melt(
    id_vars=['Content Category'],
    value_vars=['hateful Score', 'toxic Score'],
    var_name='Score Type',
    value_name='Score',
)

# One subset per score type
hateful_df = long_df[long_df['Score Type'] == 'hateful Score']
toxic_df = long_df[long_df['Score Type'] == 'toxic Score']

# Same plot for both score types; only the data and the title differ
score_plots = [
    (hateful_df, 'Distribution of Hateful Score by Content Category'),
    (toxic_df, 'Distribution of Toxic Score by Content Category'),
]
for score_df, plot_title in score_plots:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=score_df, x='Score Type', y='Score', hue='Content Category', ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel('Score Type')
    ax.set_ylabel('Score')
    # Legend outside the axes so it does not cover the boxes
    ax.legend(title='Content Category', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

### Wordcloud of the most frequent words for each category

In [None]:
# Fetch the NLTK stopword corpus (no-op if it is already present) and keep the
# English stopwords as a set for fast membership tests
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
common_stopwords = set(english_stopword_list)


# (DataFrame, display title) pairs, one per content category
posts_data = list(zip(
    (
        ge2020_posts,
        covid_posts,
        nationalday_posts,
        lifestyle_stress_posts,
        violent_posts,
        misc_posts,
    ),
    (
        'GE2020 Posts',
        'COVID Posts',
        'National Day Posts',
        'Lifestyle Stress Posts',
        'Violent Posts',
        'Miscellaneous Posts',
    ),
))

In [None]:
# Subplot grid dimensions: two plots per row, as many rows as needed
n_cols = 2
n_posts = len(posts_data)
n_rows = math.ceil(n_posts / n_cols)


def preprocess_text(text):
    """Return `text` without the whitespace-separated tokens whose lowercase
    form is in `common_stopwords`; the remaining tokens are joined by single spaces.
    """
    return ' '.join(
        token for token in text.split() if token.lower() not in common_stopwords
    )


# Clean the 'text' column of every category frame in place:
# missing values become '' and stopwords are stripped
for frame, _ in posts_data:
    frame['text'] = frame['text'].fillna('').apply(preprocess_text)

In [None]:
# TF-IDF word cloud of all comments, one subplot per content category.

# Subplot grid: two plots per row, as many rows as needed
n_posts = len(posts_data)
n_cols = 2
n_rows = math.ceil(n_posts / n_cols)

plt.figure(figsize=(15, 7 * n_rows))

for i, (category_df, title) in enumerate(posts_data):
    # TF-IDF over the category's comments, limited to the 500 top terms
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_features=500)

    # Use a NaN-free Series locally instead of writing it back into the frame:
    # the frames are filtered selections, and assigning into them triggers
    # SettingWithCopyWarning and mutates shared state for no benefit.
    texts = category_df['text'].fillna('')
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts.values)

    # Total TF-IDF weight of each term across all comments, as a flat array.
    # np.asarray(...).ravel() works whether the sum is an np.matrix or an ndarray.
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    words = tfidf_vectorizer.get_feature_names_out()
    word_scores = dict(zip(words, tfidf_scores))

    # Word size is proportional to the summed TF-IDF weight
    wordcloud = WordCloud(width=1500, height=800, background_color='white', colormap='viridis').generate_from_frequencies(word_scores)

    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Hide the axes
    plt.title(title)

# Adjust layout and display
plt.tight_layout()
plt.show()

In [None]:
# TF-IDF word cloud of the toxic comments (toxic Score > 0), one subplot per
# content category.

# Subplot grid derived from the number of categories (two per row) instead of
# a hard-coded 3x2 layout, so adding a category does not break the figure.
n_cols = 2
n_rows = math.ceil(len(posts_data) / n_cols)

plt.figure(figsize=(18, 12))

for i, (category_df, title) in enumerate(posts_data):
    # Keep only comments with a positive toxic score
    toxic_comments = category_df[category_df['toxic Score'] > 0]

    if toxic_comments.empty:  # Nothing to plot for this category
        continue

    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_features=500)

    # Use a NaN-free Series locally; assigning it back into `toxic_comments`
    # (a filtered slice) would raise SettingWithCopyWarning.
    texts = toxic_comments['text'].fillna('')
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts.values)

    # Total TF-IDF weight of each term across all comments, as a flat array.
    # np.asarray(...).ravel() works whether the sum is an np.matrix or an ndarray.
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    words = tfidf_vectorizer.get_feature_names_out()
    word_scores = dict(zip(words, tfidf_scores))

    # Word size is proportional to the summed TF-IDF weight
    wordcloud = WordCloud(width=1500, height=800, background_color='white', colormap='viridis').generate_from_frequencies(word_scores)

    # Position i + 1 keeps each category in a fixed slot even when earlier ones are skipped
    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Hide the axes
    plt.title(title)

# Adjust layout and display
plt.tight_layout()
plt.show()

In [None]:
# TF-IDF word cloud of the hateful comments (hateful Score > 0), one subplot
# per content category.
posts_data = [
    (ge2020_posts, 'GE2020 Posts'),
    (covid_posts, 'COVID Posts'),
    (nationalday_posts, 'National Day Posts'),
    (lifestyle_stress_posts, 'Lifestyle Stress Posts'),
    (violent_posts, 'Violent Posts'),
    (misc_posts, 'Miscellaneous Posts')
]

# Subplot grid derived from the number of categories (two per row) instead of
# a hard-coded 3x2 layout, so adding a category does not break the figure.
n_cols = 2
n_rows = math.ceil(len(posts_data) / n_cols)

plt.figure(figsize=(18, 12))

for i, (category_df, title) in enumerate(posts_data):
    # Keep only comments with a positive hateful score
    hateful_comments = category_df[category_df['hateful Score'] > 0]

    if hateful_comments.empty:  # Nothing to plot for this category
        continue

    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_features=500)

    # Use a NaN-free Series locally; assigning it back into `hateful_comments`
    # (a filtered slice) would raise SettingWithCopyWarning.
    texts = hateful_comments['text'].fillna('')
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts.values)

    # Total TF-IDF weight of each term across all comments, as a flat array.
    # np.asarray(...).ravel() works whether the sum is an np.matrix or an ndarray.
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    words = tfidf_vectorizer.get_feature_names_out()
    word_scores = dict(zip(words, tfidf_scores))

    # Word size is proportional to the summed TF-IDF weight
    wordcloud = WordCloud(width=1500, height=800, background_color='white', colormap='viridis').generate_from_frequencies(word_scores)

    # Position i + 1 keeps each category in a fixed slot even when earlier ones are skipped
    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Hide the axes
    plt.title(title)

# Adjust layout and display
plt.tight_layout()
plt.show()

In [None]:
# TF-IDF word cloud of the toxic comments (toxic Score > 0) in `ge2020_posts`.
# NOTE(review): the plot title says "Overall", but only the GE2020 category is
# used as input here; confirm which one is intended.
toxic_comments = ge2020_posts[ge2020_posts['toxic Score'] > 0]

# TF-IDF limited to the 500 top terms
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_features=500)

# Use a NaN-free Series locally; assigning it back into `toxic_comments`
# (a filtered slice) would raise SettingWithCopyWarning.
texts = toxic_comments['text'].fillna('')
tfidf_matrix = tfidf_vectorizer.fit_transform(texts.values)

# Total TF-IDF weight of each term across all comments
tfidf_sum = tfidf_matrix.sum(axis=0)

# Map every term to its summed weight.
# np.asarray(...).ravel() works whether the sum is an np.matrix or an ndarray.
words = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = np.asarray(tfidf_sum).ravel()
word_scores = dict(zip(words, tfidf_scores))

# Word size is proportional to the summed TF-IDF weight
wordcloud = WordCloud(width=1500, height=800, background_color='white', colormap='viridis').generate_from_frequencies(word_scores)

# Plot the word cloud
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide the axes
plt.title('Word Cloud for Most Toxic Comments Overall')
plt.show()

In [None]:
# TF-IDF word cloud of the hateful comments (hateful Score > 0) in `ge2020_posts`.
# NOTE(review): the plot title says "Overall", but only the GE2020 category is
# used as input here; confirm which one is intended.
hateful_comments = ge2020_posts[ge2020_posts['hateful Score'] > 0]

# TF-IDF limited to the 500 top terms
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_features=500)

# Use a NaN-free Series locally; assigning it back into `hateful_comments`
# (a filtered slice) would raise SettingWithCopyWarning.
texts = hateful_comments['text'].fillna('')
tfidf_matrix = tfidf_vectorizer.fit_transform(texts.values)

# Total TF-IDF weight of each term across all comments
tfidf_sum = tfidf_matrix.sum(axis=0)

# Map every term to its summed weight.
# np.asarray(...).ravel() works whether the sum is an np.matrix or an ndarray.
words = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = np.asarray(tfidf_sum).ravel()
word_scores = dict(zip(words, tfidf_scores))

# Word size is proportional to the summed TF-IDF weight
wordcloud = WordCloud(width=1500, height=800, background_color='white', colormap='viridis').generate_from_frequencies(word_scores)

# Plot the word cloud
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide the axes
plt.title('Word Cloud for Most Hateful Comments Overall')
plt.show()

### Monthly Average Toxic Scores and Percentage of Comments Manually Removed

In [None]:
# Per 'year_month' group: share of manually removed comments and mean toxic /
# hateful score. (The variable name says "per_day", but the grouping key is
# 'year_month'.)
average_toxicity_and_moderation_per_day_df = (
    results_df
    .groupby(results_df['year_month'])
    .agg(
        average_num_removed=('is_manual_removed', 'mean'),
        average_toxic_score=('toxic_score', 'mean'),
        average_hateful_score=('hateful_score', 'mean'),
    )
    .reset_index()
    .assign(
        # mean of the removal flag expressed as a percentage
        removed_percentage=lambda monthly: monthly['average_num_removed'] * 100,
        # go through str first, then parse to datetime for date-based plotting
        year_month=lambda monthly: pd.to_datetime(monthly['year_month'].astype(str)),
    )
)

average_toxicity_and_moderation_per_day_df

In [None]:
# Dual-axis line chart: monthly average toxic score (left axis) against the
# percentage of comments manually removed (right axis).
fig, ax1 = plt.subplots()

# Left y-axis: average toxic score
ax1.plot(average_toxicity_and_moderation_per_day_df['year_month'],
         average_toxicity_and_moderation_per_day_df['average_toxic_score'],
         label='Toxic Score', color='green', alpha=0.7)
ax1.set_xlabel('Year')
ax1.set_ylabel('Average Toxic Score', color='green')
ax1.tick_params(axis='y', labelcolor='green')

# Right y-axis (shares the x-axis with ax1): percentage of comments removed
ax2 = ax1.twinx()
ax2.plot(average_toxicity_and_moderation_per_day_df['year_month'],
         average_toxicity_and_moderation_per_day_df['removed_percentage'],
         label='Number of Comments Manually Removed (%)', color='black', alpha=0.7)
ax2.set_ylabel('Number of Comments Manually Removed (%)', color='black')
ax2.tick_params(axis='y', labelcolor='black')

# One x tick every 3 months (on the 15th), labelled as year-month
ax1.xaxis.set_major_locator(mdates.MonthLocator(bymonthday=15, interval=3))  # Quarterly ticks
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))  # Format as year-month

# Rotate the x tick labels through tick_params so the rotation is stored on the
# axis and also applies to ticks created by the locator set above.
# plt.xticks(rotation=...) only updates the tick labels that exist at call time.
ax1.tick_params(axis='x', labelrotation=60)

# Turn off x-axis tick labels for ax2 to prevent duplication
ax2.get_xaxis().set_visible(False)

# Set the title on an explicit axes rather than on whichever axes is "current"
ax1.set_title('Monthly Average Toxic Scores and Percentage of Comments Manually Removed')

# Use tight_layout for better spacing
fig.tight_layout()

# Show the plot
plt.show()

### Toxicity Score against % Comments Manually Removed by Username

In [None]:
# Number of manually removed comments per (year_month, subreddit)
monthly_manual_removed = (
    results_df
    .groupby(['year_month', 'subreddit_name'])['is_manual_removed']
    .sum()
    .reset_index()
    .rename(columns={'is_manual_removed': 'count_manual_removed'})
)

# One line per subreddit, coloured with the shared custom palette
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(
    data=monthly_manual_removed,
    x='year_month',
    y='count_manual_removed',
    hue='subreddit_name',
    marker='o',
    palette=custom_palette,
    ax=ax,
)

# Labels, title and tick rotation
ax.set_xlabel('Year-Month')
ax.set_ylabel('Count of Manually Removed Comments')
ax.set_title('Monthly Count of Manually Removed Comments by Subreddit')
plt.xticks(rotation=45)  # Rotate for readability

# Legend outside the plotting area, gridlines on
ax.legend(title='Subreddit', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Cast the removal flag to float (in place on results_df) so its mean can be taken
results_df['is_manual_removed'] = results_df['is_manual_removed'].astype(float)

# Per username: mean toxic score, share of removed comments, and the subreddit
# the user appears in most often
comments_by_user = results_df.groupby('username')
average_toxic_data_by_user = comments_by_user.agg(
    average_toxic_score=('toxic_score', 'mean'),
    percentage_is_manual_removed=('is_manual_removed', 'mean'),
    most_frequent_subreddit=('subreddit_name', lambda x: x.value_counts().idxmax()),
)

# Express the removal share as a percentage
average_toxic_data_by_user['percentage_is_manual_removed'] = (
    average_toxic_data_by_user['percentage_is_manual_removed'] * 100
)

average_toxic_data_by_user.head()

In [None]:
# Colour assigned to each subreddit
palette = {
    'r/singapore': "#4c72b0",
    'r/singaporeraw': "#55a868",
    'r/singaporehappenings': "#c44e52",
}

# Colour of every user = colour of the subreddit they appear in most often
colors = average_toxic_data_by_user['most_frequent_subreddit'].map(palette)

# One point per user: average toxic score vs percentage of removed comments
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    average_toxic_data_by_user['average_toxic_score'],
    average_toxic_data_by_user['percentage_is_manual_removed'],
    c=colors,
    alpha=0.6,
)

ax.set_xlabel('Average Toxic Score')
ax.set_ylabel('Percentage of Comments Manually Removed')
ax.set_title('Scatter Plot of Average Toxic Score vs. Percentage of Comments Manually Removed by Username')

# Manual legend: one coloured patch per subreddit
legend_elements = []
for subreddit, color in palette.items():
    legend_elements.append(Patch(facecolor=color, label=subreddit))
ax.legend(handles=legend_elements, title="Subreddit")

plt.show()

### Post Count by subreddit

In [None]:
# Number of rows in reddit_df for every subreddit_id
subreddit_counts = (
    reddit_df
    .groupby('subreddit_id')
    .size()
    .reset_index(name='count')
)

# Print the table
print(subreddit_counts)