# Setup

In [1]:
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import ast
import seaborn as sns

Read in the two DataFrames

In [2]:
# Both CSV exports are parsed with the same reader settings.
# NOTE(review): forcing '\n' as the only line terminator is presumably why headers such as
# 'moderation\r' keep a stray carriage return — confirm before changing it.
csv_read_options = {'lineterminator': '\n', 'encoding': 'utf8'}

# Comment dump that still includes comments whose text was removed
with_removed_df = pd.read_csv(
    '../../data/Reddit_cleaned_includes_removed.csv',
    **csv_read_options,
)

# Comments with hateful/toxic scores and topic-modelling results attached
topic_results_df = pd.read_csv(
    '../../data/topic_model_results.csv',
    **csv_read_options,
)

In [3]:
# Report (rows, columns) for each loaded frame: the comment dump first, then the topic results
for loaded_frame in (with_removed_df, topic_results_df):
    print(loaded_frame.shape)

(439642, 9)
(396298, 15)


Explore the DataFrame that includes removed comments

In [4]:
# Preview the first rows of the frame that still contains removed comments
with_removed_df.head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r
0,i think most singaporeans dont give a damn who...,2020-04-11 15:49:23,invigo79,/r/singapore/comments/fz7vtl/im_quite_interest...,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '..."
1,fair point the secrecy aspect of it slipped my...,2020-04-03 09:59:08,potatetoe_tractor,/r/singapore/comments/fu3axm/government_to_tab...,t3_fu3axm,t1_fmasya5,fmau5k3,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '..."
2,range,2020-02-15 15:07:03,CrossfittJesus,/r/singapore/comments/f4ac70/what_is_ps_defens...,t3_f4ac70,t3_f4ac70,fhp05xc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '..."
3,removed,2020-01-03 16:44:46,[deleted],/r/singapore/comments/ej54sm/rsingapore_random...,t3_ej54sm,t1_fcxcdrx,fcxsm0g,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c..."
4,gt this is binary thinking because you think t...,2020-06-04 07:07:39,nomad80,/r/singapore/comments/gw55cx/notoracism/fsu4fyd/,t3_gw55cx,t1_fsu3dsf,fsu4fyd,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '..."


In [5]:
# Rows whose 'text' contains "removed" anywhere (case-insensitive); missing text counts as no match
mentions_removed = with_removed_df['text'].str.contains('removed', case=False, na=False)
filtered_df = with_removed_df[mentions_removed]

# Distinct authors of those rows
unique_usernames = filtered_df['username'].unique()

In [6]:
# Show the distinct usernames behind rows whose text contains "removed"
print(unique_usernames)

['[deleted]' 'AutoModerator' 'Pvt_Twinkietoes' 'Taellion'
 'ModeratelyHelpfulBot' 'aswlwlwl' 'raptorized' 'tatabusa'
 'KOREANPUBLICSCHOOL' 'inspektordi' 'mildfull' 'bonkers05' 'ybct'
 'rockymountain05' 'troublesome58' 'QzSG' 'dodgethis_sg' 'ryuuheii'
 'prime5119' 'DuePomegranate' 'KeythKatz' '-_af_-' 'Dependent-Analysis92'
 'charroxgrin' 'sageadam' 'Strangeronthebus2019' 'Embarrassed-Chain268'
 'pannerin' 'woshiooqi' 'smolfluffyhakutaku' 'thorsten139'
 'eccentric_eggplant' 'MangoDangoLango' 'coolbakerguy97' 'FanoftheFalls'
 'Foxie13x' 'ngrenjie' 'linglingmozartybae' 'lawlianne' 'uncleemperor'
 'deepfriedceleron' 'sp______ce' 'deangsana' 'Traxgen' 'darklajid'
 'IAmNoTMistaken77' 'RedFaceGeneral' 'zachtan1234' 'jieqint'
 'hornychestnut' 'pjayaredee' 'brownriver12' 'oofmewho' 'yewjrn'
 'crescentgirl' 'Swanriverperth' 'BreakWindow' 'TheMisterPotato'
 'ElopeToTheMoon' 'cooksncremeshake' 'LozSpagatee' 'killing_my_dreams'
 'philosophyOfOneness' 'sneakpeek_bot' 'Projectenzo' 'teestooshort'
 'P

In [7]:
# Number of distinct, non-missing usernames among the filtered rows
unique_username_count = len(filtered_df['username'].dropna().unique())

In [8]:
# Report how many distinct usernames appear in the filtered rows
count_label = "Count of unique usernames:"
print(count_label, unique_username_count)

Count of unique usernames: 534


Add a new column flagging comments whose text contains "removed"

In [9]:
# Flag every comment whose text contains "removed" (case-insensitive; missing text -> False).
# NOTE(review): this is a substring match, so it also flags longer comments that merely contain
# the word (e.g. AutoModerator's "your submission has been removed because ..." notice) —
# confirm that is what 'is_manual_removed' is meant to capture.
text_has_removed = with_removed_df['text'].str.contains('removed', case=False, na=False)
with_removed_df['is_manual_removed'] = text_has_removed.astype(bool)

In [10]:
# Preview the rows that were just flagged; the column is boolean, so it can be used as the mask directly
with_removed_df.loc[with_removed_df['is_manual_removed']].head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r,is_manual_removed
3,removed,2020-01-03 16:44:46,[deleted],/r/singapore/comments/ej54sm/rsingapore_random...,t3_ej54sm,t1_fcxcdrx,fcxsm0g,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",True
35,removed,2021-03-03 01:07:41,[deleted],/r/singapore/comments/lvx1ai/london_mayor_earn...,t3_lvx1ai,t3_lvx1ai,gphdjxb,t5_2qh8c,"{'collapsed_reason_code': 'DELETED', 'collapse...",True
44,removed,2020-07-11 12:51:44,[deleted],/r/singapore/comments/hp74jj/the_hero_we_need_...,t3_hp74jj,t1_fxn50q9,fxnuenc,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",True
47,your submission has been removed because it do...,2020-07-01 13:12:38,AutoModerator,/r/singapore/comments/hj9wat/is_18m_considered...,t3_hj9wat,t3_hj9wat,fwkw3sw,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",True
60,removed,2020-12-26 09:44:10,[deleted],/r/singapore/comments/kk6tmr/rsingapore_random...,t3_kk6tmr,t1_gh1clxk,gh282rq,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",True


In [11]:
# Preview the frame again to confirm the new 'is_manual_removed' column is present
with_removed_df.head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r,is_manual_removed
0,i think most singaporeans dont give a damn who...,2020-04-11 15:49:23,invigo79,/r/singapore/comments/fz7vtl/im_quite_interest...,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False
1,fair point the secrecy aspect of it slipped my...,2020-04-03 09:59:08,potatetoe_tractor,/r/singapore/comments/fu3axm/government_to_tab...,t3_fu3axm,t1_fmasya5,fmau5k3,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False
2,range,2020-02-15 15:07:03,CrossfittJesus,/r/singapore/comments/f4ac70/what_is_ps_defens...,t3_f4ac70,t3_f4ac70,fhp05xc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False
3,removed,2020-01-03 16:44:46,[deleted],/r/singapore/comments/ej54sm/rsingapore_random...,t3_ej54sm,t1_fcxcdrx,fcxsm0g,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",True
4,gt this is binary thinking because you think t...,2020-06-04 07:07:39,nomad80,/r/singapore/comments/gw55cx/notoracism/fsu4fyd/,t3_gw55cx,t1_fsu3dsf,fsu4fyd,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False


Merge the two DataFrames

In [12]:
# Preview the scored/topic-modelled frame before merging
topic_results_df.head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r,hateful Score,hateful HR,toxic Score,toxic HR\r,Topic Number,Topic\r
0,i think most singaporeans dont give a damn who...,11/4/2020 15:49,invigo79,/r/singapore/comments/fz7vtl/im_quite_interest...,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",-0.582897,0,-0.419338,0,0,Political\r
1,fair point the secrecy aspect of it slipped my...,3/4/2020 9:59,potatetoe_tractor,/r/singapore/comments/fu3axm/government_to_tab...,t3_fu3axm,t1_fmasya5,fmau5k3,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",-1.116736,0,-1.869363,0,4,Relationships\r
2,range,15/2/2020 15:07,CrossfittJesus,/r/singapore/comments/f4ac70/what_is_ps_defens...,t3_f4ac70,t3_f4ac70,fhp05xc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",-1.027191,0,-0.798016,0,4,Relationships\r
3,gt this is binary thinking because you think t...,4/6/2020 7:07,nomad80,/r/singapore/comments/gw55cx/notoracism/fsu4fyd/,t3_gw55cx,t1_fsu3dsf,fsu4fyd,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",-0.419287,0,1.119165,1,5,Crime\r
4,boo boo poor u lmao,31/10/2020 13:52,pirorok,/r/singapore/comments/jl6abo/rsingapore_random...,t3_jl6abo,t1_gap4e9y,gap4vkl,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",-0.952112,0,1.197503,1,9,Unknown\r


In [13]:
# Score/topic columns to blank out for comments flagged as removed.
# The '\r' suffixes match the raw column headers as they were read from the CSV.
columns_to_set_na = ['hateful Score', 'hateful HR', 'toxic Score', 'toxic HR\r', 'Topic Number', 'Topic\r']

# Comments in with_removed_df whose text contains "removed" (case-insensitive; missing text -> False)
removed_mask = with_removed_df['text'].str.contains('removed', case=False, na=False)

# The two frames have different lengths and row order, so a mask indexed like with_removed_df
# must not be applied to topic_results_df directly: pandas would align it by row label and blank
# unrelated comments. Translate the mask through the shared 'id' key instead.
removed_ids = with_removed_df.loc[removed_mask, 'id']
removed_in_topic_results = topic_results_df['id'].isin(removed_ids)

# Blank the scores in place (topic_results_df itself is modified) before the merge
topic_results_df.loc[removed_in_topic_results, columns_to_set_na] = np.nan

In [17]:
# Outer-join on the comment 'id' so rows from both frames are kept. Columns present in both
# frames receive the '_duplicate' suffix on the topic_results_df side.
results_df = with_removed_df.merge(
    topic_results_df,
    on='id',
    how='outer',
    suffixes=('', '_duplicate'),
)

# Keep only the with_removed_df copy of the shared columns
duplicate_columns = [name for name in results_df.columns if '_duplicate' in name]
results_df = results_df.drop(columns=duplicate_columns)

In [None]:
# Lift pandas' display limits so wide frames are shown in full:
# no cap on the number of columns, and no fixed display width.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

In [30]:
# Show the merged frame through the notebook's rich table display rather than plain-text print();
# pandas shortens long frames to their first and last rows on its own.
results_df

                                                     text  \
0                                  the female or male cry   
1                                              double pay   
2                           this is one crazy bad morning   
3                                                 removed   
4       the crazy thing is most people on pmd really d...   
...                                                   ...   
439637  are you really gay or just playing around for ...   
439638  dont give me whatsboutism\n\nit is not traditi...   
439639  excellent should be compulsory for all singapo...   
439640                                                yes   
439641                             report to police db no   

                  timestamp        username  \
0       2020-01-01 00:12:59    Tempestuous-   
1       2020-01-01 00:38:33    Plyergamer27   
2       2020-01-01 00:42:09         gmdotes   
3       2020-01-01 00:48:57       [deleted]   
4       2020-01-01 00:55:10     

In [None]:
# Sanity check: look up one comment that was not flagged as removed and confirm its scores came through
results_df.loc[results_df['id'].eq('fn3gbrg')]

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r,is_manual_removed,hateful Score,hateful HR,toxic Score,toxic HR\r,Topic Number,Topic\r
29632,i think most singaporeans dont give a damn who...,2020-04-11 15:49:23,invigo79,/r/singapore/comments/fz7vtl/im_quite_interest...,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,-0.582897,0.0,-0.419338,0.0,0.0,Political\r


In [34]:
# Same spot check for a comment whose text is "removed"
results_df.loc[results_df['id'].eq('fcxsm0g')]

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r,is_manual_removed,hateful Score,hateful HR,toxic Score,toxic HR\r,Topic Number,Topic\r
529,removed,2020-01-03 16:44:46,[deleted],/r/singapore/comments/ej54sm/rsingapore_random...,t3_ej54sm,t1_fcxcdrx,fcxsm0g,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",True,,,,,,


Continue with the rest of the setup

In [35]:
# Column headers still carry the '\r' read from the CSV; trim surrounding whitespace from every name
results_df.columns = [column_name.strip() for column_name in results_df.columns]

# The 'Topic' values end in the same '\r', so trim them too (missing values stay missing)
topic_labels = results_df['Topic']
results_df['Topic'] = topic_labels.str.strip()

results_df.head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation,is_manual_removed,hateful Score,hateful HR,toxic Score,toxic HR,Topic Number,Topic
0,the female or male cry,2020-01-01 00:12:59,Tempestuous-,/r/singapore/comments/ei9klf/rsingapore_random...,t3_ei9klf,t1_fcohcxx,fcoor03,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,-1.216313,0.0,-0.181924,0.0,2.0,Race & Religion
1,double pay,2020-01-01 00:38:33,Plyergamer27,/r/singapore/comments/ei5lec/lets_take_a_momen...,t3_ei5lec,t3_ei5lec,fcoqujj,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",False,-1.063543,0.0,-0.499782,0.0,2.0,Race & Religion
2,this is one crazy bad morning,2020-01-01 00:42:09,gmdotes,/r/singapore/comments/ehsyiu/rsingapore_random...,t3_ehsyiu,t1_fcnrh63,fcor57l,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,-1.007145,0.0,-0.369454,0.0,4.0,Relationships
3,removed,2020-01-01 00:48:57,[deleted],/r/singapore/comments/eibm65/premiumflea_marke...,t3_eibm65,t3_eibm65,fcorov8,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",True,,,,,,
4,the crazy thing is most people on pmd really d...,2020-01-01 00:55:10,jupiter1_,/r/singapore/comments/ei0oby/teen_pmd_rider_hi...,t3_ei0oby,t3_ei0oby,fcos6ik,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,-1.078415,0.0,-0.348272,0.0,1.0,Covid-19


In [36]:
# Map the score/topic headers onto snake_case names
snake_case_names = {
    'hateful Score': 'hateful_score',
    'hateful HR': 'hateful_prediction',
    'toxic Score': 'toxic_score',
    'toxic HR': 'toxic_prediction',
    'Topic Number': 'topic_number',
    'Topic': 'topic',
}
results_df = results_df.rename(columns=snake_case_names)

results_df.head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation,is_manual_removed,hateful_score,hateful_prediction,toxic_score,toxic_prediction,topic_number,topic
0,the female or male cry,2020-01-01 00:12:59,Tempestuous-,/r/singapore/comments/ei9klf/rsingapore_random...,t3_ei9klf,t1_fcohcxx,fcoor03,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,-1.216313,0.0,-0.181924,0.0,2.0,Race & Religion
1,double pay,2020-01-01 00:38:33,Plyergamer27,/r/singapore/comments/ei5lec/lets_take_a_momen...,t3_ei5lec,t3_ei5lec,fcoqujj,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",False,-1.063543,0.0,-0.499782,0.0,2.0,Race & Religion
2,this is one crazy bad morning,2020-01-01 00:42:09,gmdotes,/r/singapore/comments/ehsyiu/rsingapore_random...,t3_ehsyiu,t1_fcnrh63,fcor57l,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,-1.007145,0.0,-0.369454,0.0,4.0,Relationships
3,removed,2020-01-01 00:48:57,[deleted],/r/singapore/comments/eibm65/premiumflea_marke...,t3_eibm65,t3_eibm65,fcorov8,t5_2qh8c,"{'removal_reason': None, 'collapsed': True, 'c...",True,,,,,,
4,the crazy thing is most people on pmd really d...,2020-01-01 00:55:10,jupiter1_,/r/singapore/comments/ei0oby/teen_pmd_rider_hi...,t3_ei0oby,t3_ei0oby,fcos6ik,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,-1.078415,0.0,-0.348272,0.0,1.0,Covid-19


In [39]:
# Parse the timestamp strings with their exact layout
results_df['timestamp'] = pd.to_datetime(results_df['timestamp'], format='%Y-%m-%d %H:%M:%S')

# NOTE(review): this writes the column back as a list of its own values, which appears to leave
# it unchanged — confirm whether the dict-like strings were meant to be parsed here.
results_df['moderation'] = results_df['moderation'].tolist()

# Target dtype for every remaining column.
# NOTE(review): depending on the pandas version, astype(str) may turn missing values into the
# literal string 'nan' — confirm that is acceptable for the text/identifier columns.
target_dtypes = {
    'text': str,
    'username': str,
    'link': str,
    'link_id': str,
    'parent_id': str,
    'id': str,
    'subreddit_id': str,
    'hateful_score': 'float32',
    'hateful_prediction': 'Int16',  # nullable integer, keeps missing predictions
    'toxic_score': 'float32',
    'toxic_prediction': 'Int16',  # nullable integer, keeps missing predictions
    'topic_number': 'category',
    'topic': 'category',
}
results_df = results_df.astype(target_dtypes)

In [40]:
# Normalise to datetime first so this cell can be re-run: once it has run, 'timestamp' holds
# plain date objects (object dtype), which have no .dt accessor.
timestamps = pd.to_datetime(results_df['timestamp'])

# Monthly period label per comment, for tracking moderation actions over time
results_df['year_month'] = timestamps.dt.to_period('M')

# Keep only the calendar date in 'timestamp'
results_df['timestamp'] = timestamps.dt.date

results_df['timestamp'].head()

0    2020-01-01
1    2020-01-01
2    2020-01-01
3    2020-01-01
4    2020-01-01
Name: timestamp, dtype: object

# 1. Toxicity and manual moderation over time

# 2. Manual moderation on the three major events vs. an "average" day

# 3. Manual moderation plotted against toxicity