## Data Analysis

In [1]:
import json
import numpy as np
import pandas as pd

from collections import Counter
from tqdm import tqdm
tqdm.pandas()

In [2]:
full_data = pd.read_csv("0_resources/0_dataset/full.csv")

### Subreddit

In [3]:
print(f"Number of subreddits included in the data: {len(full_data.subreddit.unique())}")

Number of subreddits included in the data: 26


In [4]:
SUBREDDIT_CUTOFF = 5

In [5]:
subreddits = pd.DataFrame(
    {
        'Comments': full_data['subreddit'].value_counts(),
        '% of Data': full_data['subreddit'].value_counts(normalize=True)*100
    },
    index=full_data['subreddit'].value_counts().index
)
other_count = np.sum(subreddits[subreddits['Comments'] < SUBREDDIT_CUTOFF].Comments)
others = list(subreddits[subreddits['Comments'] < SUBREDDIT_CUTOFF].index)
subreddits = subreddits[subreddits['Comments'] > SUBREDDIT_CUTOFF]

subreddits.loc['Other'] = [other_count,  other_count/30]

#### Calculate % of Attacks

In [6]:
def calculate_pct_by_grouper(df, col_to_group_by, target_col='maj_label_attack'):
    raw = df.groupby([col_to_group_by, target_col]).id.count().unstack(fill_value=0).stack()
    summary_df = pd.DataFrame(raw, columns=['Comments'])
    total = df.groupby([col_to_group_by]).id.count()
    summary_df['% of Selection'] = raw.div(total, level=col_to_group_by) * 100
    return summary_df

In [7]:
attack_summary_df = calculate_pct_by_grouper(full_data, 'subreddit')
attacks = []
for idx, row in subreddits.iterrows():
    try:
        attacks.append(attack_summary_df.loc[idx].loc['Yes']['% of Selection'])
    except:
        other_attacks = []
        for other_subreddit in others:
            other_attacks.append(attack_summary_df.loc[other_subreddit].loc['Yes']['% of Selection'])
        attacks.append(np.mean(other_attacks))    
            
subreddits['% Attacks'] = attacks

In [8]:
pd.options.display.float_format = "{:,.1f}".format
subreddits

Unnamed: 0,Comments,% of Data,% Attacks
indonesia,12561.0,83.7,39.7
malaysia,1389.0,9.3,51.1
malaygonewild,272.0,1.8,61.4
singapore,239.0,1.6,26.8
MalaysGoneWild,201.0,1.3,54.2
Ajar_Malaysia,89.0,0.6,23.6
MalaysianFappers,49.0,0.3,57.1
malaysians,35.0,0.2,37.1
NegarakuMalaysia,35.0,0.2,37.1
SeksiArtisMalaysia,24.0,0.2,79.2


In [9]:
# for idx, row in subreddits.iterrows():
#     print(f"{idx} & {int(row[0]):,.0f} & {row[1]:.1f} & {row[2]:.1f} \\\\")

### Language

In [10]:
LANGUAGE_CUTOFF = 10

In [11]:
print(f"Number of unique language combinations included in the data: {len(full_data.final_label_language.unique())}")

Number of unique language combinations included in the data: 88


In [12]:
# full_data.final_label_language.unique()

In [13]:
languages = pd.DataFrame(
    {
        'Comments': full_data['final_label_language'].value_counts(),
        '% of Data': full_data['final_label_language'].value_counts(normalize=True)*100
    },
    index=full_data['final_label_language'].value_counts().index
)
other_count = np.sum(languages[languages['Comments'] < LANGUAGE_CUTOFF].Comments)
others = list(languages[languages['Comments'] < LANGUAGE_CUTOFF].index)
languages = languages[languages['Comments'] > LANGUAGE_CUTOFF]

languages.loc['Other Languages'] = [other_count,  other_count/30]

In [14]:
pd.options.display.float_format = "{:,.1f}".format
languages

Unnamed: 0,Comments,% of Data
Indonesian,12212.0,81.4
Malay,1635.0,10.9
"('English', 'Indonesian')",220.0,1.5
Singlish,218.0,1.5
Javanese,92.0,0.6
"['English', 'Indonesian']",90.0,0.6
"['Indonesian', 'English']",86.0,0.6
English,85.0,0.6
"['Malay', 'English']",84.0,0.6
Sundanese,46.0,0.3


In [15]:
# for idx, row in languages.iterrows():
#     print(f"{idx} & {int(row[0]):,.0f} & {row[1]:.1f} \\\\")

### Time

In [16]:
full_data['year'] = full_data['date'].apply(lambda x: x[:-6])
print(f"Number of years included in the data: {len(full_data.year.unique())}")

Number of years included in the data: 12


In [17]:
timestamps = list(full_data['date'].unique())
timestamps.sort()
print(f"Earliest comment in the data: {timestamps[0]}. Most recent comment: {timestamps[-1]}")

Earliest comment in the data: 2011-05-19. Most recent comment: 2022-08-31


In [18]:
time = pd.DataFrame(
    {
        'Comments': full_data['year'].value_counts(),
        '% of Data': full_data['year'].value_counts(normalize=True)*100
    },
    index=full_data['year'].value_counts().index
)

In [19]:
attack_summary_df = calculate_pct_by_grouper(full_data, 'year')
attacks = []
for idx, row in time.iterrows():
    attacks.append(attack_summary_df.loc[idx].loc['Yes']['% of Selection'])
            
time['% Attacks'] = attacks
time = time.sort_index(ascending=False)

In [20]:
time

Unnamed: 0,Comments,% of Data,% Attacks
2022,3672,24.5,38.7
2021,4142,27.6,39.6
2020,3028,20.2,42.1
2019,2084,13.9,40.9
2018,1076,7.2,46.7
2017,705,4.7,51.9
2016,101,0.7,44.6
2015,113,0.8,39.8
2014,61,0.4,32.8
2013,10,0.1,30.0


In [21]:
# for idx, row in time.iterrows():
#     print(f"{idx} & {int(row[0]):,.0f} & {row[1]:.1f} & {row[2]:.1f} \\\\")

### User

In [22]:
users = len(full_data.author.unique())
print(f"Number of individual users contributing to the data: {users:,.0f}")

Number of individual users contributing to the data: 5,307


In [23]:
author_counts = full_data.groupby(['author']).id.count()
counts, bins = np.histogram(author_counts, bins=np.linspace(1,11,11))
labels = ['1','2','3','4','5','6','7','8','9','10+']
author = pd.DataFrame(zip(labels,counts,(counts/users)*100), columns=['Comments', 'Users', '% of Users'])
author.set_index('Comments')

Unnamed: 0_level_0,Users,% of Users
Comments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3303,62.2
2,763,14.4
3,376,7.1
4,194,3.7
5,150,2.8
6,105,2.0
7,63,1.2
8,46,0.9
9,36,0.7
10+,70,1.3


In [24]:
# for idx, row in author.iterrows():
#     print(f"{row[0]} & {int(row[1]):,.0f} & {row[2]:.1f} \\\\")

In [25]:
print(f"Comments from the user most often represented in the dataset: {author_counts.max():,.0f}")

Comments from the user most often represented in the dataset: 179


### Attacks

In [26]:
attacks = len(full_data[full_data['maj_label_attack'] == 'Yes'])
print(f"{attacks:,.0f} ({(attacks/len(full_data))*100:.2f}%) comments were majority-labelled as containing attacks.")

6,173 (41.15%) comments were majority-labelled as containing attacks.


In [27]:
def convert_to_majority(entry):
    try:
        entry = json.loads(entry.replace('nan', "'nan'").replace('\'', "\""))
        if Counter(entry)['Yes'] > 1:
            return 1
    except: 
        if (entry.count('[nan,')+entry.count(' nan,')+entry.count('nan]')) < 1:
            return 1
    return 0

In [28]:
attack_types = [
    ('attack_person', 'Attack on a Person'),
    ('attack_media', 'Attack on Media'), 
    ('attack_protected', 'Attack on a Protected Group'),
    ('attack_institution', 'Attack on an Institution'), 
    ('attack_other', 'Other Attacks')
]

In [29]:
for attack_type, attack_description in attack_types:
    full_data[attack_type + "_maj"] = full_data[attack_type].progress_apply(lambda x: convert_to_majority(x))

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:00<00:00, 408931.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:00<00:00, 484479.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:00<00:00, 488489.84it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:00<00:00, 477555.24it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:00<00:00, 465117.32it/s]


In [30]:
len(full_data[full_data["attack_person_maj"] == 1])

4356

In [31]:
attack_type_analysis = []
for attack_type, attack_description in attack_types:
    results = [attack_description]
    entries = full_data[full_data[attack_type + "_maj"] == 1]
    results.append(len(entries))
    results.append((len(entries)/attacks)*100)
    attack_type_analysis.append(results)
df_columns = ['Attack Type', 'Comments', '% of Data']
attack_type_analysis = pd.DataFrame(attack_type_analysis, columns=df_columns)

In [32]:
attack_type_analysis.set_index('Attack Type')

Unnamed: 0_level_0,Comments,% of Data
Attack Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Attack on a Person,4356,70.6
Attack on Media,78,1.3
Attack on a Protected Group,534,8.7
Attack on an Institution,428,6.9
Other Attacks,14,0.2


In [33]:
# for idx, row in attack_type_analysis.iterrows():
#     print(f"{row[0]} & {int(row[1]):,.0f} & {row[2]:.1f} \\\\")

In [34]:
no_attack_type = len(
    full_data[
        (full_data['maj_label_attack'] == 'Yes') &
        (full_data['attack_person_maj'] == 0) & 
        (full_data['attack_media_maj'] == 0) & 
        (full_data['attack_protected_maj'] == 0) & 
        (full_data['attack_institution_maj'] == 0) & 
        (full_data['attack_other_maj'] == 0) 
    ]
)

In [35]:
print(f"Comments majority-voted as containing attacks without a majority-voted attack type: {no_attack_type:,.0f}")

Comments majority-voted as containing attacks without a majority-voted attack type: 1,199
