In [85]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [86]:
import json
import itertools
import os
import warnings

import numpy as np
import pandas as pd

from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

from data import _preprocess, COLUMNS

### LOAD TEST SET

In [87]:
TEST_DATA_FILE_NAME = '../data/temporal/preprocessed_test.pkl'
test_df = _preprocess(pd.read_pickle(TEST_DATA_FILE_NAME))

# TODO fix this upstream somewhere
test_df['answer_pp_toxicity'] = test_df['answer_toxicity']

# extract language from filename
test_df['lang'] = test_df['filename'].str.extract(r'batch_(..)_\d+\.csv')
test_df['lang'].unique()

array(['nl', 'en', 'es', 'tr', 'ar', 'de'], dtype=object)

In [91]:
TRAIN_DATA_FILE_NAME = '../data/temporal/preprocessed_train.pkl'
train_df = _preprocess(pd.read_pickle(TRAIN_DATA_FILE_NAME))
test_df['lang'] = test_df['filename'].str.extract(r'batch_(.*)_ChatGPT_hate_speech_or_not_\d+.csv')


0          batch_English_ChatGPT_hate_speech_or_not_14.csv
1           batch_Turkish_ChatGPT_hate_speech_or_not_3.csv
2          batch_English_ChatGPT_hate_speech_or_not_56.csv
3         batch_English_ChatGPT_hate_speech_or_not_188.csv
4         batch_English_ChatGPT_hate_speech_or_not_186.csv
                                ...                       
127467    batch_English_ChatGPT_hate_speech_or_not_173.csv
127468     batch_Spanish_ChatGPT_hate_speech_or_not_12.csv
127469      batch_Turkish_ChatGPT_hate_speech_or_not_3.csv
127470     batch_English_ChatGPT_hate_speech_or_not_87.csv
127471     batch_English_ChatGPT_hate_speech_or_not_10.csv
Name: filename, Length: 127472, dtype: object


### Select data

In [89]:
df = train_df

### Calculate statistics

In [90]:
st_counts = df.groupby('lang')[['st_id']].nunique().rename(columns={'st_id': 'Nr. sub-threads'})
st_counts

KeyError: 'lang'

In [26]:
by_st = df.groupby(['lang', 'st_id'])[['comment_id']].nunique().reset_index()  # contains nr of comments for each sub thread
by_lang = by_st.groupby('lang')[['comment_id']]                                     # grouped by language
avg_length = by_lang.mean().rename(columns={'comment_id': 'Avg. Length'})
min_length = by_lang.min().rename(columns={'comment_id': 'Min. length'})
max_length = by_lang.max().rename(columns={'comment_id': 'Max. length'})
st_length_df = pd.concat([avg_length, min_length, max_length], axis=1)
st_length_df

Unnamed: 0_level_0,Avg. Length,Min. length,Max. length
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ar,5.5,4,10
de,5.216495,4,10
en,5.098039,4,10
es,5.43956,4,10
nl,5.303797,4,9
tr,5.232558,4,10


In [46]:
# calculate counts for toxicity ('Yes/Maybe' vs. 'No'), based on majority vote
def majority_vote(series):
    modes = series.mode()
    if len(modes) == 1:
        return modes.iloc[0]
    elif len(modes) == 0:
        return None
    return "{" + ",".join(str(v) for v in modes.tolist()) + "}"

COLS = ['lang', 'comment_id', 'answer_pp_toxicity', 'answer_pp_counternarrative', 'answer_pp_justInappropriate', 'answer_pp_hasImplication', 'answer_pp_hasOther']
maj_df = df[COLS].groupby(['lang', 'comment_id']).agg(majority_vote).reset_index()

toxicity = maj_df.groupby(['lang', 'answer_pp_toxicity'])[['comment_id']].nunique().reset_index()
toxicity = toxicity.pivot(index='lang', columns='answer_pp_toxicity', values='comment_id')
toxicity['Total comments'] = toxicity['No'] + toxicity['Yes/Maybe']
toxicity = toxicity.rename(columns={'No': 'Nr. of non-toxic', 'Yes/Maybe': 'Nr. of possibly toxic comments'})
toxicity

answer_pp_toxicity,Nr. of non-toxic,Nr. of possibly toxic comments,"{No,Yes/Maybe}",Total comments
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ar,293.0,213.0,,506.0
de,246.0,260.0,,506.0
en,319.0,201.0,,520.0
es,232.0,263.0,,495.0
nl,265.0,136.0,18.0,401.0
tr,233.0,217.0,,450.0


In [81]:
impl_topic = df.groupby(['lang', 'answer_pp_implTopic'])[['comment_id']].nunique().reset_index()
impl_topic = impl_topic.pivot(index='lang', columns='answer_pp_implTopic', values='comment_id')
impl_topic

answer_pp_implTopic,(a),(a.1),(b),(b.1),(c),(d),(e)
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ar,14.0,11.0,54.0,8.0,122.0,7.0,11.0
de,20.0,11.0,64.0,4.0,96.0,2.0,9.0
en,40.0,24.0,51.0,3.0,169.0,7.0,3.0
es,45.0,29.0,118.0,26.0,141.0,3.0,20.0
nl,16.0,23.0,24.0,3.0,149.0,3.0,3.0
tr,9.0,,40.0,13.0,78.0,5.0,3.0


In [72]:
impl_polarity = df.groupby(['lang', 'answer_pp_implPolarity'])[['comment_id']].nunique().reset_index()
impl_polarity = impl_polarity.pivot(index='lang', columns='answer_pp_implPolarity', values='comment_id')
impl_polarity

answer_pp_implPolarity,Negative,Neutral,Positive
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ar,157.0,13.0,10.0
de,160.0,6.0,8.0
en,219.0,83.0,23.0
es,275.0,21.0,8.0
nl,169.0,10.0,12.0
tr,115.0,6.0,


In [74]:
MAPPING = {
    '(a)': 'Circumstances', '(a.1)': 'Harm',
    '(b)': 'Qualities', '(b.1)': 'Dehumanization',
    '(c)': 'Choices',
    '(d)': '-NonSpecific',
    '(e)': '-UnclearOrNone',
}
ILLEGAL = ['PositiveHarm', 'NeutralHarm', 'PositiveDehumanization', 'NeutralDehumanization']

def convert(row):
    topic = row['answer_pp_implTopic']
    polarity = row['answer_pp_implPolarity']
    if pd.isna(topic):
        return pd.NA
    name = str(MAPPING[topic]) if topic.endswith('.1)') else str(polarity) + str(MAPPING[topic])
    if name in ILLEGAL:
        return 'ILLEGAL'
    return name

implType = pd.concat([df[['lang', 'comment_id']], df.apply(convert, axis=1).rename('Implication Type')], axis=1)
implType = implType.groupby(['lang', 'Implication Type'])['comment_id'].nunique().reset_index()
implType = implType.pivot(index='lang', columns='Implication Type', values='comment_id')
implType

Implication Type,Dehumanization,Harm,Negative-NonSpecific,Negative-UnclearOrNone,NegativeChoices,NegativeCircumstances,NegativeQualities,Neutral-NonSpecific,Neutral-UnclearOrNone,NeutralChoices,...,NeutralQualities,None-UnclearOrNone,NoneChoices,NoneCircumstances,NoneQualities,Positive-NonSpecific,Positive-UnclearOrNone,PositiveChoices,PositiveCircumstances,PositiveQualities
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ar,8.0,11.0,2.0,8.0,116.0,12.0,51.0,1.0,3.0,6.0,...,1.0,,,,,4.0,,4.0,1.0,2.0
de,4.0,11.0,2.0,5.0,90.0,17.0,62.0,,,2.0,...,2.0,3.0,,,,,1.0,5.0,2.0,
en,3.0,24.0,2.0,,137.0,26.0,30.0,5.0,2.0,37.0,...,20.0,1.0,3.0,,,,,9.0,5.0,4.0
es,26.0,29.0,2.0,12.0,135.0,41.0,110.0,,8.0,5.0,...,5.0,,,1.0,2.0,1.0,,5.0,1.0,2.0
nl,3.0,23.0,,2.0,139.0,11.0,24.0,3.0,1.0,5.0,...,,,2.0,,,,,7.0,5.0,
tr,13.0,,5.0,3.0,76.0,9.0,37.0,,,2.0,...,4.0,,,,,,,,,


In [84]:
print([type(x) for x in [st_counts, st_length_df, toxicity, impl_topic, impl_polarity, implType]])

all_stats = pd.concat([st_counts, st_length_df, toxicity, impl_topic, impl_polarity, implType], axis=1)
all_stats = all_stats.transpose()
all_stats.to_csv('data_stats.csv')
all_stats

[<class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>]


lang,ar,de,en,es,nl,tr
Nr. sub-threads,92.0,97.0,102.0,91.0,79.0,86.0
Avg. Length,5.5,5.216495,5.098039,5.43956,5.303797,5.232558
Min. length,4.0,4.0,4.0,4.0,4.0,4.0
Max. length,10.0,10.0,10.0,10.0,9.0,10.0
Nr. of non-toxic,293.0,246.0,319.0,232.0,265.0,233.0
Nr. of possibly toxic comments,213.0,260.0,201.0,263.0,136.0,217.0
"{No,Yes/Maybe}",,,,,18.0,
Total comments,506.0,506.0,520.0,495.0,401.0,450.0
(a),14.0,20.0,40.0,45.0,16.0,9.0
(a.1),11.0,11.0,24.0,29.0,23.0,
