In [1]:
import os

# Move the working directory one level up from the notebook's folder —
# presumably the repository root, so that the local modules imported below
# and the relative ./data paths resolve (TODO confirm the notebook location).
# The sentinel makes the cell idempotent: without it, every re-run of this
# cell in the same kernel would climb one directory higher.
if 'PROJECT_ROOT' not in globals():
    os.chdir('..')
    PROJECT_ROOT = os.getcwd()
print(os.getcwd())
import pandas as pd
import hate_datasets as dc
import kg_adaptation as kg_adapt
import identity_group_identification as model_training
from baselines.target_classification.hate_target import keys

# Data imported as in pre-training and training.
# `datasets` is filled by the cells below; `d_names` holds one label per
# position in `datasets`, in the order the corpora are appended.
datasets = []
d_names = [
    'jigsaw',
    'jigsaw_train',
    'mhc',
    'ghc',
    'xspeech',
    'hX',
]

/Users/prl222/OneDrive - The Open University/Projects/hate-speech-identities


2023-03-24 10:14:30.492568: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


## Import data

In [2]:
# KG Adaptation

# ... jigsaw
# Identity groups passed to the adaptation sampler.
identities = ['gender', 'sexual_orientation']
d, text_col, id_col, g_labels = dc.import_dataset('jigsaw')
d_subset, d_other = kg_adapt.adaptation_subset(d, g_labels, 'jigsaw', 0.5, identities)
# Register two corpora in one step, in the order expected by d_names:
#   'jigsaw'       -> d_subset and d_other stacked back together
#   'jigsaw_train' -> the jigsaw sample on its own
#                     ('./data/jigsaw_0.5_gendersexualorientation.csv')
datasets.extend([
    pd.concat(objs=[d_subset, d_other]),
    d_subset,
])

jigsaw imported successfully from data folder: 448000 annotations samples.
Sampling distribution with thr=0.5 for gendersexualorientation
  260337/448000 samples with no identity annotations under 0.5
  min gender or sexual_orientation sample: 12713
  25426 unique positive samples 
2*n (12713) = 25426 - 0 duplicates
  -- gender_0.5: 16850
  -- sexual_orientation_0.5: 12713
  25426 unique negative samples:
  -- religion_0.5: 7862
  -- race_0.5: 7006
  -- disability_0.5: 4741
  -- none_0.5: 6895
 50852 unique train samples: 2*n (25426) = 50852 - 0 duplicates:
  -- gendersexualorientation: 
1    25426
0    25426
Name: gendersexualorientation, dtype: int64
  Pre-training corpus exported to ./data: jigsaw_0.5_gendersexualorientation


In [3]:
# Hybrid Model

# Measuring hate corpus: same preparation as in model training.
data = model_training.load_mhs_dataset(save=False)
id_col = 'comment_id'
text_col = 'predict_text'
threshold = 0.5
# ... sub-groups in hate_target.keys
model_outputs = sorted(keys.target_groups)

# Distinct (comment id, text) pairs, ordered by comment id.
comments = data[[id_col, text_col]].drop_duplicates().sort_values(id_col)

# Determine target identities: per-comment mean of every model-output column.
agreement = data[[id_col] + model_outputs].groupby(id_col).agg('mean')
agreement = agreement[model_outputs]
# A group counts as targeted when its mean reaches the threshold; the comment
# id is then restored as a column and the text is joined back on the columns
# shared with `comments`.
is_target = (agreement >= threshold).astype('int')
is_target = is_target.reset_index(level=0)
is_target = is_target.merge(right=comments, how='left')
datasets.append(is_target)

# .... validation data
for dname in ('gabhatecorpus', 'xtremespeech', 'hatexplain'):
    df, text_col0, id_col0, identities_dict = dc.import_dataset(dname)
    data, target_cols, text_col, id_col = dc.prepare_for_model_evaluation(
        df, text_col0, id_col0, identities_dict
    )
    if dname == 'xtremespeech':
        # Include only texts in English
        english_rows = data['Language'] == 'English'
        data = data.loc[english_rows]
        print(f'... {data.shape[0]} english texts.')

    datasets.append(data)

  importing from huggingface server


Found cached dataset parquet (/Users/prl222/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 10.20it/s]


  preprocessing text
  adding gso column from max(gender, sexuality) annotations.
  exported to: ./models/measuring-hate-speech.csv
gabhatecorpus imported successfully from data folder: 7813 annotations samples.
xtremespeech imported successfully from data folder: 5063 annotations samples.
... 2639 english texts.
hatexplain imported successfully from data folder: 12334 annotations samples.


## Get statistics

Number and percentage (%) of texts related to each identity group (Table 1)

In [4]:
# Compare task labels and disaggregated by group.

# Create table stats.
# Name fragments used to spot identity-related columns in each dataset.
identities = [
    'gender',
    'sexual',
    'sexuality',
    'gendersexualorientation',
    'religion',
    'race',
    'disability',
    'age',
    'origin',
    'politics',
    'economic',
    'other',
    'miscellaneous',
]

# Datasets (by their d_names label) to include in the statistics.
d_names_counts = [
    'jigsaw',
    'jigsaw_train',
    'mhc',
    'ghc',
    'xspeech',
    'hX',
]

In [5]:
# Print identity annotations to create columns list for identity counts.
# For every selected dataset: show its label, all of its columns, and the
# columns whose name contains one of the `identities` fragments. Matching is
# by substring, so a column is listed once per fragment it contains.
for i, d in enumerate(datasets):
    dname = d_names[i]
    if dname not in d_names_counts:
        continue
    print(dname.upper())
    print(d.columns)
    identity_cols = []
    for identity in identities:
        identity_cols.extend(col for col in d.columns if identity in col)
    print(identity_cols)
    print('\n\n')




JIGSAW
Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count', 'gender', 'gender_list',
       'sexual_orientation', 'sexual_orientation_list', 'religion',
       'religion_list', 'race', 'race_list', 'disability', 'disability_list',
       'gender_0.5', 'sexual_orientation_0.5

In [7]:
# Columns counted for each dataset, keyed by its d_names label.
# The two Jigsaw corpora use the same thresholded columns.
jigsaw_cols = ['gender_0.5', 'sexual_orientation_0.5', 'gendersexualorientation',
               'religion_0.5', 'race_0.5', 'disability_0.5']
# Columns common to every non-Jigsaw corpus.
target_core = ['target_gender', 'target_sexuality', 'target_gso',
               'target_religion', 'target_race']

identities_counts = {
    # list(...) gives each key its own list object.
    'jigsaw': list(jigsaw_cols),
    'jigsaw_train': list(jigsaw_cols),
    # Aggregation by model outputs
    'mhc': target_core + ['target_disability', 'target_age', 'target_origin'],
    'ghc': target_core + ['target_disability', 'target_origin', 'politics'],
    'xspeech': target_core + ['economic_status', 'other'],
    'hX': target_core + ['target_disability', 'target_origin', 'economic_status', 'miscellaneous'],
}

# Per-dataset summary: total number of rows plus, for each configured
# identity column, the count and percentage of rows whose value is >= 0.5.
# Each summary is printed as a one-column table and stored in d_counts.
d_counts = {}
for i, d in enumerate(datasets):
    dname = d_names[i]
    if dname not in d_names_counts:
        continue
    print('\n\n')
    print(dname.upper())
    # Total: N(%)
    N = len(d)
    di_counts = {'N': N}
    # Identities: N(%)
    for col in identities_counts[dname]:
        # Summing the boolean mask counts the rows at or above 0.5.
        ni = int((d[col] >= 0.5).sum())
        pi = round(ni*100/N, 2)
        di_counts[col] = f'{ni}({pi})'
    print(pd.DataFrame.from_dict(di_counts, orient='index'))
    d_counts[dname] = di_counts




JIGSAW
                                    0
N                              448000
gender_0.5               88790(19.82)
sexual_orientation_0.5    12713(2.84)
gendersexualorientation   25426(5.68)
religion_0.5             70149(15.66)
race_0.5                  42906(9.58)
disability_0.5             5559(1.24)



JIGSAW_TRAIN
                                    0
N                               50852
gender_0.5               16850(33.14)
sexual_orientation_0.5    12713(25.0)
gendersexualorientation   25426(50.0)
religion_0.5             12683(24.94)
race_0.5                  9674(19.02)
disability_0.5             4918(9.67)



MHC
                              0
N                         39565
target_gender      14825(37.47)
target_sexuality    7719(19.51)
target_gso         20014(50.59)
target_religion     6578(16.63)
target_race        12635(31.93)
target_disability    1120(2.83)
target_age           1051(2.66)
target_origin       7744(19.57)



GHC
                             0
N