# Select a hate speech sample 

This notebook selects posts in which the model finds relevant entities (it predicts a gender or sexuality target) but that the annotators did not consider to be related to gender and sexuality. The selected posts are exported to `data_selection/{dname}_fp.csv`.

We use the model trained in the `hate-speech-identities` repository to find model-annotator disagreements.

## 1. Setup

In [1]:
# Clone the hate-speech-identities repository; a later cell changes into this folder.
# Intended version: v1.1.1
# NOTE(review): the clone is not pinned to a tag or commit, so it fetches the default
# branch — confirm that it still corresponds to v1.1.1 before re-running.
!git clone https://github.com/preyero/hate-speech-identities

Cloning into 'hate-speech-identities'...
remote: Enumerating objects: 306, done.[K
remote: Counting objects: 100% (306/306), done.[K
remote: Compressing objects: 100% (219/219), done.[K
remote: Total 306 (delta 129), reused 249 (delta 74), pack-reused 0[K
Receiving objects: 100% (306/306), 22.60 MiB | 12.59 MiB/s, done.
Resolving deltas: 100% (129/129), done.


```
$ bash src/get_data.sh
```

In [None]:
# Download datasets to data_selection (instructions in https://github.com/preyero/hate-speech-identities)
# and kg to hate-speech-identities/models/adaptation
# NOTE(review): this cell runs before the `%cd hate-speech-identities` cell, so
# `src/get_data.sh` is resolved relative to the notebook's starting directory —
# confirm that the script lives there.
!bash src/get_data.sh

## 2. Imports

The notebook uses the conda environment from the `hate-speech-identities` repo:
``` 
$ conda create --name venv2 python=3.8.2
```

In [None]:
# Change into the cloned repository; the paths cell below takes PROJ_DIR from os.getcwd().
# NOTE(review): the path is relative, so running this cell a second time without restarting
# the kernel looks for hate-speech-identities inside hate-speech-identities.
%cd hate-speech-identities

In [8]:
import os
from IPython.display import HTML

import pandas as pd

import hate_datasets as dc
import identity_group_identification as model_f

In [None]:
# args
# Suffixes of the columns that later cells add next to the identity column.
fs = ['Pred', 'IRI', 'Label', 'Def']

# paths
# Exports go to a `data_selection` folder one level above the working directory;
# the folder is created when it does not exist yet.
out_path = os.path.join('..', 'data_selection')
os.makedirs(out_path, exist_ok=True)

# The model folder and its experiment file are located relative to the working directory.
PROJ_DIR = os.getcwd()
model_folder = os.path.join(
    os.path.join(PROJ_DIR, 'models'),
    'hybrid/gso_soft_H256_B8_D0.05/gsso_jigsaw_gendersexualorientation_0.5-stem-hierarchical-logits',
)
exp_file = os.path.join(model_folder, 'exp_file.pkl')

# Echo the resolved locations, one per line.
print(PROJ_DIR, model_folder, exp_file, sep='\n')

In [10]:
# model
# Load the pipeline stored in the model folder configured in the paths cell.
pipeline = model_f.model_load(model_folder=model_folder)

# Name of the identity column that the rest of the notebook works on.
identity_col = 'target_gso'

## 3. Training data

In [11]:
# Names of the id and text columns.
id_col = 'comment_id'
text_col = 'predict_text'

# A per-comment mean at or above this value counts as a positive label.
threshold = 0.5

# Identity columns, in sorted order.
model_outputs = sorted(model_f.IDENTITIES)

# Columns shown in previews; later cells append the prediction/interpretation columns.
cols = [id_col, text_col, identity_col]

In [12]:
# Load data (Sachdeva et al., 2022)
d0 = model_f.load_mhs_dataset(save=False)

# One row per distinct (id, text) pair, ordered by id.
comments = (
    d0[[id_col, text_col]]
    .drop_duplicates()
    .sort_values(id_col)
)

# Mean of every identity column over the rows that share a comment id.
agreement = d0[[id_col] + model_outputs].groupby(id_col).mean()
agreement = agreement[model_outputs]

# Binarise the means with the threshold (1 when mean >= threshold), move the id back
# into a column and attach the text of each comment.
d = (
    (agreement >= threshold)
    .astype('int')
    .reset_index(level=0)
    .merge(right=comments, how='left')
)
print(d.shape)
d.head(2)

  importing from huggingface server


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-c32713cabe528196
Found cached dataset parquet (/Users/prl222/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

  preprocessing text
  adding group columns: gso from max(gender, sexuality) annotations.
  adding subgroup columns: transgender(trans_men/women/unspecified), othergender(non_binary/other), homosexual(gay/lesbian)
(39565, 59)


Unnamed: 0,comment_id,target_age,target_age_children,target_age_middle_aged,target_age_other,target_age_seniors,target_age_teenagers,target_age_young_adults,target_disability,target_disability_cognitive,...,target_religion_muslim,target_religion_other,target_sexuality,target_sexuality_bisexual,target_sexuality_gay,target_sexuality_homosexual,target_sexuality_lesbian,target_sexuality_other,target_sexuality_straight,predict_text
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,white people are trash.
1,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,"you've caught the big gay, it's known to be airborne via dildos."


In [13]:
# Load cross-validation predictions (KFold without shuffling, so predictions are in the same order)
test_idx, test_pred = model_f.cv_load_predictions(exp_file)

# Quick look: the first ten test indices, then how many there are in total.
print(test_idx[:10], len(test_idx), sep='\n')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
39565


In [14]:
# Attach the cross-validation predictions right after the annotated identity column.
pred_col = f'{identity_col}_Pred'

# The predictions are attached by position, so the test indices must enumerate the rows
# of `d` in order; fail loudly instead of silently pairing predictions with the wrong text.
assert list(test_idx) == list(range(len(d))), (
    'cross-validation test indices do not match the row order of the MHS data'
)

identity_idx = d.columns.get_loc(identity_col)
print(f'index of {identity_col}: {identity_idx}')
print(d.shape)
if pred_col in d.columns:
    # Re-run of this cell: DataFrame.insert raises on an existing column, so overwrite it.
    d[pred_col] = test_pred
else:
    d.insert(loc=identity_idx+1, column=pred_col, value=test_pred)
# Keep the preview column list free of duplicates when the cell is re-run.
if pred_col not in cols:
    cols.append(pred_col)
print(d.shape)

index of target_gso: 26
(39565, 59)
(39565, 60)


In [15]:
# Get interpretations:
# Run the pipeline on the MHS data for the identity column; only the interpretation
# output (third return value) is used below.
_ , y_pred, interp = model_f.model_predict(pipeline, d, identity_col, text_col, id_col)
matches_label, _, _, definition, matches_IRI, _ = interp

# Interpretation values keyed by the column suffixes listed in `fs` (all but 'Pred').
interpretations = {'IRI': matches_IRI, 'Label': matches_label, 'Def': definition}

# Place the columns after `<identity>_Pred`, i.e. at offsets +2, +3, +4 from the identity column.
for offset, f in enumerate(fs[1:], start=2):
    column = f'{identity_col}_{f}'
    if column in d.columns:
        # Re-run of this cell: DataFrame.insert raises on an existing column, so overwrite it.
        d[column] = interpretations[f]
    else:
        d.insert(loc=identity_idx+offset, column=column, value=interpretations[f])
    # Keep the preview column list free of duplicates when the cell is re-run.
    if column not in cols:
        cols.append(column)
print(d.shape)

Starting predictions on: ['target_gso']
indexed 39565 documents
... feature extraction
... classification
   uni-output model
(39565, 63)


In [16]:
# Write the full MHS frame to the output folder, then preview the selected columns for
# the rows labelled 0 to 2.
d.to_csv(path_or_buf=os.path.join(out_path, 'mhs.csv'), index=False)
d.loc[:2, cols]

Unnamed: 0,comment_id,predict_text,target_gso,target_gso_Pred,target_gso_IRI,target_gso_Label,target_gso_Def
0,1,white people are trash.,0,0.177214,http://semanticscience.org/resource/SIO_000498;http://purl.obolibrary.org/obo/GSSO_008990;http://purl.obolibrary.org/obo/GSSO_008989,person;white person;white identity,A human being.
1,2,"you've caught the big gay, it's known to be airborne via dildos.",1,0.953016,http://purl.obolibrary.org/obo/GSSO_012929;http://purl.obolibrary.org/obo/GSSO_001591;http://purl.obolibrary.org/obo/GSSO_001592;http://purl.obolibrary.org/obo/GSSO_004904;http://purl.obolibrary.org/obo/GSSO_002317;http://purl.obolibrary.org/obo/GSSO_010081;http://purl.obolibrary.org/obo/GSSO_011275;http://purl.obolibrary.org/obo/GSSO_009851;http://purl.obolibrary.org/obo/GSSO_009855;http://purl.obolibrary.org/obo/GSSO_009852;http://purl.obolibrary.org/obo/GSSO_005107;http://purl.obolibrary.org/obo/GSSO_002537,.gay;gay;heterosexual;asexual and homoromantic person;gai;A-Gay;gay identity;HIV positive;HIV unknown;HIV negative;dildo;ve,"A top-level domain name. It was proposed in ICANN's New generic top-level domain (gTLD) Program, and became available to the general public in September 2020. Top Level Design is the domain name registry for the string."
2,3,"i'm not saying you said that, i'm just saying that if you find a slur offensive (in this case the word niga) you'd find every other slur offensive. for me it's the opposite, i find none of them offensive because unless the context is bad no one should give a shit. i can call my black friend ""niga"" in a friendly way or in an evil way. there's a difference",0,0.282783,http://purl.obolibrary.org/obo/NCIT_C72884;http://purl.obolibrary.org/obo/GSSO_003205;http://semanticscience.org/resource/SIO_000114;http://purl.obolibrary.org/obo/GSSO_000493;http://purl.obolibrary.org/obo/GSSO_003761;http://purl.obolibrary.org/obo/GSSO_003225;http://purl.obolibrary.org/obo/GSSO_000553;http://purl.obolibrary.org/obo/GSSO_003240;http://purl.obolibrary.org/obo/GSSO_011746;http://purl.obolibrary.org/obo/GSSO_002462;http://purl.obolibrary.org/obo/GSSO_003755;http://purl.obolibrary.org/obo/GSSO_005526;http://purl.obolibrary.org/obo/GSSO_008988,friend;pejorative word;word;finding;other identity;saying;difference;justice;myalgic encephalomyelitis;me;offensiveness;shit;Black identity,A person other than a family member or partner whose company one enjoys and towards whom one feels affection.


## 4. External data

In [20]:
# Processed frames keyed by dataset name, and the external datasets to process.
data, dnames = {}, ['xtremespeech' , 'gabhatecorpus' , 'hatexplain']

for dname in dnames:
    # The same CSV in the output folder is handed to the importer and written at the end.
    # NOTE(review): if the importer reads this file back on a later run, it would receive the
    # processed frame written below (with prediction columns) — confirm against import_dataset.
    dataset_csv = os.path.join(out_path, f'{dname}.csv')

    # Load data
    d0, text_col0, id_col0, identities_dict = dc.import_dataset(dname, o_path=dataset_csv)
    d_out, _, text_col, id_col = dc.prepare_for_model_evaluation(d0, text_col0, id_col0, identities_dict)
    identity_idx = d_out.columns.get_loc(identity_col)
    print(f'index of {identity_col}: {identity_idx}')
    print(d_out.shape)
    if dname == 'xtremespeech':
        # Include only texts in English
        d_out = d_out.loc[d_out['Language'] == 'English'].reset_index(drop=True).copy()
        print(f'... {d_out.shape[0]} english texts.')

    # ... feature extraction and classification
    _ , y_pred, interp = model_f.model_predict(pipeline, d_out, identity_col, text_col, id_col)
    matches_label, _, _, definition, matches_IRI, _ = interp

    # Predictions and interpretations, keyed by the column suffixes listed in `fs`.
    new_values = {'Pred': y_pred, 'IRI': matches_IRI, 'Label': matches_label, 'Def': definition}
    # Insert them directly after the identity column, in the order given by `fs`.
    for offset, f in enumerate(fs, start=1):
        d_out.insert(loc=identity_idx+offset, column=f'{identity_col}_{f}', value=new_values[f])
    print(d_out.shape)

    # Preview: id, text, annotated label and the newly added columns.
    cols = [id_col, text_col, identity_col] + [f'{identity_col}_{f}' for f in fs]
    display(d_out.loc[:2, cols])

    data[dname] = d_out
    d_out.to_csv(dataset_csv, index=False)

xtremespeech imported successfully from data folder: 5063 annotations samples.
index of target_gso: 29
(5063, 30)
... 2639 english texts.
Starting predictions on: ['target_gso']
indexed 2639 documents
... feature extraction
... classification
   uni-output model
(2639, 34)


Unnamed: 0,comment_id,predict_text,target_gso,target_gso_Pred,target_gso_IRI,target_gso_Label,target_gso_Def
0,29854.0,"Kikuyus have been always comfortably helped to get power, after that they dump and forget those who helped them. They tried it on Moi but failed, it worked for them between Kibaki and Raila, they have now succeeded in abandoning Ruto. Let Raila and Ruto join hands, vanquish the kikuyus",0,0.224384,http://purl.org/sig/ont/fma/fma9712;http://purl.obolibrary.org/obo/NCIT_C74299;http://purl.obolibrary.org/obo/GSSO_009742;http://purl.bioontology.org/ontology/MESH/D011209;http://isni.org/isni/0000000121633745;http://purl.obolibrary.org/obo/GSSO_002450,hand;work;marital abandoment;power;World Health Organization;singular they,"Distal free upper limb region, each instance of which consists of some carpus, metacarpus and set of digits."
1,9100.0,"The same police teargased us when we were going to Uhuru Park they even forced miguna out of this country did you condemn them or for you it was just okey remember they even killed baby pendo, you killers",0,0.140733,http://purl.obolibrary.org/obo/GSSO_003799;http://purl.obolibrary.org/obo/GSSO_006819;http://purl.obolibrary.org/obo/NCIT_C25464;http://purl.obolibrary.org/obo/GSSO_003240;http://purl.obolibrary.org/obo/GSSO_009442;http://purl.obolibrary.org/obo/GSSO_009441;http://purl.obolibrary.org/obo/GSSO_002450,police;killing;country;justice;outing;out;singular they,"A constituted body of persons empowered by a state to enforce the law, to protect the lives, liberty and possessions of citizens, and to prevent crime and civil disorder. Their lawful powers include arrest and the legitimized use of force. The term is most commonly associated with the police forces of a sovereign state that are authorized to exercise the police power of that state within a defined legal or territorial area of responsibility. Police forces are often defined as being separate from the military and other organizations involved in the defense of the state against foreign aggressors; however, gendarmerie are military units charged with civil policing."
2,29479.0,"This is the kind of violence you Luyha guys have been celebrating since 2013, I don't know what has changed now",0,0.291301,http://purl.obolibrary.org/obo/NCIT_C93214;http://purl.obolibrary.org/obo/GSSO_003690;http://purl.obolibrary.org/obo/NCIT_C25446;http://purl.obolibrary.org/obo/GSSO_006084,violence;kindness;change;headache,An act of aggression between individuals.


gabhatecorpus imported successfully from data folder: 7813 annotations samples.
index of target_gso: 23
(7813, 24)
Starting predictions on: ['target_gso']
indexed 7813 documents
... feature extraction
... classification
   uni-output model
(7813, 28)


Unnamed: 0,comment_id,predict_text,target_gso,target_gso_Pred,target_gso_IRI,target_gso_Label,target_gso_Def
0,3,most disgusting people: - third worlders - gays - illegals - anime/marvel fans - kid fuckers,1,0.940006,http://purl.obolibrary.org/obo/GSSO_012929;http://purl.obolibrary.org/obo/GSSO_001591;http://purl.obolibrary.org/obo/GSSO_001592;http://purl.obolibrary.org/obo/GSSO_004904;http://purl.obolibrary.org/obo/GSSO_002317;http://purl.obolibrary.org/obo/GSSO_010081;http://purl.obolibrary.org/obo/GSSO_011275;http://semanticscience.org/resource/SIO_000498;http://purl.bioontology.org/ontology/STY/T008;http://semanticscience.org/resource/SIO_000828,.gay;gay;heterosexual;asexual and homoromantic person;gai;A-Gay;gay identity;person;animal;disgust,"A top-level domain name. It was proposed in ICANN's New generic top-level domain (gTLD) Program, and became available to the general public in September 2020. Top Level Design is the domain name registry for the string."
1,8,"New band names, by me: - Gay Retards on Fire - Piss Cops - Catcher in the - Jesus Christ 2 - Robocock Battlecry - Thunderlunch at the Casino - Guitar Band - The Beenles - Muslim Cops on Fire",1,0.952179,http://purl.obolibrary.org/obo/GSSO_012929;http://purl.obolibrary.org/obo/GSSO_001591;http://purl.obolibrary.org/obo/GSSO_001592;http://purl.obolibrary.org/obo/GSSO_004904;http://purl.obolibrary.org/obo/GSSO_002317;http://purl.obolibrary.org/obo/GSSO_010081;http://purl.obolibrary.org/obo/GSSO_011275;http://purl.obolibrary.org/obo/GSSO_003801;http://semanticscience.org/resource/SIO_000183;http://purl.obolibrary.org/obo/GSSO_000343;http://semanticscience.org/resource/SIO_000116;http://purl.obolibrary.org/obo/GO_0060073;http://purl.obolibrary.org/obo/GSSO_002765;http://purl.obolibrary.org/obo/GSSO_011746;http://purl.obolibrary.org/obo/GSSO_002462;http://purl.obolibrary.org/obo/GSSO_007697,.gay;gay;heterosexual;asexual and homoromantic person;gai;A-Gay;gay identity;police officer;personal name;bottom;name;urination;North American English;myalgic encephalomyelitis;me;Muslim,"A top-level domain name. It was proposed in ICANN's New generic top-level domain (gTLD) Program, and became available to the general public in September 2020. Top Level Design is the domain name registry for the string."
2,9,'Is fucking a retarded girl rape? Asking for a friend.' - Barrack 'Sadam Hussein' Obama,0,0.826696,http://purl.obolibrary.org/obo/GSSO_000369;http://purl.obolibrary.org/obo/NCIT_C72884;http://purl.obolibrary.org/obo/GSSO_003041;http://purl.obolibrary.org/obo/GSSO_004152,woman;friend;rape;fuck,"A person whose identity is female, based on societal and cultural conceptualizations of being female, usually (but not always) reflected by specific anatomical variations, chromosome combinations, and/or sex hormones."


hatexplain imported successfully from data folder: 12334 annotations samples.
index of target_gso: 31
(12334, 32)
Starting predictions on: ['target_gso']
indexed 12334 documents
... feature extraction
... classification
   uni-output model
(12334, 36)


Unnamed: 0,comment_id,predict_text,target_gso,target_gso_Pred,target_gso_IRI,target_gso_Label,target_gso_Def
0,1179088797964763136_twitter,<user> i am bit confused coz chinese ppl can not access twitter thn how this ching chong using it i think he pakistani 🤔 🤔 🤔,0,0.367159,http://purl.obolibrary.org/obo/NCIT_C164662;http://purl.obolibrary.org/obo/GSSO_009824;http://purl.obolibrary.org/obo/NCIT_C95018;http://purl.obolibrary.org/obo/GSSO_002431;http://purl.bioontology.org/ontology/MESH/D013850;http://purl.obolibrary.org/obo/GSSO_002551;http://purl.obolibrary.org/obo/GSSO_002584;https://web.archive.org/web/20191009000629/https://twitter.com,user;accessibility;use;he;thinking;s/he;hes;Twitter,A person who makes use of something.
1,1179085312976445440_twitter,this bitch in whataburger eating a burger with no top bun and holding it palm up i hate white bitches,1,0.446496,http://purl.obolibrary.org/obo/GSSO_000346;http://semanticscience.org/resource/SIO_000411;http://purl.obolibrary.org/obo/GSSO_003203;http://purl.obolibrary.org/obo/GSSO_008989,top;hate;bitch;white identity,"Someone, typically a man, who is penetrating someone else during sexual intercourse or someone who has a preference for penetrating during such intercourse."
2,13851720_gab,laura loomer raped me while screaming at me in her disgusting kike language and said we must exterminate the goyim laura loomer loomergate,0,0.574225,http://purl.obolibrary.org/obo/GSSO_002452;http://purl.obolibrary.org/obo/GSSO_002437;http://purl.obolibrary.org/obo/GSSO_003041;http://semanticscience.org/resource/SIO_000828;http://purl.bioontology.org/ontology/MESH/D007802;http://purl.obolibrary.org/obo/GSSO_011746;http://purl.obolibrary.org/obo/GSSO_002906;http://purl.obolibrary.org/obo/GSSO_002462;http://purl.obolibrary.org/obo/GSSO_002722,hers;her;rape;disgust;language;myalgic encephalomyelitis;linguicism;me;her'n,[]


In [21]:
# Add the MHS frame so that all four datasets are analysed together.
data['mhs'] = d

# Count the false positives on each dataset (label below 0.5 for the identity column,
# prediction at or above 0.5) and export each subset to `out_path` as `{dname}_fp.csv`.
fp_counts = {}
# The loop variable is deliberately not called `d`, so the MHS frame stays bound to `d`.
for dname, df in data.items():
    labelled_negative = df[identity_col] < 0.5
    predicted_positive = df[f'{identity_col}_Pred'] >= 0.5
    fp_sample = df.loc[labelled_negative & predicted_positive]
    ni, N = fp_sample.shape[0], df.shape[0]
    # Guard the percentage against an empty dataset.
    fp_counts[dname] = {'N': ni, '%': round(ni*100/N, 2) if N else 0.0}
    fp_sample.to_csv(os.path.join(out_path, dname+'_fp.csv'), index=False)

# Summary table: one row per dataset with the false-positive count and percentage.
annotation_analysis = pd.DataFrame.from_dict(fp_counts, orient='index')
annotation_analysis

Unnamed: 0,N,%
xtremespeech,182,6.9
gabhatecorpus,478,6.12
hatexplain,1303,10.56
mhs,1789,4.52
