# Entity recognition with KG (GSSO)

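This notebook samples messages to obtain a diverse vocabulary. Every post whose most important entity occurs only once is kept; for each entity that occurs repeatedly, only the post with the highest prediction probability is kept. The sampled ids are exported to `data_sampling/sampling.json`.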

We obtain 350 unique posts from 4 hate speech databases. 

In [1]:
import os, json
import pandas as pd
import numpy as np
from typing import List

# Folder that receives the exported sample; created if it does not exist yet.
out_path = 'data_sampling'
os.makedirs(out_path, exist_ok=True)

# Names of the text column and the id column used throughout the notebook.
text_col = 'predict_text'
id_col = 'comment_id'

# FP sample with contradictory model predictions and aggregated annotations: 1789 comments
d = pd.read_csv(os.path.join('data_selection', 'mhs_fp.csv'))
print(d.shape)
d.head(2)

(1789, 63)


Unnamed: 0,comment_id,target_age,target_age_children,target_age_middle_aged,target_age_other,target_age_seniors,target_age_teenagers,target_age_young_adults,target_disability,target_disability_cognitive,...,target_religion_muslim,target_religion_other,target_sexuality,target_sexuality_bisexual,target_sexuality_gay,target_sexuality_homosexual,target_sexuality_lesbian,target_sexuality_other,target_sexuality_straight,predict_text
0,29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,i'm absolutely for equal rights for all people...
1,86,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,jesus christ why do niggers always have to sho...


## Training data

In [8]:
# Step by step on training data: preview the first definitions, then encode every
# distinct definition string as an integer category code stored in d['categories'].
print(d['target_gso_Def'].loc[:5])
d['categories'] = d['target_gso_Def'].astype('category').cat.codes
y = d['categories']
print(y.iloc[:6])
# Codes repeat because many rows share a definition
# (e.g., there are 695 examples with female, 432 with male).

0    Identification as male/masculine, female/femin...
1    A person whose identity is male, based on soci...
2    A person whose identity is male, based on soci...
3    A person whose identity is female, based on so...
4    A person whose identity is male, based on soci...
5    Typically, someone with a female gender identi...
Name: target_gso_Def, dtype: object
0     96
1     42
2     42
3     41
4     42
5    127
Name: categories, dtype: int16


In [9]:
# Number of distinct entities found, split into those occurring more than twice
# and those occurring at most twice.
y_counts = y.value_counts().to_dict()
n_frequent = sum(1 for n_rows in y_counts.values() if n_rows > 2)
print(len(y_counts))
print('entities more than twice:', n_frequent)
print('twice or less:', len(y_counts) - n_frequent)
print(y_counts)

130
entities more than twice: 36
twice or less: 94
{41: 695, 42: 432, 129: 118, 111: 70, 62: 34, 124: 34, 110: 31, 94: 26, 25: 26, 83: 25, 93: 22, 96: 22, 24: 15, 26: 12, 123: 11, 101: 10, 4: 10, 21: 10, 44: 8, 80: 7, 56: 5, 75: 5, 0: 4, 95: 4, 45: 4, 71: 4, 107: 4, 76: 4, 10: 3, 114: 3, 103: 3, 39: 3, 31: 3, 52: 3, 28: 3, 53: 3, 63: 2, 65: 2, 9: 2, 109: 2, 48: 2, 108: 2, 15: 2, 36: 2, 91: 2, 74: 2, 72: 2, 77: 2, 33: 2, 29: 2, 1: 2, 70: 2, 5: 2, 117: 2, 87: 2, 102: 1, 50: 1, 6: 1, 82: 1, 92: 1, 19: 1, 40: 1, 126: 1, 54: 1, 8: 1, 113: 1, 81: 1, 13: 1, 106: 1, 22: 1, 34: 1, 7: 1, 11: 1, 47: 1, 122: 1, 97: 1, 60: 1, 84: 1, 59: 1, 35: 1, 68: 1, 23: 1, 17: 1, 99: 1, 104: 1, 116: 1, 112: 1, 120: 1, 121: 1, 86: 1, 98: 1, 85: 1, 58: 1, 20: 1, 46: 1, 27: 1, 90: 1, 128: 1, 38: 1, 37: 1, 119: 1, 64: 1, 105: 1, 32: 1, 69: 1, 67: 1, 66: 1, 127: 1, 2: 1, 16: 1, 88: 1, 14: 1, 49: 1, 18: 1, 55: 1, 73: 1, 118: 1, 115: 1, 57: 1, 125: 1, 43: 1, 61: 1, 51: 1, 12: 1, 89: 1, 30: 1, 3: 1, 78: 1, 79: 1, 100: 

In [10]:
# Which categories (highest important entities) are the most frequent in the error sample?
# Key: "<category code>: <row count>"; value: leading label of the first row of that category.
y_entities = {
    f'{code}: {n_rows}': d.loc[y == code, 'target_gso_Label'].iloc[0].split(';')[0]
    for code, n_rows in y_counts.items()
}
print(y_entities)

{'41: 695': 'woman', '42: 432': 'man', '129: 118': 'hers', '111: 70': 'pussy', '62: 34': '.gay', '124: 34': 'sex', '110: 31': 'heterosexual', '94: 26': 'male gender identity', '25: 26': 'bitch', '83: 25': 'sucking', '93: 22': 'female gender identity', '96: 22': 'gender', '24: 15': 'marital partner', '26: 12': 'faggot', '123: 11': 'sexuality', '101: 10': 'semen', '4: 10': 'parent', '21: 10': 'mixed-orientation marriage', '44: 8': 'homophobia', '80: 7': 'body', '56: 5': 'face', '75: 5': 'LGBT', '0: 4': 'das', '95: 4': 'bisexual', '45: 4': 'dyadic relationship', '71: 4': 'homosexualism', '107: 4': 'gay', '76: 4': 'LGBTQ', '10: 3': 'marriage', '114: 3': 'interpersonal attraction', '103: 3': 'mouth', '39: 3': 'sibling', '31: 3': 'friend', '52: 3': 'partner', '28: 3': 'muxe', '53: 3': 'HIV positive', '63: 2': 'instant messaging', '65: 2': 'word', '9: 2': 'clothing', '109: 2': 'anal intercourse', '48: 2': 'community', '108: 2': 'sexual intercourse', '15: 2': 'family', '36: 2': 'single person'

In [11]:
# Leading label of the first row of every category, ordered like y_counts.
terms = [
    d.loc[y == code, 'target_gso_Label'].iloc[0].split(';')[0]
    for code in y_counts
]
print(terms)

['woman', 'man', 'hers', 'pussy', '.gay', 'sex', 'heterosexual', 'male gender identity', 'bitch', 'sucking', 'female gender identity', 'gender', 'marital partner', 'faggot', 'sexuality', 'semen', 'parent', 'mixed-orientation marriage', 'homophobia', 'body', 'face', 'LGBT', 'das', 'bisexual', 'dyadic relationship', 'homosexualism', 'gay', 'LGBTQ', 'marriage', 'interpersonal attraction', 'mouth', 'sibling', 'friend', 'partner', 'muxe', 'HIV positive', 'instant messaging', 'word', 'clothing', 'anal intercourse', 'community', 'sexual intercourse', 'family', 'single person', 'flatulence', 'rapist', 'personal name', 'life', 'immigrant', 'artist', 't!un', 'hole', 'movement', 'pornography', 'death', 'anus', 'affair', 'cousin', 'sport', 'hate', 'list', 'advocate', 'murder', 'anger', 'Italy', 'intellectual disability', 'money', 'file', 'day', 'number', 'sodomite', 'brown', 'fag', 'sentence', 'duke', 'identity', 'age', 'beating', 'symbol', 'prisoner', 'closeted', 'grandparent', 'nail', 'masculism

In [12]:
# Use this diverse set of examples to generate a labeling task: with text, definition, and list of entities.
ids = []

# ... all unique occurrences: every comment whose category occurs exactly once is kept
print('comment ids from all unique entities')
unique = [k for k, v in y_counts.items() if v < 2]
print(len(unique))
print(unique)
is_unique = y.isin(unique)  # computed once, reused for both halves of the split
ids += d.loc[is_unique, id_col].to_list()
print(len(ids))

# ... highest probability in repeated entities
print('sampling entities with more than one occurrence')
print(d.shape)
d_sampling = d[~is_unique]
print(d.shape[0] - len(ids))
print(d_sampling.shape)

# Derived from the data instead of the hard-coded 130-75, so it stays correct if the input changes
print('number of classes corresp to repeated entities', len(y_counts) - len(unique))
y_classes = d_sampling['target_gso_Def'].astype('category').cat.codes
print(len(y_classes.value_counts()))

# For each repeated category, the index label of the row with the highest prediction probability
max_indices = d_sampling.groupby('categories')['target_gso_Pred'].idxmax()
print(len(max_indices))
sampled_rows = d_sampling.index.isin(max_indices)
ids += d_sampling.loc[sampled_rows, id_col].tolist()
print('resulting sampling ids', len(ids))
d_sampling.loc[sampled_rows, [id_col, 'categories', 'target_gso_Label', 'target_gso_Pred']]

comment ids from all unique entities
75
[102, 50, 6, 82, 92, 19, 40, 126, 54, 8, 113, 81, 13, 106, 22, 34, 7, 11, 47, 122, 97, 60, 84, 59, 35, 68, 23, 17, 99, 104, 116, 112, 120, 121, 86, 98, 85, 58, 20, 46, 27, 90, 128, 38, 37, 119, 64, 105, 32, 69, 67, 66, 127, 2, 16, 88, 14, 49, 18, 55, 73, 118, 115, 57, 125, 43, 61, 51, 12, 89, 30, 3, 78, 79, 100]
75
sampling entities with more than one occurrence
(1789, 64)
1714
(1714, 64)
number of classes corresp to repeated entities 55
55
55
resulting sampling ids 130


Unnamed: 0,comment_id,categories,target_gso_Label,target_gso_Pred
22,888,70,hole;bitch;fuck,0.557834
154,6425,75,LGBT;.lgbt;r/lgbt;queer sexual orientation;per...,0.927318
323,12497,41,woman;.gay;gay;heterosexual;asexual and homoro...,0.991958
355,13570,44,homophobia;mixed-orientation marriage;quality;...,0.563604
398,15066,42,man;.gay;gay;heterosexual;asexual and homoroma...,0.973066
480,18174,107,gay;gay person;homosexuality;human homosexuali...,0.888273
488,18453,71,homosexualism;sex;sexual intercourse;premarita...,0.567386
494,18643,4,parent;semen;seminal emission;dwelling;licking...,0.604195
547,20522,103,mouth;foot;hers;her;she;narrative;terrorism;to...,0.644858
564,20952,31,friend;life;pussy;life cycle;artificial life;d...,0.662942


In [13]:
# We take the row with the maximum 'target_gso_Pred' from each category:
# e.g. category 5 (compare category with table above)
d.loc[d['categories'].eq(5), ['categories', 'target_gso_Label', 'target_gso_Pred']]

Unnamed: 0,categories,target_gso_Label,target_gso_Pred
568,5,movement;bitch;myalgic encephalomyelitis;me;fu...,0.503954
1777,5,movement;bitch;myalgic encephalomyelitis;me,0.519281


In [14]:
# Sampled comment id -> leading label of the first row in d carrying that id
example_entities = {
    sampled_id: d.loc[d[id_col] == sampled_id, 'target_gso_Label'].iloc[0].split(';')[0]
    for sampled_id in d_sampling.loc[d_sampling.index.isin(max_indices), id_col].tolist()
}
print(example_entities)

{888: 'hole', 6425: 'LGBT', 12497: 'woman', 13570: 'homophobia', 15066: 'man', 18174: 'gay', 18453: 'homosexualism', 18643: 'parent', 20522: 'mouth', 20952: 'friend', 21431: 'semen', 22115: 'life', 22145: 'marital partner', 22764: 'gender', 23572: '.gay', 23961: 'sexuality', 24928: 'anal intercourse', 24968: 'interpersonal attraction', 25616: 'body', 26413: 'heterosexual', 26720: 'face', 27022: 'pussy', 27941: 'rapist', 28485: 'sex', 30388: 'LGBTQ', 31220: 'male gender identity', 32077: 'muxe', 32925: 'clothing', 33276: 'marriage', 34548: 't!un', 39114: 'pornography', 39594: 'HIV positive', 39900: 'word', 40398: 'personal name', 40906: 'bitch', 41735: 'community', 41737: 'single person', 42143: 'hers', 42315: 'faggot', 42363: 'female gender identity', 42814: 'sexual intercourse', 43080: 'dyadic relationship', 43420: 'partner', 43491: 'instant messaging', 43987: 'immigrant', 44278: 'family', 44627: 'mixed-orientation marriage', 45777: 'sibling', 47124: 'flatulence', 47143: 'das', 47428:

In [15]:
# Export json: per dataset, the path to get texts and predictions plus the sampled ids
sampling = {
    'mhs': {
        'path': os.path.join('data_selection', 'mhs_fp.csv'),
        'ids': ids,
    },
}


### Same in OOD

In [16]:
# Same with xtremespeech, gabhate and hatexplain
def draw_diverse_sample(df,
                        col_categories='target_gso_Def',
                        col_prob='target_gso_Pred',
                        col_id='comment_id'):
    """Select one row id per distinct entity found in ``col_categories``.

    Every distinct value of ``col_categories`` is encoded as an integer entity
    code. Rows whose entity occurs exactly once are all kept; for every entity
    that occurs more than once, only the row with the highest ``col_prob`` is
    kept. Summary counts are printed along the way.

    The input frame is not modified: the entity codes live on an internal copy.
    The ``col_id`` default is the literal ``'comment_id'`` so that defining the
    function does not depend on a notebook-level variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named by the other three arguments.
    col_categories : str
        Column whose distinct values define the entities.
    col_prob : str
        Numeric column used to pick the best row of a repeated entity.
    col_id : str
        Column holding the ids to return.

    Returns
    -------
    list
        Ids of the single-occurrence entities (in row order), followed by the
        ids of the best row of each repeated entity (in row order).
    """
    # Encode by high relevant entities, on a copy so the caller's frame keeps its columns
    coded = df.assign(entity_id=df[col_categories].astype('category').cat.codes)
    y = coded['entity_id']
    y_counts = y.value_counts().to_dict()
    print('ocurring more than twice:', len([k for k, v in y_counts.items() if v > 2]))
    print('once or twice:', len([k for k, v in y_counts.items() if v <= 2]))

    # Include ids from unique occurrences
    unique = [k for k, v in y_counts.items() if v < 2]
    is_unique = y.isin(unique)
    sample_ids = coded.loc[is_unique, col_id].to_list()

    # Include ids from repeated entities with highest probability
    df_repeated = coded[~is_unique]
    max_indices = df_repeated.groupby('entity_id')[col_prob].idxmax()
    # Membership test (rather than .loc[max_indices]) keeps the ids in row order
    sample_ids += df_repeated.loc[df_repeated.index.isin(max_indices), col_id].tolist()

    print(f'({len(unique)}) unique + ({len(max_indices)}) repeated = ({len(y_counts)}) entities')
    print(len(sample_ids))
    return sample_ids


In [17]:
# Text and id column names
text_col, id_col = 'predict_text', 'comment_id'
dnames = ['mhs', 'gabhatecorpus', 'hatexplain', 'xtremespeech']

# dataset name -> {'path': csv that was read, 'ids': ids returned by draw_diverse_sample}
sampling = {}

for dname in dnames:
    print(dname.upper())
    path_i = os.path.join('data_selection', f'{dname}_fp.csv')
    d_i = pd.read_csv(path_i)
    print(d_i.shape, d_i.columns, sep='\n')

    ids_i = draw_diverse_sample(d_i)
    sampling[dname] = dict(path=path_i, ids=ids_i)
    


MHS
(1789, 63)
Index(['comment_id', 'target_age', 'target_age_children',
       'target_age_middle_aged', 'target_age_other', 'target_age_seniors',
       'target_age_teenagers', 'target_age_young_adults', 'target_disability',
       'target_disability_cognitive', 'target_disability_hearing_impaired',
       'target_disability_neurological', 'target_disability_other',
       'target_disability_physical', 'target_disability_unspecific',
       'target_disability_visually_impaired', 'target_gender',
       'target_gender_men', 'target_gender_non_binary', 'target_gender_other',
       'target_gender_othergender', 'target_gender_transgender',
       'target_gender_transgender_men',
       'target_gender_transgender_unspecified',
       'target_gender_transgender_women', 'target_gender_women', 'target_gso',
       'target_gso_Pred', 'target_gso_IRI', 'target_gso_Label',
       'target_gso_Def', 'target_origin', 'target_origin_immigrant',
       'target_origin_migrant_worker', 'target_origin

In [18]:
# Counts from each dataset, derived from the sampled ids instead of typed in by hand,
# so the total cannot drift from what is actually exported.
sample_sizes = {name: len(entry['ids']) for name, entry in sampling.items()}
print(sample_sizes)
sum(sample_sizes.values())

350

In [19]:
# Write the per-dataset paths and sampled ids to <out_path>/sampling.json
sampling_file = os.path.join(out_path, 'sampling.json')
with open(sampling_file, 'w') as handle:
    json.dump(sampling, handle)