In [42]:
import pandas as pd
import os
import json
from dfply import *
from itertools import product
# from aif360.metrics import DatasetMetric
# from aif360.datasets import StructuredDataset
pd.set_option('display.max_columns', 500)
from fairlearn.metrics import demographic_parity_difference
from preprocess import clean_text

# Convert dataset to AIF360 readable dataset

In [3]:
jigsaw = pd.read_csv('jigsaw_data_w_racial_descriptors.csv')

In [5]:
jigsaw['clean_comment'] = jigsaw['comment_text'].apply(clean_text)

In [4]:
jigsaw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
black,33710.0,0.3747849,0.4840745,0.0,0.0,0.0,1.0,1.0
white,33710.0,0.6529516,0.4760384,0.0,0.0,1.0,1.0,1.0
asian,33710.0,0.07389499,0.2616038,0.0,0.0,0.0,0.0,1.0
latino,33710.0,0.02927915,0.1685904,0.0,0.0,0.0,0.0,1.0
other_race_or_ethnicity,33710.0,0.000978938,0.03127313,0.0,0.0,0.0,0.0,1.0
threat,33710.0,0.01983105,0.06425148,0.0,0.0,0.0,0.0,0.9232409
insult,33710.0,0.1846851,0.2078,0.0,0.0,0.1111111,0.3,1.0
identity_attack,33710.0,0.2473804,0.2145312,0.0,0.0,0.2,0.4,1.0
sexual_explicit,33710.0,0.009536097,0.04962459,0.0,0.0,0.0,0.0,0.9271137
toxicity,33710.0,0.3051161,0.2511818,0.0,0.0,0.3,0.5,1.0


In [15]:
protected_attribute_names = ['black', 'white', 'latino', 'other_race_or_ethnicity', 'asian']
label_names = ['threat', 'insult', 'identity_attack', 'sexual_explicit', 'toxicity', 'obscene', 'severe_toxicity']

In [17]:
jigsaw.isna().sum()

black                           0
white                           0
asian                           0
latino                          0
other_race_or_ethnicity         0
threat                          0
insult                          0
identity_attack                 0
sexual_explicit                 0
toxicity                        0
obscene                         0
severe_toxicity                 0
identity_annotator_count        0
toxicity_annotator_count        0
funny                           0
wow                             0
sad                             0
likes                           0
disagree                        0
id                              0
publication_id                  0
parent_id                   13501
article_id                      0
comment_text                    0
split                           0
created_date                    0
rating                          0
dtype: int64

In [18]:
# Remove parent_id as it's not relevant
jigsaw.drop('parent_id', axis=1, inplace=True)

In [20]:
jigsaw_aif = StructuredDataset(jigsaw.select_dtypes(np.number), label_names=label_names, protected_attribute_names=protected_attribute_names)

In [27]:
# Define the races and their possible values
possible_values = [0, 1]

# Generate all combinations using itertools.product
combinations = list(product(possible_values, repeat=len(races)))

# Create a list of dictionaries representing each combination
result = [dict(zip(protected_attribute_names, combo)) for combo in combinations]

# Print the result
print(result)

[{'black': 0, 'white': 0, 'latino': 0, 'other_race_or_ethnicity': 0, 'asian': 0}, {'black': 0, 'white': 0, 'latino': 0, 'other_race_or_ethnicity': 0, 'asian': 1}, {'black': 0, 'white': 0, 'latino': 0, 'other_race_or_ethnicity': 1, 'asian': 0}, {'black': 0, 'white': 0, 'latino': 0, 'other_race_or_ethnicity': 1, 'asian': 1}, {'black': 0, 'white': 0, 'latino': 1, 'other_race_or_ethnicity': 0, 'asian': 0}, {'black': 0, 'white': 0, 'latino': 1, 'other_race_or_ethnicity': 0, 'asian': 1}, {'black': 0, 'white': 0, 'latino': 1, 'other_race_or_ethnicity': 1, 'asian': 0}, {'black': 0, 'white': 0, 'latino': 1, 'other_race_or_ethnicity': 1, 'asian': 1}, {'black': 0, 'white': 1, 'latino': 0, 'other_race_or_ethnicity': 0, 'asian': 0}, {'black': 0, 'white': 1, 'latino': 0, 'other_race_or_ethnicity': 0, 'asian': 1}, {'black': 0, 'white': 1, 'latino': 0, 'other_race_or_ethnicity': 1, 'asian': 0}, {'black': 0, 'white': 1, 'latino': 0, 'other_race_or_ethnicity': 1, 'asian': 1}, {'black': 0, 'white': 1, 'l

In [28]:
p = [{'black': 0, 'white': 1, 'latino': 0, 'other_race_or_ethnicity': 0, 'asian': 0}]

result.remove({'black': 0, 'white': 0, 'latino': 0, 'other_race_or_ethnicity': 0, 'asian': 0})
result.remove(p[0])

u = result.copy()

# Get word embeddings with Word2Vec

In [25]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize  # or any other tokenizer

# Example: assuming `documents` is a list of preprocessed documents
tokenized_documents = [word_tokenize(doc.lower()) for doc in jigsaw.comment_text.to_list()]

# Train Word2Vec model
model = Word2Vec(sentences=tokenized_documents, vector_size=100, window=5, min_count=1, workers=4)

In [27]:
def document_embedding(doc_tokens, model):
    # Filter tokens that have word vectors in the model
    valid_tokens = [token for token in doc_tokens if token in model.wv.key_to_index]

    if not valid_tokens:
        return None

    # Average the word vectors
    doc_embedding = model.wv[valid_tokens].mean(axis=0)
    return doc_embedding

In [28]:
document_embeddings = [document_embedding(tokens, model) for tokens in tokenized_documents]

In [29]:
# Example: Checking gender bias
gender_analogy = model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

In [30]:
gender_analogy

[('martin', 0.8641698956489563),
 ('luther', 0.8388773202896118),
 ('robert', 0.8063703775405884),
 ('jr.', 0.7955994606018066),
 ('joe', 0.7945797443389893),
 ('byrd', 0.7938011884689331),
 ('dr.', 0.7926991581916809),
 ('scott', 0.7764047980308533),
 ('e.', 0.7758944034576416),
 ('senator', 0.7704107761383057)]

# Embeddings bias detection

In [43]:
# read the json 
folder_path = 'C:/Users/sabin/Downloads/freelancing/permutable.ai/01_handover/00_projects/08_marketing/hackathon/Dataset Challenge #4/ResponsibleNLP-main/ResponsibleNLP-main/holistic_bias/dataset/v1.1'

fairness = {}

for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r') as file:
            data_dict = json.load(file)
            dict_name = os.path.splitext(filename)[0]
            fairness[dict_name] = data_dict

del fairness["sentence_templates"]

bias_tags = list(set(list(fairness['descriptors'].keys())+list(fairness['standalone_noun_phrases'].keys())+list(fairness['nouns'].keys())))
bias_df = pd.DataFrame(data=None, columns=bias_tags)

In [68]:
fairness['descriptors']

{'ability': {'auditory': [{'descriptor': 'Deaf', 'preference': 'reviewed'},
   {'descriptor': 'hard-of-hearing', 'preference': 'reviewed'},
   {'descriptor': 'hearing-impaired', 'preference': 'dispreferred'}],
  'intellectual_and_developmental': [{'descriptor': 'learning-disabled',
    'preference': 'dispreferred'},
   {'descriptor': 'retarded', 'preference': 'dispreferred'},
   {'descriptor': 'slow', 'preference': 'dispreferred'}],
  'mobility': [{'descriptor': 'crippled', 'preference': 'dispreferred'},
   {'descriptor': 'paraplegic', 'preference': 'dispreferred'},
   {'descriptor': 'paralyzed', 'preference': 'dispreferred'},
   {'descriptor': 'quadriplegic', 'preference': 'dispreferred'},
   {'descriptor': 'wheelchair-bound', 'preference': 'dispreferred'}],
  'neurological': [{'descriptor': 'autistic', 'preference': 'reviewed'},
   {'descriptor': 'dyslexic', 'preference': 'dispreferred'},
   {'descriptor': 'neuroatypical', 'preference': 'dispreferred'},
   {'descriptor': 'neurodiverg

In [57]:
pattern = re.compile(r'\b(?:a|an)\b', flags=re.IGNORECASE)

nationality = fairness['descriptors']['nationality']
nationality = [entry['descriptor'] for descriptors_list in nationality.values() for entry in descriptors_list if 'descriptor' in entry]

race_ethnicity = fairness['descriptors']['race_ethnicity']
race_ethnicity_snp = fairness['standalone_noun_phrases']['race_ethnicity']
race_ethnicity_1 = [entry['descriptor'] for descriptors_list in race_ethnicity.values() for entry in descriptors_list if 'descriptor' in entry]
race_ethnicity_2 = [re.sub(pattern, '', value).strip() for entry in race_ethnicity_snp for value in entry.values()]
race_ethnicity = race_ethnicity_1 + race_ethnicity_2

religion = fairness['descriptors']['religion']
religion_snp = fairness['standalone_noun_phrases']['religion']
religion = ['Ashkenazi Jewish', "Bahá'í", 'Buddhist','Cheondoist', 'Confucianist', 'Conservative Jewish', 'Druze',
        'Hasidic', 'Hindu', 'Jain', 'Jewish', 'Muslim', 'Orthodox Jewish', 'Rasta', 'Rastafari', 'Rastafarian', 'Reform Jewish',
            'Sephardic Jewish', 'Shia', 'Shintoist', 'Sikh', 'Sunni', 'Taoist', 'Zoroastrian', 'Jewish American Princess',
                'Jewish American Princesses', 'JAP', 'JAPs']

hair = ['dreadlocked', 'curly-haired', 'frizzy-haired', 'coily-haired', 'afro', 'afros', 'jewfro', 'jewfros']
skin_color = ['brown-skinned', 'dark-skinned', 'olive-skinned', 'yellow']
immigration = ['asylum seeker', 'asylum seekers', 'refugee', 'refugees', 'immigrant', 'immigrants', 'DACA', 'Dreamer', 'Dreamers']

racial_indicators = nationality + race_ethnicity + religion + hair + skin_color + immigration
racial_indicators = [indicator.lower() for indicator in racial_indicators]

In [72]:
race_ethnicity_2

['person of color',
 'people of color',
 'reviewed',
 'man of color',
 'men of color',
 'woman of color',
 'women of color']

In [70]:
race_ethnicity_1 = {}
for race_class, race_info in race_ethnicity.items():
    race_ethnicity_1[race_class] = []
    for detail in race_info:
        if isinstance(detail, str):
            race_ethnicity_1[race_class].append(detail)
        else:
            race_ethnicity_1[race_class].append(detail['descriptor'])

In [73]:
race_ethnicity_2.remove('reviewed')
race_ethnicity_1['combined'].extend(race_ethnicity_2)
race_ethnicity_1

{'alaska_native': ['Alaska Native'],
 'asian': ['Asian',
  'Asian-American',
  'Desi',
  'East Asian',
  'Oriental',
  'South Asian',
  'Southeast Asian'],
 'black': ['African', 'African-American', 'Black'],
 'latinx': ['Hispanic',
  'Latin',
  'Latinx',
  'Latine',
  'Latina',
  'Latino',
  'Latin American'],
 'indigenous': ['Aboriginal',
  'Indigenous',
  'Native American',
  'American Indian'],
 'native_hawaiian': ['Native Hawaiian'],
 'pacific_islander': ['Pacific Islander'],
 'white': ['European',
  'European-American',
  'Middle Eastern',
  'white',
  'Caucasian',
  'Arab'],
 'combined': ['AAPI',
  'BIPOC',
  'person of color',
  'people of color',
  'man of color',
  'men of color',
  'woman of color',
  'women of color']}

In [61]:
'reviewed' in race_ethnicity

True

In [62]:
'reviewed' in racial_indicators

True

In [74]:
# Example: Gender-related word sets
target_words_white =  ['European',
  'European-American',
  'Middle Eastern',
  'white',
  'Caucasian',
  'Arab']
target_words_black = ['African', 'African-American', 'Black']
target_words_combined = ['AAPI',
  'BIPOC',
  'person of color',
  'people of color',
  'man of color',
  'men of color',
  'woman of color',
  'women of color']

# Example: Career and family-related word sets
target_words_pleasant = ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal", "pleasure", "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle", "sunrise", "family", "happy", "laughter", "paradise", "vacation", "joy", "love", "peace", "wonderful", "pleasure", "friend", "laughter", "happy"]
target_words_unpleasant = ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison", "stink", "assault", "disaster", "hatred", "pollute", "tragedy", "bomb", "divorce", "jail", "poverty", "ugly", "cancer", "evil", "kill", "rotten", "vomit", "agony", "terrible", "horrible", "nasty", "evil", "war", "awful", "failure"]

In [86]:
def get_word_embedding(word, model):
    word = word.lower().strip()
    if word in model.wv:
        embedding = model.wv[word]
        return embedding
    else:
        print(f"No embedding found for '{word}' in the model.")

In [82]:
from sklearn.metrics import pairwise_distances

def weat_effect_size(X, Y, A, B, model):
    X = get_word_embedding(X, model)
    Y = get_word_embedding(Y, model)
    A = get_word_embedding(A, model)
    B = get_word_embedding(B, model)
    
    mean_XY = (pairwise_distances(X, Y, metric='cosine')).mean()
    mean_AB = (pairwise_distances(A, B, metric='cosine')).mean()
    mean_AX = (pairwise_distances(A, X, metric='cosine')).mean()
    mean_BX = (pairwise_distances(B, X, metric='cosine')).mean()

    effect_size = (mean_XY - mean_AB) / ((mean_AX + mean_BX) / 2)
    return effect_size


In [83]:
import numpy as np

def weat_permutation_test(X, Y, A, B, model, num_permutations=1000):
    target = X + Y
    attribute = A + B

    observed_effect_size = weat_effect_size(X, Y, A, B, model)

    combined = target + attribute
    num_samples = len(combined)

    # Run permutation test
    effect_sizes = np.zeros(num_permutations)
    for i in range(num_permutations):
        np.random.shuffle(combined)
        perm_X = combined[:len(X)]
        perm_Y = combined[len(X):len(X) + len(Y)]
        perm_A = combined[len(X) + len(Y):len(X) + len(Y) + len(A)]
        perm_B = combined[len(X) + len(Y) + len(A):]

        effect_sizes[i] = weat_effect_size(perm_X, perm_Y, perm_A, perm_B, model)

    p_value = (np.abs(effect_sizes) > np.abs(observed_effect_size)).mean()
    return observed_effect_size, p_value


In [90]:
weat_effect_size(target_words_white, target_words_black, target_words_pleasant, target_words_unpleasant, model)

No embedding found for 'middle eastern' in the model.
No embedding found for 'caress' in the model.


  X = np.array([get_word_embedding(word, model) for word in X])
  A = np.array([get_word_embedding(word, model) for word in A])


ValueError: setting an array element with a sequence.

In [92]:
target_words_white

['European',
 'European-American',
 'Middle Eastern',
 'white',
 'Caucasian',
 'Arab']

In [93]:
target_words_black

['African', 'African-American', 'Black']

In [105]:
weat_effect_size(target_words_white, target_words_black, target_words_pleasant, target_words_unpleasant, model)

No embedding found for 'middle eastern' in the model.


TypeError: 'NoneType' object is not iterable

In [103]:
from sklearn.metrics import pairwise_distances
import numpy as np

def get_word_embedding(word, model):
    word_lower = word.lower().strip()
    if word_lower in model.wv:
        embedding = model.wv[word_lower]
        return embedding.reshape(-1, 1)
    else:
        print(f"No embedding found for '{word_lower}' in the model.")
        return None

def weat_effect_size(X, Y, A, B, model):
    X = np.array([get_word_embedding(word, model) for word in X if get_word_embedding(word, model)])
    Y = np.array([get_word_embedding(word, model) for word in Y if get_word_embedding(word, model)])
    A = np.array([get_word_embedding(word, model) for word in A if get_word_embedding(word, model)])
    B = np.array([get_word_embedding(word, model) for word in B if get_word_embedding(word, model)])

    mean_XY = (pairwise_distances(X, Y, metric='cosine')).mean()
    mean_AB = (pairwise_distances(A, B, metric='cosine')).mean()
    mean_AX = (pairwise_distances(A, X, metric='cosine')).mean()
    mean_BX = (pairwise_distances(B, X, metric='cosine')).mean()

    effect_size = (mean_XY - mean_AB) / ((mean_AX + mean_BX) / 2)
    return effect_size

def weat_permutation_test(X, Y, A, B, model, num_permutations=1000):
    target = X + Y
    attribute = A + B

    observed_effect_size = weat_effect_size(X, Y, A, B, model)

    combined = target + attribute
    num_samples = len(combined)

    # Run permutation test
    effect_sizes = np.zeros(num_permutations)
    for i in range(num_permutations):
        np.random.shuffle(combined)
        perm_X = combined[:len(X)]
        perm_Y = combined[len(X):len(X) + len(Y)]
        perm_A = combined[len(X) + len(Y):len(X) + len(Y) + len(A)]
        perm_B = combined[len(X) + len(Y) + len(A):]

        effect_sizes[i] = weat_effect_size(perm_X, perm_Y, perm_A, perm_B, model)

    p_value = (np.abs(effect_sizes) > np.abs(observed_effect_size)).mean()
    return observed_effect_size, p_value

# Example: Run WEAT test for gender bias
observed_effect_size, p_value = weat_permutation_test(target_words_white, target_words_black, target_words_pleasant, target_words_unpleasant, model)

print(f'Observed Effect Size: {observed_effect_size}')
print(f'P-value: {p_value}')


No embedding found for 'middle eastern' in the model.


TypeError: 'NoneType' object is not iterable

# Check bias with `sklearn`

In [39]:
jigsaw.head()

Unnamed: 0,black,white,asian,latino,other_race_or_ethnicity,threat,insult,identity_attack,sexual_explicit,toxicity,obscene,severe_toxicity,identity_annotator_count,toxicity_annotator_count,funny,wow,sad,likes,disagree,id,publication_id,article_id,comment_text,split,created_date,rating
0,1,0,0,0,0,0.0,0.642857,0.614286,0.014286,0.757143,0.471429,0.142857,4,70,0,0,0,0,0,5892815,54,373428,Why is this black racist crap still on the G&M...,train,2017-09-03 23:20:08.226613+00,rejected
1,1,0,0,0,0,0.0,0.213115,0.639344,0.0,0.688525,0.04918,0.032787,4,61,0,0,0,1,0,416437,21,143025,even up here.......BLACKS!,train,2016-08-04 16:48:07.175252+00,rejected
2,1,1,0,0,0,0.25,0.236842,0.631579,0.013158,0.736842,0.0,0.078947,4,76,0,0,0,2,0,377096,21,140782,"""Let's get the black folks and the white folks...",train,2016-07-06 16:49:00.967646+00,approved
3,0,1,0,0,0,0.0,0.5,0.4,0.0,0.8,0.0,0.0,4,10,0,0,0,1,0,5697567,21,361553,Are you a Pilgrim?\nWhy arn't you growing your...,train,2017-08-01 18:39:14.212043+00,approved
4,0,1,0,0,0,0.014085,0.323944,0.323944,0.0,0.507042,0.0,0.0,4,71,0,1,0,7,3,7160163,102,367562,And there it is. Our president is a white supr...,test,2017-08-17 16:19:08.435159+00,approved


In [46]:
jigsaw.rating.value_counts()

approved    29057
rejected     4653
Name: rating, dtype: int64

In [41]:
jigsaw.split.value_counts()

train    30658
test      3052
Name: split, dtype: int64

In [43]:
X = jigsaw[protected_attribute_names]
y_train = jigsaw[jigsaw['split'] == 'train'] >> select('rating')
y_test = jigsaw[jigsaw['split'] == 'test']  >> select('rating')