In [5]:
import pandas as pd
import json 
from preprocess import *
import os
import re

import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings('ignore')

In [7]:
''' 
`count` = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF).

`hate_speech` = number of CF users who judged the tweet to be hate speech.

`offensive_language` = number of CF users who judged the tweet to be offensive.

`neither` = number of CF users who judged the tweet to be neither offensive nor non-offensive.

`class` = class label for majority of CF users.
  0 - hate speech
  1 - offensive  language
  2 - neither

'''

df = pd.read_csv('labeled_data.csv', index_col=False).drop('Unnamed: 0', axis=1)

# read the json 
folder_path = 'C:/Users/sabin/Downloads/freelancing/permutable.ai/01_handover/00_projects/08_marketing/hackathon/Dataset Challenge #4/ResponsibleNLP-main/ResponsibleNLP-main/holistic_bias/dataset/v1.1'
fairness = {}
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            data_dict = json.load(file)
            dict_name = os.path.splitext(filename)[0]
            fairness[dict_name] = data_dict
del fairness["sentence_templates"]
bias_tags = list(set(list(fairness['descriptors'].keys())+list(fairness['standalone_noun_phrases'].keys())+list(fairness['nouns'].keys())))
bias_df = pd.DataFrame(data=None, columns=bias_tags)

In [36]:

pattern = re.compile(r'\b(?:a|an)\b', flags=re.IGNORECASE)

nationality = fairness['descriptors']['nationality']
nationality = [entry['descriptor'] for descriptors_list in nationality.values() for entry in descriptors_list if 'descriptor' in entry]

race_ethnicity = fairness['descriptors']['race_ethnicity']
race_ethnicity_snp = fairness['standalone_noun_phrases']['race_ethnicity']
race_ethnicity_1 = [entry['descriptor'] for descriptors_list in race_ethnicity.values() for entry in descriptors_list if 'descriptor' in entry]
race_ethnicity_2 = [re.sub(pattern, '', value).strip() for entry in race_ethnicity_snp for value in entry.values() if value != 'reviewed']
race_ethnicity = race_ethnicity_1 + race_ethnicity_2

religion = fairness['descriptors']['religion']
religion_snp = fairness['standalone_noun_phrases']['religion']
religion = ['Ashkenazi Jewish', "Bahá'í", 'Buddhist','Cheondoist', 'Confucianist', 'Conservative Jewish', 'Druze',
        'Hasidic', 'Hindu', 'Jain', 'Jewish', 'Muslim', 'Orthodox Jewish', 'Rasta', 'Rastafari', 'Rastafarian', 'Reform Jewish',
            'Sephardic Jewish', 'Shia', 'Shintoist', 'Sikh', 'Sunni', 'Taoist', 'Zoroastrian', 'Jewish American Princess',
                'Jewish American Princesses', 'JAP', 'JAPs']

hair = ['dreadlocked', 'curly-haired', 'frizzy-haired', 'coily-haired', 'afro', 'afros', 'jewfro', 'jewfros']
skin_color = ['brown-skinned', 'dark-skinned', 'olive-skinned', 'yellow']
immigration = ['asylum seeker', 'asylum seekers', 'refugee', 'refugees', 'immigrant', 'immigrants', 'DACA', 'Dreamer', 'Dreamers']

racial_indicators = nationality + race_ethnicity + religion + hair + skin_color + immigration
racial_indicators = [indicator.lower() for indicator in racial_indicators]

In [37]:
racial_indicators

['chinese',
 'chinese-american',
 'american-born chinese',
 'cuban',
 'cuban-american',
 'dominican',
 'dominican-american',
 'salvadoran',
 'salvadoran-american',
 'guatemalan',
 'guatemalan-american',
 'indian',
 'indian-american',
 'mexican',
 'mexican-american',
 'chicana',
 'chicano',
 'filipina',
 'filipina-american',
 'filipino',
 'filipino-american',
 'korean',
 'korean-american',
 'vietnamese',
 'vietnamese-american',
 'asian',
 'asian-american',
 'desi',
 'east asian',
 'oriental',
 'south asian',
 'southeast asian',
 'african',
 'african-american',
 'black',
 'hispanic',
 'latinx',
 'latine',
 'latina',
 'latino',
 'latin american',
 'pacific islander',
 'aapi',
 'bipoc',
 'person of color',
 'people of color',
 'man of color',
 'men of color',
 'woman of color',
 'women of color',
 'ashkenazi jewish',
 "bahá'í",
 'buddhist',
 'cheondoist',
 'confucianist',
 'conservative jewish',
 'druze',
 'hasidic',
 'hindu',
 'jain',
 'jewish',
 'muslim',
 'orthodox jewish',
 'rasta',
 '

In [34]:
df.to_csv('labeled_data_prep.csv')

In [12]:
import nltk
nltk.download('stopwords')
df['clean_tweet'] = df['tweet'].apply(lambda x: stop(clean_text(x).replace('rt', '')))
df = df.rename(columns={'class': 'label'})
# df = pd.concat([df, bias_df], axis=1).fillna(0)

racial_indicators = racial_indicators + hate_speech_keywords

def protected_attributes(text):
    tweet_racial_indicators = [indicator for indicator in racial_indicators if indicator in text]

    if tweet_racial_indicators:
        return 1
    else:
        return 0

df['protected_attribute'] = df.clean_tweet.apply(lambda tweet: protected_attributes(tweet))

df.sample(n=5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sabin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Unnamed: 0,count,hate_speech,offensive_language,neither,label,tweet,clean_tweet,protected_attribute
1973,3,0,0,3,2,&#9733; BEST ASIAN MASSAGE ON THE park slope --TOP RATED SPA &#9733; 718-622-0221 - 24 http://t.co/Br7el4ZXrw,best asian massage park slope top rated spa,1
17144,3,1,2,0,1,RT @Runyacheckup: Niggas and bitches gotta quit wit da money pics... If I can count it accurately it ain't enough &#128514;&#128514;&#128514;,niggas bitches gotta quit wit da money pics count accurately enough,1
24037,6,0,6,0,1,"if a bitch don't like me, somethin' wrong wit da bitch",bitch like somethin wrong wit da bitch,0
17807,3,0,3,0,1,"RT @TooRacist: A black man asked me ""Hey white boy, do you like niggers?""\n\n""Well I wouldn't use that word personally"" I said, ""Like is a st&#8230;",black man asked hey white boy like niggers well use word personally said like st,1
9348,3,0,2,1,1,Funnie watching my nig interview on ESPN,funnie watching nig interview espn,0


In [13]:
df.protected_attribute.value_counts(normalize=True)

0    0.872574
1    0.127426
Name: protected_attribute, dtype: float64

In [14]:
df[df.protected_attribute==1].label.value_counts(normalize=True)*100

1    74.509183
0    13.964535
2    11.526282
Name: label, dtype: float64

In [15]:
df[df.protected_attribute==0].label.value_counts(normalize=True)*100

1    77.85896
2    17.56763
0     4.57341
Name: label, dtype: float64

In [27]:
## ROC-AUC diagnostic

def clf(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['clean_tweet'])
    X_test_tfidf = tfidf_vectorizer.transform(X_test['clean_tweet'])

    # Create the RandomForestClassifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the classifier
    rf_classifier.fit(X_train_tfidf, y_train)

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test_tfidf)
    y_pred_proba = rf_classifier.predict_proba(X_test_tfidf)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # Print classification report for detailed metrics
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    return X_train, X_test, X_train_tfidf, X_test_tfidf, y_train, y_test, rf_classifier, y_pred, y_pred_proba, accuracy

X = df[['clean_tweet', 'protected_attribute']]
y = df.label

X_train, X_test, X_train_tfidf, X_test_tfidf, y_train, y_test, rf_classifier, y_pred, y_pred_proba, accuracy = clf(X, y)


Accuracy: 0.8915
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.12      0.19       290
           1       0.91      0.96      0.94      3832
           2       0.85      0.82      0.84       835

    accuracy                           0.89      4957
   macro avg       0.73      0.64      0.65      4957
weighted avg       0.87      0.89      0.88      4957



In [28]:
X_train, X_test, X_train_tfidf, X_test_tfidf, y_train, y_test, rf_classifier, y_pred, y_pred_proba, accuracy = clf(X, y)


Accuracy: 0.8915
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.12      0.19       290
           1       0.91      0.96      0.94      3832
           2       0.85      0.82      0.84       835

    accuracy                           0.89      4957
   macro avg       0.73      0.64      0.65      4957
weighted avg       0.87      0.89      0.88      4957



In [29]:
labels = df.label.sort_values().unique()
print(roc_auc_score(y_test, y_pred_proba, multi_class='ovo', labels=labels))

overall_auc = accuracy

0.8980600595116727


In [30]:
X_privileged = df[df['protected_attribute']==0][['clean_tweet', 'protected_attribute']]
y_privileged = df[df['protected_attribute']==0].label

X_train, X_test, X_train_tfidf, X_test_tfidf, y_train, y_test, rf_classifier, y_pred, y_pred_proba, accuracy = clf(X_privileged, y_privileged)

print(roc_auc_score(y_test, y_pred_proba, multi_class='ovo', labels=labels))

privileged_group_AUC = accuracy

Accuracy: 0.9064
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.20      0.30       198
           1       0.92      0.97      0.94      3351
           2       0.88      0.81      0.84       776

    accuracy                           0.91      4325
   macro avg       0.79      0.66      0.69      4325
weighted avg       0.90      0.91      0.90      4325

0.9211088431536744


In [31]:
X_unprivileged = df[df['protected_attribute']==1][['clean_tweet', 'protected_attribute']]
y_unprivileged = df[df['protected_attribute']==1].label

X_train, X_test, X_train_tfidf, X_test_tfidf, y_train, y_test, rf_classifier, y_pred, y_pred_proba, accuracy = clf(X_unprivileged, y_unprivileged)

print(roc_auc_score(y_test, y_pred_proba, multi_class='ovo', labels=labels))

unprivileged_group_AUC = accuracy

Accuracy: 0.8481
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.10      0.17        82
           1       0.84      0.99      0.91       471
           2       0.95      0.78      0.86        79

    accuracy                           0.85       632
   macro avg       0.84      0.62      0.65       632
weighted avg       0.84      0.85      0.81       632

0.8910552117494084


In [None]:
def calculate_auc(y_true, y_pred, class_labels, multi_class=True, subgroup_mask=None, background_mask=None):
    """
    Calculate AUC for different subsets.

    Parameters:
    - y_true: True labels (binary).
    - y_pred: Predicted probabilities or scores.
    - subgroup_mask: Mask for the subgroup examples.
    - background_mask: Mask for the background examples.

    Returns:
    - Overall AUC and Bias AUCs (Subgroup AUC, BPSN AUC, BNSP AUC).
    """

    # Overall AUC
    if multi_class == False:
        overall_auc = roc_auc_score(y_true, y_pred)
    else:
        overall_auc = roc_auc_score(y_true, y_pred, multi_class='ovo', labels=class_labels)

    # Subgroup AUC
    if subgroup_mask is not None:
        if multi_class == False:
            subgroup_auc = roc_auc_score(y_true[subgroup_mask], y_pred[subgroup_mask])
        else:
            subgroup_auc = roc_auc_score(y_true[subgroup_mask], y_pred[subgroup_mask], multi_class='ovo', labels=class_labels)
    else:
        subgroup_auc = np.nan

    # BPSN AUC
    if background_mask is not None and subgroup_mask is not None:
        if multi_class == False:
            bpsn_mask = np.logical_and(background_mask, ~subgroup_mask)
            bpsn_auc = roc_auc_score(y_true[bpsn_mask], y_pred[bpsn_mask])
        else:
            bpsn_mask = np.logical_and(background_mask, ~subgroup_mask)
            bpsn_auc = roc_auc_score(y_true[bpsn_mask], y_pred[bpsn_mask], multi_class='ovo', labels=class_labels)
    else:
        bpsn_auc = np.nan

    # BNSP
    if background_mask is not None and subgroup_mask is not None:
        if multi_class == False:
            bnsp_mask = np.logical_and(~background_mask, subgroup_mask)
            bnsp_auc = roc_auc_score(y_true[bnsp_mask], y_pred[bnsp_mask])
        else:
            bnsp_mask = np.logical_and(~background_mask, subgroup_mask)
            bnsp_auc = roc_auc_score(y_true[bnsp_mask], y_pred[bnsp_mask], multi_class='ovo', labels=class_labels)
    else:
        bnsp_auc = np.nan

    return overall_auc, subgroup_auc, bpsn_auc, bnsp_auc

# Example usage:
# Specify subgroup and background masks
subgroup_mask = X_test['protected_attribute'] == 1
background_group_mask = X_test['protected_attribute'] == 0

# Calculate AUCs
overall_auc, subgroup_auc, bpsn_auc, bnsp_auc = calculate_auc(y_test, y_pred_proba, labels, multi_class=True, subgroup_mask=subgroup_mask, background_mask=background_group_mask)

# Print the results
print("Overall AUC:", overall_auc)
print("Subgroup AUC:", subgroup_auc)
''' 
Subgroup AUC: Restrict the test set to examples that mention the identity. 
A low value means that the model does poorly at distinguishing abusive and non-abusive examples that mention this identity.

'''
print("BPSN AUC:", bpsn_auc)
''' 
BPSN (Background Positive, Subgroup Negative) AUC: 
Restrict the test set to non-abusive examples that mention the identity and abusive examples that do not. 
A low value suggests that the models scores skew higher than they should for examples mentioning this identity.
''' 
print("BNSP AUC:", bnsp_auc)
''' 
BNSP (Background Negative, Subgroup Positive) AUC: 
Restrict the test set to abusive examples that mention the identity and non-abusive examples that do not. 
A low value suggests that the models scores skew lower than they should for examples mentioning this identity.
''' 

The AUC values you provided indicate the performance of your model on different evaluation subsets:

1. **Overall AUC (0.8980):**
   - This represents the ROC-AUC for the full evaluation set, irrespective of any subgroup or background considerations. A higher overall AUC generally indicates better discrimination between classes.

<br>
  
2. **Subgroup AUC (0.8812):**
   - This specifically measures the model's ability to distinguish positive and negative instances within the subgroup. In your case, it might be the AUC for the protected group. A higher value here suggests better performance in discriminating between classes within the subgroup.

<br>
  
3. **BPSN AUC (0.8785):**
   - This metric evaluates the model's performance on background samples that are predicted as positive but do not belong to the subgroup. Essentially, it measures how well the model distinguishes between true positives from the subgroup and false positives from the background. A higher BPSN AUC indicates better performance in avoiding false positives on the background group.
  
<br>

4. **BNSP AUC (0.8812):**
   - It seems there might be a typo or repetition in the AUC values you provided. Typically, you would have a metric like BNSP (Background Negative, Subgroup Positive) AUC. However, in the values you provided, BNSP AUC has the same value as Subgroup AUC. Please verify whether this is correct.

In [None]:
# from bias_functions.bias_detection import *

# overall_auc, subgroup_auc, bpsn_auc, bnsp_auc, privileged_group_acc, subgroup_accuracy, acc_delta = detect_bias_subgroup(df, 'clean_tweet', 'protected_attribute', multi_class=True)