# Data Preparation for Evaluation Datasets
Data sources used for training our models:
- Toxigen: https://github.com/microsoft/toxigen
- Hatemoji: https://github.com/HannahKirk/Hatemoji

Data sources used for additional evaluation at the end:
- HateCheck: https://hatecheck.ai/download/
- Jigsaw: https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/data?select=all_data.csv

In this notebook, we prepare the data downloaded from HuggingFace and Kaggle and export data subsets which we will feed into our models. The data preparation process includes data cleaning, extracting data subsets according to identity category, and splitting each identity subset into train, validation, and test sets.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

In [4]:
access_token = "REDACTED"

# Prepare HateCheckCorpus data
We'll be using data from the HateCheckCorpus to evaluate our models at the end for analysis.

In [5]:
from datasets import load_dataset
hatecheck_dataset = load_dataset("Paul/hatecheck")

Downloading readme:   0%|          | 0.00/4.71k [00:00<?, ?B/s]

Downloading and preparing dataset csv/Paul--hatecheck to /root/.cache/huggingface/datasets/Paul___csv/Paul--hatecheck-de74109e77bfb6c3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/652k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/Paul___csv/Paul--hatecheck-de74109e77bfb6c3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
hatecheck_dataset

DatasetDict({
    test: Dataset({
        features: ['functionality', 'case_id', 'test_case', 'label_gold', 'target_ident', 'direction', 'focus_words', 'focus_lemma', 'ref_case_id', 'ref_templ_id', 'templ_id', 'case_templ'],
        num_rows: 3728
    })
})

In [7]:
hatecheck_df = pd.DataFrame(hatecheck_dataset['test'])
hatecheck_df = hatecheck_df.rename(columns={"test_case": "comment_text", "label_gold": "toxicity_binary"})
hatecheck_df['toxicity_binary'] = (hatecheck_df['toxicity_binary'] == 'hateful').astype(int)
hatecheck_df

Unnamed: 0,functionality,case_id,comment_text,toxicity_binary,target_ident,direction,focus_words,focus_lemma,ref_case_id,ref_templ_id,templ_id,case_templ
0,derog_neg_emote_h,1,I hate women.,1,women,general,hate,hate,,,1,I hate [IDENTITY_P].
1,derog_neg_emote_h,2,I hate trans people.,1,trans people,general,hate,hate,,,1,I hate [IDENTITY_P].
2,derog_neg_emote_h,3,I hate gay people.,1,gay people,general,hate,hate,,,1,I hate [IDENTITY_P].
3,derog_neg_emote_h,4,I hate black people.,1,black people,general,hate,hate,,,1,I hate [IDENTITY_P].
4,derog_neg_emote_h,5,I hate disabled people.,1,disabled people,general,hate,hate,,,1,I hate [IDENTITY_P].
...,...,...,...,...,...,...,...,...,...,...,...,...
3723,spell_leet_h,3897,Gay people are all complete 4ssholes.,1,gay people,general,4ssholes,asshole,1232.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3724,spell_leet_h,3898,Black people are all complete 4ssholes.,1,black people,general,4ssholes,asshole,1233.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3725,spell_leet_h,3899,Disabled people are all complete 4ssholes.,1,disabled people,general,4ssholes,asshole,1234.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3726,spell_leet_h,3900,Muslims are all complete 4ssholes.,1,Muslims,general,4ssholes,asshole,1235.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.


In [8]:
hatecheck_disability_df = hatecheck_df[hatecheck_df['target_ident']=='disabled people']
hatecheck_gender_df = hatecheck_df[(hatecheck_df['target_ident']=='women') | (hatecheck_df['target_ident']=='trans people')]
hatecheck_sexual_orientation_df = hatecheck_df[hatecheck_df['target_ident']=='gay people']
hatecheck_race_df = hatecheck_df[hatecheck_df['target_ident']=='black people']
hatecheck_religion_df = hatecheck_df[hatecheck_df['target_ident']=='Muslims']
hatecheck_nationality_df = hatecheck_df[hatecheck_df['target_ident']=='immigrants']

print('len(hatecheck_disability_df): \t\t', len(hatecheck_disability_df))
print('len(hatecheck_gender_df): \t\t', len(hatecheck_gender_df))
print('len(hatecheck_sexual_orientation_df): \t', len(hatecheck_sexual_orientation_df))
print('len(hatecheck_race_df): \t\t', len(hatecheck_race_df))
print('len(hatecheck_religion_df): \t\t', len(hatecheck_religion_df))
print('len(hatecheck_nationality_df): \t\t', len(hatecheck_nationality_df))

len(hatecheck_disability_df): 		 484
len(hatecheck_gender_df): 		 972
len(hatecheck_sexual_orientation_df): 	 551
len(hatecheck_race_df): 		 482
len(hatecheck_religion_df): 		 484
len(hatecheck_nationality_df): 		 463


### Export HateCheck Identity Group DataFrames to csv

In [9]:
hatecheck_disability_df.to_csv('drive/MyDrive/data/disability-dataset-hatecheck.csv')
hatecheck_gender_df.to_csv('drive/MyDrive/data/gender-dataset-hatecheck.csv')
hatecheck_sexual_orientation_df.to_csv('drive/MyDrive/data/sexual_orientation-dataset-hatecheck.csv')
hatecheck_race_df.to_csv('drive/MyDrive/data/race-dataset-hatecheck.csv')
hatecheck_religion_df.to_csv('drive/MyDrive/data/religion-dataset-hatecheck.csv')
hatecheck_nationality_df.to_csv('drive/MyDrive/data/nationality-dataset-hatecheck.csv')

# Prepare Jigsaw data:
The kaggle competition corresponding to this dataset came with csv files for their own train and test subset. However, since the competition has ended, the `all_data.csv` file was released containing labels for both the train and test sets. Therefore, we'll be using the `all_data.csv` as our starting dataset.

In [10]:
jigsaw_all_data_df = pd.read_csv('drive/MyDrive/data/jigsaw_raw_all_data.csv')

## Clean the data

EDA revealed that there were some rows with a missing value for `comment_text`. What does these rows look like?

In [11]:
jigsaw_all_data_df[pd.isna(jigsaw_all_data_df["comment_text"])]

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
446630,392337,,train,2016-07-18 19:34:48.278774+00,13,392165.0,141670,approved,0,0,...,,,,,,,,,0,4


### Delete the rows with missing comments
Since we'll have no input text to feed in for these rows, it will be unusable and therefore we'll remove them from our dataset.

In [12]:
jigsaw_all_data_df_cleansed = jigsaw_all_data_df.copy().drop(index=jigsaw_all_data_df[pd.isna(jigsaw_all_data_df['comment_text'])].index)

We can see that we now have a few less lines in our dataset:

In [13]:
jigsaw_all_data_df.shape

(1999516, 46)

In [14]:
jigsaw_all_data_df_cleansed.shape

(1999515, 46)

### Drop the columns we won't be using

In [15]:
jigsaw_all_data_df_cleansed = jigsaw_all_data_df_cleansed.drop(columns=['id', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes'])
jigsaw_all_data_df_cleansed.head()

Unnamed: 0,comment_text,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,He got his money... now he lies in wait till a...,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,...,,,,,,,,,0,67
1,Mad dog will surely put the liberals in mental...,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,...,,,,,,,,,0,76
2,And Trump continues his lifelong cowardice by ...,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,...,,,,,,,,,0,63
3,"""while arresting a man for resisting arrest"".\...",0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,...,,,,,,,,,0,76
4,Tucker and Paul are both total bad ass mofo's.,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,...,,,,,,,,,0,80


In [16]:
pd.options.display.max_colwidth = 600

In [17]:
jigsaw_all_data_df_cleansed.iloc[[715484-1]]

Unnamed: 0,comment_text,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
715484,"So a Christian can pledge allegiance to their religion first, but if a Muslim does then it's automatically more suspect? I'm not suggesting you explicitly said that, but inciting Sharia law or beheadings at the mention of Islam invokes that kind of double standard. As if Islam itself is to blame, not a complex mix if politics, history, and culture that leads to the extremes we see.\n\nI'm trying to get at the core assumptions here, as to the kind of rhetoric I see as harmful and unfair in it's correlation. It's this general belief that peaceful devotion to Islam is fundamentally dangerous ...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004457,...,0.0,0.000557,0.0,0.0039,0.000557,0.0,0.0,0.0,1795,4


### Add `toxicity_binary` column
Here we set the binary toxicity label for non-disability identity groups. For these groups, comments with toxicity >= 0.5 are considered toxict and will have a value of 1. Comments with toxicity < 0.5 will have a value of 0.

In [18]:
jigsaw_all_data_df_cleansed['toxicity_binary'] = (jigsaw_all_data_df_cleansed['toxicity'] >= 0.5).astype(int)

In [19]:
jigsaw_all_data_df_cleansed[['toxicity','toxicity_binary']]

Unnamed: 0,toxicity,toxicity_binary
0,0.373134,0
1,0.605263,1
2,0.666667,1
3,0.815789,1
4,0.550000,1
...,...,...
1999511,0.400000,0
1999512,0.400000,0
1999513,0.400000,0
1999514,0.400000,0


Move the new `toxicity_binary` column towards the front of the dataframe:

In [20]:
orig_cols = jigsaw_all_data_df_cleansed.columns.tolist()
reordered_cols = orig_cols[:2] + orig_cols[-1:] + orig_cols[2:-1]
jigsaw_all_data_df_cleansed = jigsaw_all_data_df_cleansed[reordered_cols]
jigsaw_all_data_df_cleansed.head()

Unnamed: 0,comment_text,disagree,toxicity_binary,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,He got his money... now he lies in wait till after the election in 2 yrs.... dirty politicians need to be afraid of Tar and feathers again... but they aren't and so the people get screwed.,0,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,...,,,,,,,,,0,67
1,Mad dog will surely put the liberals in mental hospitals. Boorah,0,1,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,...,,,,,,,,,0,76
2,And Trump continues his lifelong cowardice by not making this announcement himself.\n\nWhat an awful human being .....,7,1,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,...,,,,,,,,,0,63
3,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",0,1,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,...,,,,,,,,,0,76
4,Tucker and Paul are both total bad ass mofo's.,0,1,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,...,,,,,,,,,0,80


## Prepare Disability Subset

#### Create disability subset

In [21]:
jigsaw_disability_df = jigsaw_all_data_df_cleansed[(jigsaw_all_data_df_cleansed["physical_disability"] > 0) | 
           (jigsaw_all_data_df_cleansed["intellectual_or_learning_disability"] > 0) | 
           (jigsaw_all_data_df_cleansed["psychiatric_or_mental_illness"] > 0) | 
           (jigsaw_all_data_df_cleansed["other_disability"] > 0)]

In [22]:
jigsaw_disability_df.shape

(18665, 36)

### Add `'disability_subtypes_total` column
This column will indicate how likely a comment is to mention disability.

In [23]:
jigsaw_disability_df['disability_subtypes_total'] = jigsaw_disability_df['physical_disability']+jigsaw_disability_df['intellectual_or_learning_disability']+jigsaw_disability_df['psychiatric_or_mental_illness']+jigsaw_disability_df['other_disability']
jigsaw_disability_df['disability_subtypes_total']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jigsaw_disability_df['disability_subtypes_total'] = jigsaw_disability_df['physical_disability']+jigsaw_disability_df['intellectual_or_learning_disability']+jigsaw_disability_df['psychiatric_or_mental_illness']+jigsaw_disability_df['other_disability']


7705       0.250000
8073       1.000000
8115       1.000000
8125       1.750000
8263       1.000000
             ...   
1999482    0.600000
1999507    0.700000
1999508    0.500000
1999514    0.003717
1999515    0.000640
Name: disability_subtypes_total, Length: 18665, dtype: float64

### Operationalize comments **mentioning disability** and toxic **ableist** comments

Need to operaztionalize the following:

1. What is considered *disability-related*?
1. What is considered *ableist* or *toxic towards people with disabilities*?

Comments where the disability score for each subtypes <= 0.1 mostly seems like they're not related to disability at all prob because there's not enough consensus on disability-relatedness scores. There are a few that have *slight mention* of disability, but most of them don't mention disability at all. In the interest of time, we'll drop rows where disability_subtype total <= 0.1.

In [24]:
test_filter_condition = jigsaw_disability_df['disability_subtypes_total']<=0.1
display(jigsaw_disability_df.loc[test_filter_condition,
                    ['comment_text','toxicity','insult', 'threat','physical_disability','intellectual_or_learning_disability','psychiatric_or_mental_illness','other_disability','disability_subtypes_total','identity_annotator_count']])

Unnamed: 0,comment_text,toxicity,insult,threat,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,disability_subtypes_total,identity_annotator_count
9884,"China is the worst possible ""global partner"" anyone could have, it is the biggest dysfunctional and theft-based society with ""entitlement"" attitude - and clearly the wealthy that immigrated here made Chinese-only ""organizations"". What a disgrace.",0.434211,0.381579,0.000000,0.000000,0.0,0.1,0.00000,0.100000,10
9902,"Trump is already selling his logo to Hindu contractors. In addition, he has to pay interest to the Russian mafia, who will accept access to projects as payment. Trump is selling his brand in Argentina in exchange for dollars. You Trump people are completely brain-dead. This will blow up in America's face very quickly. You and those like you are complete fools. Trump probably is immune. Let's see if the shameful idiotic Republican Congress has the balls to impeach this cheesy crotch-grabbing pervert lying bustard snake oil salesman. As if!",0.750000,0.750000,0.000000,0.000000,0.0,0.1,0.00000,0.100000,10
9904,"LW2: Your daughter is a nut. Amy's right, she needs to get help.",0.714286,0.714286,0.014286,0.000000,0.0,0.1,0.00000,0.100000,10
9907,"What a piece of GARBAGE! Obviously written by a left wing nut that resents the knowledge, experience and power that comes with getting older. It seems that she (and many other clueless people) think that 'aging', working your way up and being a useful part of society in your 'golden years' is something to be ashamed of.\n\nBeing 'old' and part of society is not a dirty word, it's something to be proud of. Remember lady one day you will be an 'old white woman' how you going to feel then?",0.728571,0.657143,0.000000,0.000000,0.0,0.1,0.00000,0.100000,10
9944,"great, this guy was a phony, just like trump. and like trump he is a racist, bigot and stupid.",0.866667,0.866667,0.000000,0.000000,0.1,0.0,0.00000,0.100000,10
...,...,...,...,...,...,...,...,...,...,...
1999269,"Pedophiles are rarely interested in a child's sexual orientation; they are interested in satisfying their cravings in an illegal and socially unacceptable manner.\nSince there are ever so many more heterosexuals in the world than there are homosexuals, most child molestation is perpetrated by heterosexuals; also most is perpetrated by men. So, you are suggesting that heterosexual men are trying to change the sexual orientation of children?\nAnother deep breath might be in order.\nIt would be most worthwhile to read the following article by a research psychologist at the University of Cali...",0.400000,0.000000,0.000000,0.000000,0.0,0.1,0.00000,0.100000,10
1999375,"I'm glad you brought this up, Mike AA. And there's more. As someone who has lost my husband to cancer and have been through the indescribable anguish and unbearable pain of sitting at the bedside of your terminally ill loved one you're about to have to let go, what I found particularly outrageous among the many new lows Trump has stooped to in the last few days: His call to terminally ill people to stick around long enough so they can vote for him on November 8. This man is devoid of even the smallest spark of compassion and decency.",0.400000,0.400000,0.000000,0.100000,0.0,0.0,0.00000,0.100000,10
1999377,"Paul: I mention Jim Jones as the extreme example of the power of charismatic demogoguery. We already saw Trump invite his supporters to physically attack protesters at his rallies.....and they did so. I don't think he'll be passing out KoolAid, but I do fear if he's defeated he may very well refer to that ""fixed"" election which could easily encourage violent behavior from the most zealous of his followers. \n\nRe: the crying baby. If, indeed, he was ""goofin"", it was in extremely poor taste. Embarassing someone for a ""laugh"" is not funny. The woman had come to his rally. She was a T...",0.400000,0.400000,0.000000,0.000000,0.1,0.0,0.00000,0.100000,10
1999514,I just don't find her a very good representation of the transexual community. She just seems so self-absorbed & concerned with such superficial issues.,0.400000,0.100000,0.000000,0.003717,0.0,0.0,0.00000,0.003717,269


For example, this comment is not disability-related at all:

In [25]:
list(jigsaw_disability_df.loc[[1307202]]['comment_text'])[0]

'Real men eat oil for breakfast!'

This comment is longer, but it still is not disability-related:

In [26]:
list(jigsaw_disability_df.loc[[1627111]]['comment_text'])[0]

"I wouldn't say that at all.  High school students have traditionally been held responsible, and often expelled, for their conduct out of school.  Black students have a right not to feel threatened by fellow students, just as black co-workers have.  You make a mistake in saying the right is restricted to commercial interest - in both cases."

### Drop the rows that are unrelated to disability

In [27]:
jigsaw_disability_cleaned_df = jigsaw_disability_df.loc[jigsaw_disability_df['disability_subtypes_total']>0.1]
print('# disability rows before: ', len(jigsaw_disability_df))
print('# disability rows after: ', len(jigsaw_disability_cleaned_df))

# disability rows before:  18665
# disability rows after:  15158


After exploring thresholds, positive labels for `toxicity_binary` **should be `toxicity >=  0.25`**, not `toxicity >= 0.5`.

In [28]:
# test_filter_condition = jigsaw_disability_cleaned_df['toxicity_binary']==1
# test_filter_condition = jigsaw_disability_cleaned_df['toxicity_binary']==0
# test_filter_condition = jigsaw_disability_cleaned_df['toxicity']==0)
# test_filter_condition = (jigsaw_disability_cleaned_df['toxicity']>0) & (jigsaw_disability_cleaned_df['toxicity']<0.5)
# test_filter_condition = jigsaw_disability_cleaned_df['toxicity']>0
# test_filter_condition = (jigsaw_disability_cleaned_df['toxicity']>0) & (jigsaw_disability_cleaned_df['toxicity']<0.3) # these should all be negative
# test_filter_condition = (jigsaw_disability_cleaned_df['toxicity']>0.3) & (jigsaw_disability_cleaned_df['toxicity']<0.35) # should be positive
# test_filter_condition = (jigsaw_disability_cleaned_df['toxicity']>0.2) & (jigsaw_disability_cleaned_df['toxicity']<0.25) # GREY AREA
# test_filter_condition = jigsaw_disability_cleaned_df['toxicity']>0.22) & (jigsaw_disability_cleaned_df['toxicity']<0.25) # some should be negative, some positive
# test_filter_condition = jigsaw_disability_cleaned_df['toxicity']==0 # 8379 rows - CLEARLY NEGATIVE
# test_filter_condition = jigsaw_disability_cleaned_df['toxicity']<0.25 # 12007 rows - NEGATIVE
test_filter_condition = jigsaw_disability_cleaned_df['toxicity']<0.05
# test_filter_condition = (jigsaw_disability_cleaned_df['toxicity']>=0.25) # 6491 rows - POSITIVE
# test_filter_condition = disability_condition # <-- FOR BOTH NEGATIVE AND POSITIVE

jigsaw_disability_cleaned_df.loc[test_filter_condition,
                    ['comment_text','toxicity','identity_attack','insult', 'threat','physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability','identity_annotator_count','toxicity_annotator_count']]

Unnamed: 0,comment_text,toxicity,identity_attack,insult,threat,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
141881,"SNAP solves a very real and very immediate problem -- that of feeding people who cannot buy food with their own resources. These are real people, from infants to very old, working people, disabled people, those disadvantaged by societal expectations they can't meet, et., etc., etc. I spent the first month of food bank (SNAP and local charity funded) volunteer service in shock, meeting clients and helping them obtain VERY LITTLE food that needed to last them a month. \n\nFamiliarity along with empathy really do help achieve genuine ""charity"" -- the virtue you preach about interminably.",0.008666,0.004333,0.004333,0.000867,0.1,0.0,0.000000,0.100000,10,1154
207222,"Finally a politician who cares what constituents want! No one thinks people with a history of mental illness should have guns. No one needs an automatic weapon. Thanks , Manka!",0.029412,0.000000,0.029412,0.000000,0.0,0.0,0.500000,0.000000,10,34
272394,"I can't help but feel that if life is so bad that people have to get high to bear it, then something is dreadfully wrong with our society as a whole. I more than understand that life can have it's times of depression for a myriad of reasons, but to give one's soul and Spirit to some form of drug, just seems like an act of desperation. Why is there so much of it, especially among the younger generations?",0.023288,0.002740,0.016438,0.000685,0.0,0.0,0.000000,0.166667,6,1460
536064,"Here's other ideas: proper housing, clean water, roads on reserves, actual schools with tenured teachers, lower food prices in northern stores, mental health counsellors, doctors, nurses, hospitals, long term care facilities, community centres ...\n\nMs. Stronach, please put your considerable means and influence towards these improvements first before you start on your laptop program.",0.009332,0.000718,0.008615,0.000718,0.0,0.0,0.700000,0.000000,10,1393
715508,"You're right Greeleaf. I hadn't thought that he/she may be on medication or have a mental health problem. I did not intend to demean him/her ... just trying to help.\nMy apologies.\n\nBest,\n\nRTD",0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,4,4
...,...,...,...,...,...,...,...,...,...,...,...
1883916,"Recent experience and on topic discussions have shown Alaska's fiscal crisis as a real threat to the quality of all our lives. This particular tragedy is one , I wish could be debated as , times are tough , and dollars to help people looking for it, or deemed in need of help , are not there. \nThat is not the case when talking Mental Health , because there has not been dollars for the Mentally Ill for quite some time. The State when allocating dollars signing up with the new federal program, allocated 0 dollars for Mental Health, Let's repeat that amount for it truly does tell the sto...",0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.800000,0.000000,10,4
1883957,"Wait, do I understand this right? This kid was on anti-psychotics when he committed the crime. He's now being held involuntarily in a mental hospital which wants to treat him by putting him back on anti-psychotics. And the hospital has to rely on the courts to give them the right to put this patient back on the medication he was getting before he was arrested?\n\nI'm all for mental patients' civil rights, but this does seem pretty ridiculous.",0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.700000,0.000000,10,4
1883959,"There is so much more to why people are in prison and commit crimes. Many of the prisoners are people with mental illness that the courts deem too dangerous to let out. Thus they are given extreme sentences and left to rot inside. Most are damaged psychically and emotionally and often due to negligence or abuse as children. Understanding that these people are committing crimes in our society we need to deal with that, but we need to change the way we look at crime therapy and getting people back to being members of our communities. Remember that all of these people were once your neighbors...",0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.600000,0.000000,10,4
1883976,"What is the theological misconception? What is the scientific reality? We know that people are born with disabilities both physical and mental, that is the reality. The reality also is that someone with an perfectly normal male body is of the male sex and not female. The same goes for a woman with a perfectly normal female body is of the female sex and not male.\nA middle-aged married man with children who suddenly decides he is in the wrong body obviously has a mental problem. To humour him and encourage him to pretend that he is a woman is cruel. Medical science can give him the cosmeti...",0.000000,0.000000,0.000000,0.000000,0.6,0.0,0.700000,0.000000,10,4


### Modify `toxicity_binary` column for disability
Here we set the binary toxicity label for comments where disability is mentioned as determined by human annotators.

- Initially, we were going to set the toxicity label according to the dataset designers' recommendation where comments with toxicity >= 0.5 are considered toxict. However, EDA revealed that many comments with toxicity < 0.5 are toxic towards people with disabilites. Therefore, 0.5 is not the appropriate threshold here.
- After careful inspection of the comments where disability is mentioned, determining whether a comment is toxic is less obvious/in a grey area at toxicity=0.25.
- Therefore, we create a toxicity_disability column that maps comments where toxicity < 0.25 to 0 (negative/non-toxic) and comments with toxicity >= 0.25 to 1 (positive/toxic). This column will serve as our labels to train and evaluate on.

In [29]:
jigsaw_disability_cleaned2_df = jigsaw_disability_cleaned_df.copy()
jigsaw_disability_cleaned2_df['toxicity_binary'] = (jigsaw_disability_cleaned2_df['toxicity'] >= 0.25).astype(int)
jigsaw_disability_df = jigsaw_disability_cleaned2_df

In [30]:
jigsaw_disability_df[['toxicity','toxicity_binary']]

Unnamed: 0,toxicity,toxicity_binary
7705,0.689655,1
8073,0.800000,1
8115,0.790323,1
8125,0.352941,1
8263,0.842857,1
...,...,...
1999476,0.400000,1
1999478,0.400000,1
1999482,0.400000,1
1999507,0.400000,1


## Prepare non-disability subsets

Create gender subset:

In [31]:
jigsaw_gender_df = jigsaw_all_data_df_cleansed[(jigsaw_all_data_df_cleansed['male'] > 0) | 
           (jigsaw_all_data_df_cleansed['female'] > 0) | 
           (jigsaw_all_data_df_cleansed['transgender'] > 0) | 
           (jigsaw_all_data_df_cleansed['other_gender'] > 0)]
jigsaw_gender_df.shape

(137722, 36)

Create sexual orientation subset:

In [32]:
jigsaw_sexual_orientation_df = jigsaw_all_data_df_cleansed[(jigsaw_all_data_df_cleansed['heterosexual'] > 0) | 
           (jigsaw_all_data_df_cleansed['homosexual_gay_or_lesbian'] > 0) | 
           (jigsaw_all_data_df_cleansed['bisexual'] > 0) | 
           (jigsaw_all_data_df_cleansed['other_sexual_orientation'] > 0)]
jigsaw_sexual_orientation_df.shape

(22649, 36)

Create religion subset:

In [33]:
jigsaw_religion_df = jigsaw_all_data_df_cleansed[(jigsaw_all_data_df_cleansed['christian'] > 0) | 
           (jigsaw_all_data_df_cleansed['jewish'] > 0) | 
           (jigsaw_all_data_df_cleansed['muslim'] > 0) | 
           (jigsaw_all_data_df_cleansed['hindu'] > 0) | 
           (jigsaw_all_data_df_cleansed['buddhist'] > 0) | 
           (jigsaw_all_data_df_cleansed['atheist'] > 0) | 
           (jigsaw_all_data_df_cleansed['other_religion'] > 0)]
jigsaw_religion_df.shape

(101410, 36)

Create race subset:

In [34]:
jigsaw_race_df = jigsaw_all_data_df_cleansed[(jigsaw_all_data_df_cleansed['black'] > 0) | 
           (jigsaw_all_data_df_cleansed['white'] > 0) | 
           (jigsaw_all_data_df_cleansed['asian'] > 0) | 
           (jigsaw_all_data_df_cleansed['latino'] > 0) | 
           (jigsaw_all_data_df_cleansed['other_race_or_ethnicity'] > 0)]
jigsaw_race_df.shape

(71648, 36)

## Export jigsaw identity datasets to csv

In [35]:
jigsaw_disability_df.to_csv('drive/MyDrive/data/disability-dataset-jigsaw.csv')
jigsaw_gender_df.to_csv('drive/MyDrive/data/gender-dataset-jigsaw.csv')
jigsaw_sexual_orientation_df.to_csv('drive/MyDrive/data/sexual_orientation-dataset-jigsaw.csv')
jigsaw_religion_df.to_csv('drive/MyDrive/data/religion-dataset-jigsaw.csv')
jigsaw_race_df.to_csv('drive/MyDrive/data/race-dataset-jigsaw.csv')