# Building Hate Speech Detection Systems


To create a local copy of this notebook: File > Save a copy to Drive 

In [1]:
# Imports
import os
import pandas as pd

## Getting hands on the data

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!git clone https://github.com/preyero/multidisciplinary-data-challenge.git

In [3]:
PATH_TO_FOLDER = '/content/gdrive/MyDrive/path_to_folder/challenge'
d_train = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'data', 'train.csv'))
print(d_train.shape)

d_train_aggr = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'data', 'train_aggregate.csv'))
print(d_train_aggr.shape)

(93819, 131)
(26508, 66)


higher = more hateful and lower = less hateful.
* over 0.5 is approximately hate speech,
* < -1 is counter or supportive speech,
* and -1 to +0.5 is neutral or ambiguous.

## Some data variables

In [None]:
# Variable names
text_col, id_col = 'text', 'comment_id'
model_outputs = ['sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score']
target_identities = ['target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race',
                     'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion',
                     'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin',
                     'target_gender_men', 'target_gender_non_binary', 'target_gender_transgender_men', 'target_gender_transgender_unspecified', 'target_gender_transgender_women', 'target_gender_women', 'target_gender_other', 'target_gender',
                     'target_sexuality_bisexual', 'target_sexuality_gay', 'target_sexuality_lesbian', 'target_sexuality_straight', 'target_sexuality_other', 'target_sexuality',
                     'target_age_children', 'target_age_teenagers', 'target_age_young_adults', 'target_age_middle_aged', 'target_age_seniors', 'target_age_other', 'target_age',
                     'target_disability_physical', 'target_disability_cognitive', 'target_disability_neurological', 'target_disability_visually_impaired', 'target_disability_hearing_impaired', 'target_disability_unspecific', 'target_disability_other', 'target_disability']

annotator_demographics = ['annotator_gender', 'annotator_trans', 'annotator_educ', 'annotator_income', 'annotator_ideology',
                          'annotator_gender_men', 'annotator_gender_women', 'annotator_gender_non_binary', 'annotator_gender_prefer_not_to_say', 'annotator_gender_self_describe', 'annotator_transgender', 'annotator_cisgender', 'annotator_transgender_prefer_not_to_say',
                          'annotator_education_some_high_school', 'annotator_education_high_school_grad', 'annotator_education_some_college', 'annotator_education_college_grad_aa', 'annotator_education_college_grad_ba', 'annotator_education_professional_degree', 'annotator_education_masters', 'annotator_education_phd',
                          'annotator_income_<10k', 'annotator_income_10k-50k', 'annotator_income_50k-100k', 'annotator_income_100k-200k', 'annotator_income_>200k',
                          'annotator_ideology_extremeley_conservative', 'annotator_ideology_conservative', 'annotator_ideology_slightly_conservative', 'annotator_ideology_neutral', 'annotator_ideology_slightly_liberal', 'annotator_ideology_liberal', 'annotator_ideology_extremeley_liberal', 'annotator_ideology_no_opinion',
                          'annotator_race_asian', 'annotator_race_black', 'annotator_race_latinx', 'annotator_race_middle_eastern', 'annotator_race_native_american', 'annotator_race_pacific_islander', 'annotator_race_white', 'annotator_race_other',
                          'annotator_age', 'annotator_religion_atheist', 'annotator_religion_buddhist', 'annotator_religion_christian', 'annotator_religion_hindu', 'annotator_religion_jewish', 'annotator_religion_mormon', 'annotator_religion_muslim', 'annotator_religion_nothing', 'annotator_religion_other',
                          'annotator_sexuality_bisexual', 'annotator_sexuality_gay', 'annotator_sexuality_straight', 'annotator_sexuality_other']


In [None]:
d_train_aggr[model_outputs].describe

<bound method NDFrame.describe of        sentiment   respect    insult  humiliate    status  dehumanize  \
0       3.333333  3.333333  3.333333   2.666667  2.666667    1.666667   
1       2.333333  3.000000  3.000000   2.666667  2.333333    1.666667   
2       3.500000  2.500000  2.000000   2.500000  2.500000    2.000000   
3       3.333333  3.000000  1.333333   1.000000  2.333333    1.333333   
4       3.333333  3.666667  3.333333   2.666667  3.333333    2.666667   
...          ...       ...       ...        ...       ...         ...   
26503   2.333333  1.666667  1.000000   0.666667  2.000000    0.333333   
26504   2.500000  3.500000  3.250000   2.500000  2.000000    1.250000   
26505   4.000000  3.750000  3.500000   3.500000  3.000000    2.750000   
26506   0.500000  0.500000  0.000000   0.000000  1.500000    0.000000   
26507   4.000000  4.000000  4.000000   3.666667  3.333333    1.000000   

       violence  genocide  attack_defend  hatespeech  hate_speech_score  
0      0.666667