In [1]:
import numpy as np
import pandas as pd


- **EDA**
    - Data Loading
    - Dtype check
    - Missing value check
    - null value check
    - unique value check
    - drop duplicate
    - drop null value
    - check for outliers
    - check for correlation
- **Modeling**
    - data prep
    - model selection
        - baseline model based on the task
    - model training on train data
    - model evaluation on test data 


In [38]:
pd.set_option('display.max_columns', 200)

In [2]:
# load data
binary_data = pd.read_csv('../data/Ethos_Binary.csv', delimiter=';')
multilabel_data  = pd.read_csv('../data/Ethos_Multi_Label.csv', delimiter=';')

In [3]:
binary_data.head(5)

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0


In [39]:
multilabel_data

Unnamed: 0,comment,violence,directed_vs_generalized,gender,race,national_origin,disability,religion,sexual_orientation
0,You should know women's sports are a joke,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0
1,You look like Sloth with deeper Down’s syndrome,0.000000,0.833333,0.000000,0.000000,0.000000,1.000000,0.000000,0
2,You look like Russian and speak like Indian. B...,1.000000,0.500000,0.142857,0.142857,0.714286,0.000000,0.000000,0
3,"Women deserve to be abused, I guess.",0.666667,0.166667,1.000000,0.000000,0.000000,0.000000,0.000000,0
4,Women are made for making babies and cooking d...,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...
428,All people are fucked up because of these quee...,0.000000,0.000000,0.571429,0.000000,0.000000,0.000000,0.000000,0.571428571
429,Actually THIS WEIRD PEOPLE are HARMFUL! They n...,0.000000,1.000000,0.000000,0.000000,0.333333,0.666667,0.000000,0
430,ABU DA BI ABA DU BA BA HAMUD! See? I’m an arab...,0.000000,0.000000,0.000000,0.285714,0.000000,0.000000,0.857143,0
431,a homo? so you fuck your brother instead of yo...,0.000000,0.666667,0.000000,0.000000,0.000000,0.000000,0.000000,1


In [21]:
#check dtypes
binary_data.dtypes
multilabel_data.dtypes

comment                     object
violence                   float64
directed_vs_generalized    float64
gender                     float64
race                       float64
national_origin            float64
disability                 float64
religion                   float64
sexual_orientation          object
dtype: object

In [9]:
multilabel_data.columns

Index(['comment', 'violence', 'directed_vs_generalized', 'gender', 'race',
       'national_origin', 'disability', 'religion',
       'sexual_orientation,,,,,,,,,,'],
      dtype='object')

In [15]:
#remname the headache
multilabel_data.rename(columns={'sexual_orientation,,,,,,,,,,':'sexual_orientation'}, inplace=True)

In [16]:
multilabel_data.columns

Index(['comment', 'violence', 'directed_vs_generalized', 'gender', 'race',
       'national_origin', 'disability', 'religion', 'sexual_orientation'],
      dtype='object')

In [18]:
multilabel_data['sexual_orientation'] = multilabel_data['sexual_orientation'].str.replace(',', '')

In [35]:
binary_data['isHate'].value_counts()

isHate
0.000000    354
1.000000    163
0.166667    106
0.833333    100
0.333333     80
0.500000     74
0.666667     70
0.250000      6
0.750000      6
0.857143      3
0.018868      2
0.400000      2
0.903846      2
0.846154      1
0.966667      1
0.016393      1
0.973333      1
0.026316      1
0.029851      1
0.030303      1
0.031746      1
0.037736      1
0.038961      1
0.090909      1
0.103448      1
0.111111      1
0.152542      1
0.160714      1
0.200000      1
0.849057      1
0.954545      1
0.296875      1
0.302326      1
0.945455      1
0.978261      1
0.983871      1
0.530612      1
0.603448      1
0.937500      1
0.678571      1
0.722222      1
0.821429      1
0.983607      1
Name: count, dtype: int64

In [34]:
# Define a tolerance level
tolerance = 1e-6  # For example, 1e-6 (0.000001)

# Filter the DataFrame with a tolerance level
filtered_data = binary_data[(binary_data['isHate'] >= 0.018868 - tolerance) & 
                            (binary_data['isHate'] <= 0.018868 + tolerance)]

# Get the value counts of the filtered data
value_counts = filtered_data['isHate'].value_counts()
print(value_counts)


isHate
0.018868    2
Name: count, dtype: int64


In [31]:
multilabel_data['religion'].value_counts()

religion
0.000000    332
1.000000     39
0.666667      8
0.833333      6
0.750000      5
0.857143      5
0.500000      4
0.600000      3
0.285714      2
0.642857      2
0.400000      2
0.333333      2
0.800000      2
0.250000      2
0.444444      1
0.109091      1
0.571429      1
0.829231      1
0.428571      1
0.214286      1
0.380952      1
0.720000      1
0.142857      1
0.685714      1
0.342857      1
0.171429      1
0.514286      1
0.618648      1
0.166667      1
0.781841      1
Name: count, dtype: int64

In [40]:
#counting the null values 
multilabel_data.isna().sum()

comment                    0
violence                   3
directed_vs_generalized    3
gender                     3
race                       3
national_origin            3
disability                 3
religion                   3
sexual_orientation         3
dtype: int64

In [47]:
#check for duplicated
multilabel_data.duplicated(subset=['directed_vs_generalized', 'gender', 'race']) 

0      False
1      False
2      False
3      False
4       True
       ...  
428    False
429     True
430     True
431     True
432    False
Length: 433, dtype: bool

In [43]:
multilabel_data.columns

Index(['comment', 'violence', 'directed_vs_generalized', 'gender', 'race',
       'national_origin', 'disability', 'religion', 'sexual_orientation'],
      dtype='object')

In [71]:
#building dataset for the tast
sexist = pd.DataFrame(multilabel_data[multilabel_data['gender']>=.5]['comment'])


In [72]:
racist = pd.DataFrame(multilabel_data[multilabel_data['race']>= .5]['comment'])

In [73]:
not_hate = binary_data[binary_data['isHate']==0]

In [74]:
sexist['category'] = 'sexist'

In [76]:
racist['category'] = 'racist'

In [78]:
comments = pd.concat([sexist, racist])

In [81]:
comments.reset_index(drop=True)

Unnamed: 0,comment,category
0,You should know women's sports are a joke,sexist
1,"Women deserve to be abused, I guess.",sexist
2,Women are made for making babies and cooking d...,sexist
3,What about a global massacre of men? Pussiezzz...,sexist
4,We men enjoy the reproductive rights of paying...,sexist
...,...,...
155,It's so sick and sad that young people will th...,racist
156,I guess God is white....,racist
157,Go bleach yourself black pussy doll and then c...,racist
158,"Believe me, whites are superior in every way t...",racist
