## Preprocessing

In [1]:
from preprocessing.hs import read_dataset
import pandas as pd

# Input JSON files, one per class. The class label attached to each file is
# assigned when the files are loaded further down.
path0, path1, path2 = (
    './data/hs/neither.json',
    './data/hs/sexism.json',
    './data/hs/racism.json',
)


In [2]:
# Column names used throughout the notebook: the tweet text column and the
# class-label column.
tweet_col, label_col = 'text', 'HS'

In [3]:
# For each class file: read it, keep only the tweet text column, and add a
# constant label column naming the class the file represents.
df0, df1, df2 = (
    read_dataset(source)[[tweet_col]].assign(**{label_col: tag})
    for source, tag in (
        (path0, 'none'),
        (path1, 'sexism'),
        (path2, 'racism'),
    )
)


In [4]:
# Stack the three per-class frames into a single corpus.
# ignore_index=True assigns a fresh 0..n-1 index. Without it pd.concat keeps
# each input frame's own index, so row labels can repeat across classes and
# label-based lookups (e.g. data.loc[0]) may return several rows.
data = pd.concat([df0, df1, df2], ignore_index=True)

data.head()

Unnamed: 0,text,HS
0,Someone is going home #HASHTAG# ...that obviou...,none
1,They didn't even wash the chicken ðŸ˜© #HASHTAG#,none
2,#HASHTAG# Is honestly so fucking staged. The m...,none
3,Can someone smash that bottle of Rose &amp; Li...,none
4,Will someone pls assist Colin in the washing o...,none


In [5]:
# Number of tweets per class label.
data.loc[:, label_col].value_counts()

none      11501
sexism     3430
racism     1976
Name: HS, dtype: int64

## Stats

In [6]:
from eda.get_stats_tweets import *

In [7]:
def _display_mean(df, col):
    res = {}
    for x in df[label_col].unique():
        res.update({x: data[data[label_col]==x][col].mean()})
    return res


In [8]:
# Add one per-tweet column for each helper; what each helper measures is
# defined in eda.get_stats_tweets.
for _column, _counter in (
    ('hashtags', count_per_tweet_hashtags),
    ('urls', count_per_tweet_urls),
    ('users', count_per_tweet_users),
    ('rt', count_per_tweet_rt),
):
    data[_column] = data[tweet_col].apply(_counter)

# Print the per-class mean of each new column.
for _caption, _column in (
    ('#HASHTAG#:', 'hashtags'),
    ('#URL#:', 'urls'),
    ('#USER#:', 'users'),
    ('##RT##:', 'rt'),
):
    print(_caption, _display_mean(data, _column))

#HASHTAG#: {'none': 0.5565602991044257, 'sexism': 0.6338192419825073, 'racism': 0.14827935222672065}
#URL#: {'none': 0.18102773671854622, 'sexism': 0.10349854227405247, 'racism': 0.23481781376518218}
#USER#: {'none': 0.7901921572037214, 'sexism': 0.8276967930029154, 'racism': 1.757085020242915}
##RT##: {'none': 0.1629423528388836, 'sexism': 0.2813411078717201, 'racism': 0.10981781376518218}


In [9]:
# Add one per-tweet column for each character/word helper from
# eda.get_stats_tweets.
for _column, _counter in (
    ('uppercase_chars', count_per_tweet_uppercase_chars),
    ('chars', count_per_tweet_chars),
    ('uppercase_words', count_per_tweet_uppercase_words),
    ('words', count_per_tweet_words),
):
    data[_column] = data[tweet_col].apply(_counter)

# Print the per-class mean of each new column.
for _caption, _column in (
    ('upper_chars:', 'uppercase_chars'),
    ('chars:', 'chars'),
    ('upper_words:', 'uppercase_words'),
    ('words:', 'words'),
):
    print(_caption, _display_mean(data, _column))

upper_chars: {'none': 3.3131032084166594, 'sexism': 3.5912536443148686, 'racism': 3.6153846153846154}
chars: {'none': 68.67672376315103, 'sexism': 78.27725947521866, 'racism': 90.62044534412955}
upper_words: {'none': 1.9821754630032171, 'sexism': 2.0323615160349853, 'racism': 3.033906882591093}
words: {'none': 14.018954873489262, 'sexism': 16.121282798833818, 'racism': 17.941295546558706}


In [10]:
# Per-tweet value returned by count_per_tweet_stopwords, stored as a column.
_stopword_counts = data[tweet_col].apply(count_per_tweet_stopwords)
data['stopwords'] = _stopword_counts

# Print the per-class mean of the new column.
_stopword_means = _display_mean(data, 'stopwords')
print('number of stop-words:', _stopword_means)


number of stop-words: {'none': 6.15616033388401, 'sexism': 7.124781341107871, 'racism': 7.911943319838056}


In [11]:
# Per-tweet value returned by count_per_tweet_emojis, stored as a column.
_emoji_counts = data[tweet_col].apply(count_per_tweet_emojis)
data['emojis'] = _emoji_counts

# Print the per-class mean of the new column.
_emoji_means = _display_mean(data, 'emojis')
print('number of emojis:', _emoji_means)

number of emojis: {'none': 0.041474654377880185, 'sexism': 0.0239067055393586, 'racism': 0.004554655870445344}


In [12]:
# get_per_tweet_sentiments is unpacked into three values per tweet; zip(*...)
# transposes the per-tweet triples into three column-length tuples.
# NOTE(review): the negative/neutral/positive order is taken from the target
# column names - confirm against the helper.
_negative, _neutral, _positive = zip(*data[tweet_col].map(get_per_tweet_sentiments))
data['sentiment_negative'] = _negative
data['sentiment_neutral'] = _neutral
data['sentiment_positive'] = _positive

# One additional value per tweet from get_per_tweet_sentiments_raw.
data['sentiment'] = data[tweet_col].apply(get_per_tweet_sentiments_raw)

# Print the per-class mean of each sentiment column.
for _caption, _column in (
    ('sentiment_negative:', 'sentiment_negative'),
    ('sentiment_neutral:', 'sentiment_neutral'),
    ('sentiment_positive:', 'sentiment_positive'),
    ('sentiment:', 'sentiment'),
):
    print(_caption, _display_mean(data, _column))


sentiment_negative: {'none': 0.24215285627336752, 'sexism': 0.2991253644314869, 'racism': 0.29605263157894735}
sentiment_neutral: {'none': 0.4203112772802365, 'sexism': 0.36151603498542273, 'racism': 0.4397773279352227}
sentiment_positive: {'none': 0.33753586644639594, 'sexism': 0.33935860058309036, 'racism': 0.26417004048583}
sentiment: {'none': 0.03590589948873221, 'sexism': 0.007386581006840549, 'racism': -0.02519112576488589}


In [13]:
# get_per_tweet_named_entities is unpacked into four values per tweet.
# NOTE(review): presumably ordered PER, ORG, LOC, MISC, matching the target
# column names - confirm against the helper.
_per, _org, _loc, _misc = zip(*data[tweet_col].map(get_per_tweet_named_entities))
data['NER_PER'] = _per
data['NER_ORG'] = _org
data['NER_LOC'] = _loc
data['NER_MISC'] = _misc

# Print the per-class mean of each named-entity column.
for _column in ('NER_PER', 'NER_ORG', 'NER_LOC', 'NER_MISC'):
    print(_column + ':', _display_mean(data, _column))


NER_PER: {'none': 0.2561516389879141, 'sexism': 0.2096209912536443, 'racism': 0.28795546558704455}
NER_ORG: {'none': 0.13946613337970612, 'sexism': 0.1274052478134111, 'racism': 0.14423076923076922}
NER_LOC: {'none': 0.09999130510390401, 'sexism': 0.053644314868804666, 'racism': 0.18775303643724697}
NER_MISC: {'none': 0.29380053908355797, 'sexism': 0.4122448979591837, 'racism': 0.9195344129554656}


## Save features

In [14]:
# Columns written to disk: the label, the tweet text, then every feature
# column added by the cells above, in the order they were computed.
_output_columns = [
    label_col,
    tweet_col,
    'hashtags',
    'urls',
    'users',
    'rt',
    'uppercase_chars',
    'chars',
    'uppercase_words',
    'words',
    'stopwords',
    'emojis',
    'sentiment_negative',
    'sentiment_neutral',
    'sentiment_positive',
    'sentiment',
    'NER_PER',
    'NER_ORG',
    'NER_LOC',
    'NER_MISC',
]

# index=False leaves the row index out of the CSV.
data[_output_columns].to_csv('./data/hs__preprocessed.csv', index=False)