## Preprocessing

In [1]:
from preprocessing.hateval2019 import read_dataset

# Relative location of the input CSV (the HatEval 2019 English training split, going by its file name).
path = './data/hateval2019/hateval2019_en_train.csv'

In [2]:
# Column names used throughout the notebook: the tweet text and the label
# that every per-class statistic is grouped by.
tweet_col, label_col = 'text', 'HS'

In [3]:
# Load the CSV at `path` through the project-specific reader.
# NOTE(review): the return type is not visible here; the rest of the notebook uses it as a pandas DataFrame.
data = read_dataset(path)

In [4]:
# Preview the first rows to check which columns the reader produced.
data.head()

Unnamed: 0,id,text,HS,TR,AG
0,201,"Hurray, saving us $$$ in so many ways #USER# #...",1,0,0
1,202,Why would young fighting age men be the vast m...,1,0,0
2,203,#USER# Illegals Dump their Kids at the border ...,1,0,0
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,0,0
4,205,Orban in Brussels: European leaders are ignori...,0,0,0


In [5]:
# Number of rows per value of the label column (class balance).
data[label_col].value_counts()

0    5217
1    3783
Name: HS, dtype: int64

## Stats

In [6]:
from eda.get_stats_tweets import *

In [7]:
def _display_mean(df, col):
    res = {}
    for x in df[label_col].unique():
        res.update({x: data[data[label_col]==x][col].mean()})
    return res


In [8]:
# Per-label mean of the TR and AG columns.
for heading, column_name in (('TR:', 'TR'), ('AG:', 'AG')):
    print(heading, _display_mean(data, column_name))

TR: {1: 0.35448057097541635, 0: 0.0}
AG: {1: 0.4121067935500925, 0: 0.0}


In [9]:
# Derive one count column per token counter, applied to every tweet.
token_counters = (
    ('hashtags', count_per_tweet_hashtags),
    ('urls', count_per_tweet_urls),
    ('users', count_per_tweet_users),
    ('rt', count_per_tweet_rt),
)
for feature_name, counter_fn in token_counters:
    data[feature_name] = data[tweet_col].apply(counter_fn)


# print out
token_headings = (
    ('#HASHTAG#:', 'hashtags'),
    ('#URL#:', 'urls'),
    ('#USER#:', 'users'),
    ('##RT##:', 'rt'),
)
for heading, feature_name in token_headings:
    print(heading, _display_mean(data, feature_name))

#HASHTAG#: {1: 1.1186888712661909, 0: 0.4864864864864865}
#URL#: {1: 0.3013481363996828, 0: 0.7021276595744681}
#USER#: {1: 0.7774253238170764, 0: 0.6024535173471344}
##RT##: {1: 0.0050224689399947136, 0: 0.009200690051753882}


In [10]:
# Derive character- and word-level count columns from the tweet text.
size_counters = (
    ('uppercase_chars', count_per_tweet_uppercase_chars),
    ('chars', count_per_tweet_chars),
    ('uppercase_words', count_per_tweet_uppercase_words),
    ('words', count_per_tweet_words),
)
for feature_name, counter_fn in size_counters:
    data[feature_name] = data[tweet_col].apply(counter_fn)

# print out
size_headings = (
    ('upper_chars:', 'uppercase_chars'),
    ('chars:', 'chars'),
    ('upper_words:', 'uppercase_words'),
    ('words:', 'words'),
)
for heading, feature_name in size_headings:
    print(heading, _display_mean(data, feature_name))

upper_chars: {1: 7.506740681998414, 0: 5.11366685834771}
chars: {1: 111.5289452815226, 0: 108.19685643089899}
upper_words: {1: 3.9682791435368756, 0: 3.4182480352693116}
words: {1: 22.177636796193497, 0: 20.711520030668968}


In [11]:
# Count stop-words per tweet and store the result as a new column.
stopword_counts_per_tweet = data[tweet_col].apply(count_per_tweet_stopwords)
data['stopwords'] = stopword_counts_per_tweet


# print out
mean_stopwords_by_label = _display_mean(data, 'stopwords')
print('number of stop-words:', mean_stopwords_by_label)


number of stop-words: {1: 9.461538461538462, 0: 8.74238067855089}


In [12]:
# Count emojis per tweet and store the result as a new column.
emoji_counts_per_tweet = data[tweet_col].apply(count_per_tweet_emojis)
data['emojis'] = emoji_counts_per_tweet

# print out
mean_emojis_by_label = _display_mean(data, 'emojis')
print('number of emojis:', mean_emojis_by_label)

number of emojis: {1: 0.14697330161247688, 0: 0.12363427257044278}


In [13]:
# get_per_tweet_sentiments yields three values per tweet; transpose the
# per-tweet triples into three column-wise sequences before storing them.
sentiment_triples = data[tweet_col].map(get_per_tweet_sentiments)
negative_values, neutral_values, positive_values = zip(*sentiment_triples)
data['sentiment_negative'] = negative_values
data['sentiment_neutral'] = neutral_values
data['sentiment_positive'] = positive_values

# A single raw sentiment value per tweet.
raw_sentiment_values = data[tweet_col].apply(get_per_tweet_sentiments_raw)
data['sentiment'] = raw_sentiment_values


# print out
for sentiment_column in (
    'sentiment_negative',
    'sentiment_neutral',
    'sentiment_positive',
    'sentiment',
):
    print(sentiment_column + ':', _display_mean(data, sentiment_column))


sentiment_negative: {1: 0.42585249801744646, 0: 0.2744872532106575}
sentiment_neutral: {1: 0.2712133227597145, 0: 0.3530764807360552}
sentiment_positive: {1: 0.302934179222839, 0: 0.37243626605328733}
sentiment: {1: -0.05425643999511804, 0: 0.01816755009090132}


In [14]:
# get_per_tweet_named_entities yields four values per tweet, in the order
# out['PER'], out['ORG'], out['LOC'], out['MISC']
entity_columns = ('NER_PER', 'NER_ORG', 'NER_LOC', 'NER_MISC')
entity_values_by_type = tuple(zip(*data[tweet_col].map(get_per_tweet_named_entities)))
# Unpack into exactly four sequences first, so a result of the wrong width
# fails before any column is written.
per_values, org_values, loc_values, misc_values = entity_values_by_type
for entity_column, entity_values in zip(
    entity_columns, (per_values, org_values, loc_values, misc_values)
):
    data[entity_column] = entity_values

# print out
for entity_column in entity_columns:
    print(entity_column + ':', _display_mean(data, entity_column))


NER_PER: {1: 0.24742268041237114, 0: 0.2762123825953613}
NER_ORG: {1: 0.22944752841660057, 0: 0.1960897067280046}
NER_LOC: {1: 0.2920962199312715, 0: 0.3049645390070922}
NER_MISC: {1: 0.6500132170235263, 0: 0.5012459267778416}


## Save features

In [15]:
# Columns to persist, in output order: label and text first, then the
# TR/AG columns, followed by every feature derived above.
export_columns = [label_col, tweet_col, 'AG', 'TR']
export_columns += ['hashtags', 'urls', 'users', 'rt']
export_columns += ['uppercase_chars', 'chars', 'uppercase_words', 'words']
export_columns += ['stopwords', 'emojis']
export_columns += ['sentiment_negative', 'sentiment_neutral', 'sentiment_positive', 'sentiment']
export_columns += ['NER_PER', 'NER_ORG', 'NER_LOC', 'NER_MISC']

# Write the selected columns without the row index.
features_to_save = data[export_columns]
features_to_save.to_csv('./data/hateval2019_en_train_preprocessed.csv', index=False)