In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm, tqdm_notebook

# Local imports
from preprocessing import clean_tweets, one_hot_encode, make_debug_df

### First load and clean the hatebase terms

In [4]:
# Clean the hatebase slurs
def clean_slurs(text):
    """Normalise a slur: trim outer whitespace, lowercase, join words with '_'."""
    words = text.strip().lower().split(' ')
    return '_'.join(words)

slurs = pd.read_csv('../data/original_hatebase_slurs.txt', header=None)

# Clean slurs
# Strip as well as lowercase the raw terms: an entry with stray leading or
# trailing whitespace would otherwise become a slur_map key (e.g. 'term ' ->
# 'term'), and replacing it in a tweet would glue the term onto its neighbour.
slur_list = [s.strip().lower() for s in slurs[0].values]
cleaned_slurs = [clean_slurs(s) for s in slur_list]
# Naive plural forms: every cleaned term with 's' and with 'es' appended.
pluralize_slurs = [s + end for s in cleaned_slurs for end in ['s', 'es']]
# De-duplicate before sorting: a generated plural can coincide with a term
# that is already in the list, and a repeated entry would be reported twice
# by extract_slurs and double-counted in the frequency tables.
full_slur_list = sorted(set(pluralize_slurs + cleaned_slurs))

# Outputs
# Raw term -> cleaned term, only for the terms that cleaning actually changed.
slur_map = {s: cs for s, cs in zip(slur_list, cleaned_slurs) if s != cs}
#out_slurs = pd.DataFrame(full_slur_list)
#out_slurs.to_csv('data/hatebase_slurs.txt', index=None, header=None, encoding='utf-8')

In [3]:
def clean_slurs_in_context(text):
    """Replace slurs with their cleaned versions.

    Every key of the module-level ``slur_map`` that occurs in ``text`` is
    replaced by its mapped value using plain substring replacement.

    Keys are applied longest first. Otherwise a key that is a prefix of a
    longer key (e.g. 'a b' vs 'a b c') could rewrite part of the longer phrase
    before it is tried, and the longer phrase would then never match.

    Args:
        text: The string to rewrite.

    Returns:
        The text with every occurrence of each key replaced by its value.
    """
    # sorted() is stable, so equally long keys keep slur_map's own order.
    for k, v in sorted(slur_map.items(), key=lambda kv: len(kv[0]), reverse=True):
        text = text.replace(k, v)
    return text

def extract_slurs(text):
    """Return every entry of ``full_slur_list`` that occurs as a token in the text.

    The text is tokenised by splitting on single spaces; matches are returned
    in the order they have in ``full_slur_list``.
    """
    tokens = set(text.split(' '))
    return [slur for slur in full_slur_list if slur in tokens]

### Davidson et al data

In [None]:
# Directory of the Davidson et al. data; the CSVs written further down use the same prefix
path = '../data/davidson/'
# Labeled tweets file inside that directory
fname = '{}labeled_data.csv'.format(path)

In [None]:
# Load the data
# Rows are shuffled once at load time; the fixed random_state makes the
# shuffled row order the same on every run instead of changing per execution.
df = pd.read_csv(fname, encoding='utf-8', index_col='Unnamed: 0').sample(
    frac=1, random_state=42)
# Maps the integer values of the 'class' column to readable label names
label_map = {0: 'hate_speech', 1: 'offensive_language', 2: 'neither'}

In [None]:
# Clean the tweets
df = clean_tweets(df)

# Convert columns to one hot encoding
df[['hate_speech', 'offensive_language', 'neither']] = one_hot_encode(df['class'])
# Readable label name for each integer class
df['label'] = df['class'].apply(lambda cls: label_map[cls])

# Clean hate speech terms, and extract slurs
df['tweet'] = df['tweet'].apply(clean_slurs_in_context)
df['slurs'] = df['tweet'].apply(extract_slurs)

# Re-order the DataFrame, and drop some columns
column_order = [
    'tweet', 'label', 'mentions', 'hashtags', 'slurs', 'original_tweet',
    'hate_speech', 'offensive_language', 'neither',
]
df = df[column_order]

In [None]:
# Make a test/dev/train split
# Each row goes to train with probability train_perc, so the split sizes are
# approximate; the remaining rows are halved into dev and test.
train_perc = 0.80
# Draw the mask from a dedicated, seeded generator so the same row positions
# are selected on every run (the global NumPy RNG state is left untouched).
rng = np.random.RandomState(42)
msk = rng.rand(len(df)) < train_perc
not_train = df[~msk]
half = len(not_train) // 2

# Give all three splits a fresh 0..n-1 index. Rebinding the names (instead of
# an in-place reset on frames selected from `df`) also covers `dev`, which
# previously kept its original index labels.
train = df[msk].reset_index(drop=True)
dev = not_train[:half].reset_index(drop=True)
test = not_train[half:].reset_index(drop=True)

In [None]:
# Build the debug frame from the full (unsplit) Davidson frame; it is written to debug.csv below.
# NOTE(review): presumably a small subset for quick experiments — confirm in preprocessing.make_debug_df.
debug = make_debug_df(df)

In [None]:
# Write each frame to its CSV under `path`, without the index column
for frame, template in [
    (debug, '{}debug.csv'),
    (train, '{}train.csv'),
    (dev, '{}dev.csv'),
    (test, '{}test.csv'),
]:
    frame.to_csv(template.format(path), index=False, encoding='utf-8')

### Zeerak data

In [None]:
# Directory and file of the Zeerak NAACL tweets
path = '../data/zeerak_naacl/'
fname = '{}zeerak_naacl_tweets.csv'.format(path)
# Load the data
# Rows are shuffled once at load time; the fixed random_state makes the
# shuffled row order the same on every run instead of changing per execution.
df2 = pd.read_csv(fname, encoding='utf-8').sample(frac=1, random_state=42)

In [None]:
# First mask out some missing data
# Keep only rows whose label is not a plain float
# (presumably missing labels are read as float NaN — TODO confirm).
msk = df2['label'].map(lambda value: type(value) is not float)
df2 = df2[msk]

In [None]:
# Clean the tweets
df2 = clean_tweets(df2.rename(index=str, columns={'text': 'tweet'}))

# Label cleanup to match the other df format
labels = ['racism', 'sexism', 'none']
# Position of each row's label within `labels`
one_hot_label = [labels.index(name) for name in df2['label']]
# Create the target columns with a placeholder value before filling them
for col in labels:
    df2[col] = -1

# Convert columns to one hot encoding
df2[labels] = one_hot_encode(one_hot_label)

# Clean hate speech terms, and extract slurs
df2['tweet'] = df2['tweet'].apply(clean_slurs_in_context)
df2['slurs'] = df2['tweet'].apply(extract_slurs)

# Re-order the DataFrame, and drop some columns
column_order = (
    ['tweet', 'label', 'mentions', 'hashtags', 'slurs', 'original_tweet']
    + labels
    + ['tweet_id', 'user_screen_name']
)
df2 = df2[column_order]

In [None]:
# Make a test/dev/train split
# Each row goes to train with probability train_perc, so the split sizes are
# approximate; the remaining rows are halved into dev and test.
train_perc = 0.80
# Draw the mask from a dedicated, seeded generator so the same row positions
# are selected on every run (the global NumPy RNG state is left untouched).
rng = np.random.RandomState(42)
msk = rng.rand(len(df2)) < train_perc
not_train = df2[~msk]
half = len(not_train) // 2

# Give all three splits a fresh 0..n-1 index. Rebinding the names (instead of
# an in-place reset on frames selected from `df2`) also covers `dev`, which
# previously kept its original index labels.
train = df2[msk].reset_index(drop=True)
dev = not_train[:half].reset_index(drop=True)
test = not_train[half:].reset_index(drop=True)

In [None]:
# Build the debug frame from the full (unsplit) Zeerak frame, passing this dataset's
# label columns via `cols`; it is written to debug.csv below.
# NOTE(review): presumably a small subset for quick experiments — confirm in preprocessing.make_debug_df.
debug = make_debug_df(df2, cols=labels)

In [None]:
# Write each frame to its CSV under `path`, without the index column
for frame, template in [
    (debug, '{}debug.csv'),
    (train, '{}train.csv'),
    (dev, '{}dev.csv'),
    (test, '{}test.csv'),
]:
    frame.to_csv(template.format(path), index=False, encoding='utf-8')

### Wiki talk data

In [5]:
# Directory of the Wiki talk data (rebinds `path`, previously the Zeerak directory)
path = '../data/wiki_talk/'
# Labeled comments file inside that directory
fname = '{}labeled_data.csv'.format(path)

In [6]:
# Load the data
# NOTE(review): this rebinds `df`, so the Davidson frame loaded earlier in the
# notebook is no longer reachable under that name once this cell has run.
df = pd.read_csv(fname, encoding='utf-8')  # unlike the other datasets: no index_col and no shuffle

In [7]:
# Preview the first rows of the Wiki talk frame
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Peek at the raw comment text. The frame has no 'com' column (see the
# df.head() preview), so the original lookup could only raise KeyError;
# 'comment_text' is the only column with that prefix.
df['comment_text'].head()

In [10]:
# Scratch check of str.title(), which the summary cell below uses for its per-label headings
'toxic'.title()

'Toxic'

In [43]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


def print_examples(mask, n=5):
    """Print up to ``n`` randomly sampled comments from ``df`` selected by ``mask``.

    Args:
        mask: Boolean row mask over ``df`` (one entry per row).
        n: Maximum number of examples to print.
    """
    examples = df[mask]['comment_text']
    # Series.sample raises ValueError when asked for more rows than are
    # available, so cap the request for sparsely populated categories.
    for ex in examples.sample(min(n, len(examples))):
        # Flatten multi-line comments so each example stays on one line
        print('\t', ex.replace('\n', ' '))


# A comment is neutral when none of the label columns is set.
neutral_msk = np.ones(len(df), dtype=bool)
for l in labels:
    neutral_msk = neutral_msk & ~np.array([*map(bool, df[l])], dtype=bool)
# Only short comments (fewer than 60 characters) are shown as examples.
msk2 = np.array([len(text) < 60 for text in df['comment_text']], dtype=bool)

# Print total
print("Total comments: {}".format(len(df)))

# Print neutral
print("Neutral comments: {}".format(sum(neutral_msk)))
print_examples(neutral_msk & msk2)

# Print the count and a few short examples for every label
for l in labels:
    print("{} comments: {}".format(l.title(), sum(df[l])))
    msk1 = np.array([*map(bool, df[l])], dtype=bool)
    print_examples(msk1 & msk2)

Total comments: 159571
Neutral comments: 143346
	 And which country would gain free benifit?
	 (I'm lonely and sad.)
	 Some pictures for Miranda...
	 thank u   u seen rele nice 2 give me a second chance!!
	 Talk Talk to me please!
Toxic comments: 15294
	 fuck u to buddy, i know that was thomas, i aint stupid
	 none of us are perfect ok, get off me
	 Because you touch yourself at night.
	 whoa   you are a big fat idot, stop spamming my userspace
	 2013 (UTC) He is obviously homosexual   17:39, 3 August
Severe_Toxic comments: 1595
	 FUCK YOU   FUCK YOU ASSHOLE
	 regarding you being an asshole go fuck your mother.
	 fuck you faggot  fuck you faggot
	 fucK of ff f f f f f f f f
	 FUCK YOU ALL!   GO TO HELL!
Obscene comments: 8449
	 Fuck you Juliancolton.
	 Ill photo shop a dick in his illuminati mouth
	 Go fuck yourself traitor. Thank you.
	 Fuck you asshole. Allahu Akbar.
	 deleting good articles fuck bag
Threat comments: 478
	 Sitush is an asshole...die you dog
	 Loganberry   i will kill

### Looking at number of hatebase terms in our data

In [None]:
# Frequency of every extracted slur across the Zeerak tweets (df2), most common first
Counter(slur for found in df2['slurs'] for slur in found).most_common()

In [None]:
# Frequency of every extracted slur in `df`, most common first.
# NOTE(review): `df` is rebound to the Wiki talk frame in the section above, and
# that frame's preview shows no 'slurs' column — on a top-to-bottom run this
# lookup would fail. It only works while `df` still holds the Davidson frame;
# run it before the Wiki talk section or keep the Davidson frame under its own name.
Counter([b for a in df['slurs'] for b in a]).most_common()