In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import pandas as pd
import numpy as np
from collections import Counter

# Local imports
from preprocessing import clean_tweets, one_hot_encode, make_debug_df

### First load and clean the hatebase terms

In [89]:
# Clean the hatebase slurs
def clean_slurs(text):
    """Lowercase and underscore join slur words."""
    return text.strip().lower().replace(' ', '_')

slurs = pd.read_csv('data/original_hatebase_slurs.txt', header=None)

# Clean slurs
slur_list = [*map(lambda s: s.lower(), slurs[0].values)]
cleaned_slurs = [*map(clean_slurs, slur_list)]
pluralize_slurs = [s + end for s in cleaned_slurs for end in ['s', 'es']]
full_slur_list = sorted(pluralize_slurs + cleaned_slurs)

# Outputs
slur_map = {s: cs for s, cs in zip(slur_list, cleaned_slurs) if s != cs}
out_slurs = pd.DataFrame(full_slur_list)
out_slurs.to_csv('data/hatebase_slurs.txt', index=None, header=None, encoding='utf-8')

In [90]:
def clean_slurs_in_context(text):
    """Replace slurs with their cleaned versions."""
    for k, v in slur_map.items():
        text = text.replace(k, v)
    return text

def extract_slurs(text):
    """Get a list of all slurs used in the text."""
    text = text.split(' ')
    all_slurs = []
    for s in full_slur_list:
        if s in text:
            all_slurs += [s]
    return all_slurs

### Davidson et al data

In [91]:
path = 'data/davidson/'
fname = '{}labeled_data.csv'.format(path)

In [92]:
# Load the data
df = pd.read_csv(fname, encoding='utf-8', index_col='Unnamed: 0').sample(frac=1)
label_map = {0: 'hate_speech', 1: 'offensive_language', 2: 'neither'}

In [93]:
# Clean the tweets
df = clean_tweets(df)

# Convert columns to one hot encoding
df[['hate_speech', 'offensive_language', 'neither']] = \
    one_hot_encode(df['class'])
df['label'] =df['class'].apply(lambda c: label_map[c])
    
# Clean hate speech terms, and extract slurs
df['tweet'] = df['tweet'].apply(clean_slurs_in_context)
df['slurs'] = df['tweet'].apply(extract_slurs)
    
# Re-order the DataFrame, and drop some columns
df = df[['tweet', 'label', 'mentions', 'hashtags', 'slurs', 'original_tweet',
         'hate_speech', 'offensive_language', 'neither']]

In [97]:
# Make a test/dev/train split
train_perc = 0.80
msk = np.random.rand(len(df)) < train_perc
train = df[msk]
not_train = df[~msk]
half = int(len(not_train) / 2)
dev = not_train[:half]
test = not_train[half:]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [98]:
debug = make_debug_df(df)

In [99]:
debug.to_csv('{}debug.csv'.format(path), index=False, encoding='utf-8')
train.to_csv('{}train.csv'.format(path), index=False, encoding='utf-8')
dev.to_csv('{}dev.csv'.format(path), index=False, encoding='utf-8')
test.to_csv('{}test.csv'.format(path), index=False, encoding='utf-8')

### Zeerak data

In [100]:
path = 'data/zeerak_naacl/'
fname = '{}zeerak_naacl_tweets.csv'.format(path)
# Load the data
df2 = pd.read_csv(fname, encoding='utf-8').sample(frac=1)

In [101]:
# First mask out some missing data
msk = ~ df2['label'].apply(lambda t: type(t) is float)
df2 = df2[msk]

In [102]:
# Clean the tweets
df2.rename(index=str, columns={'text': 'tweet'}, inplace=True)
df2 = clean_tweets(df2)

# Label cleanup to match the other df format
labels = ['racism', 'sexism', 'none']
one_hot_label = [labels.index(l) for l in df2['label']]
for l in labels:
    df2[l] = -1

# Convert columns to one hot encoding
df2[['racism', 'sexism', 'none']] = \
    one_hot_encode(one_hot_label)
    
# Clean hate speech terms, and extract slurs
df2['tweet'] = df2['tweet'].apply(clean_slurs_in_context)
df2['slurs'] = df2['tweet'].apply(extract_slurs)
    
# Re-order the DataFrame, and drop some columns
df2 = df2[['tweet', 'label', 'mentions', 'hashtags', 'slurs', 'original_tweet',
           'racism', 'sexism', 'none', 'tweet_id', 'user_screen_name']]

In [109]:
# Make a test/dev/train split
train_perc = 0.80
msk = np.random.rand(len(df2)) < train_perc
train = df2[msk]
not_train = df2[~msk]
half = int(len(not_train) / 2)
dev = not_train[:half]
test = not_train[half:]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [110]:
debug = make_debug_df(df2, cols=labels)

In [111]:
debug.to_csv('{}debug.csv'.format(path), index=False, encoding='utf-8')
train.to_csv('{}train.csv'.format(path), index=False, encoding='utf-8')
dev.to_csv('{}dev.csv'.format(path), index=False, encoding='utf-8')
test.to_csv('{}test.csv'.format(path), index=False, encoding='utf-8')

### Looking at number of hatebase terms in our data

In [115]:
Counter([b for a in df2['slurs'] for b in a]).most_common()

[('bitch', 94),
 ('idiot', 79),
 ('pancakes', 37),
 ('idiots', 33),
 ('cunt', 29),
 ('bitches', 22),
 ('eggs', 21),
 ('apple', 18),
 ('property', 17),
 ('trash', 16),
 ('egg', 14),
 ('hoes', 12),
 ('jihadi', 11),
 ('cunts', 11),
 ('jihadis', 10),
 ('fairy', 7),
 ('bubble', 7),
 ('pussy', 6),
 ('charlie', 6),
 ('retarded', 5),
 ('pancake', 5),
 ('tan', 5),
 ('mock', 5),
 ('af', 4),
 ('skinny', 4),
 ('cracker', 4),
 ('twat', 4),
 ('fruit', 3),
 ('bogan', 3),
 ('ghosts', 3),
 ('shade', 3),
 ('dhimmi', 3),
 ('shades', 3),
 ('banana', 3),
 ('ho', 3),
 ('birds', 3),
 ('dhimmis', 3),
 ('queens', 3),
 ('ghost', 3),
 ('bird', 3),
 ('frog', 3),
 ('jocks', 2),
 ('skip', 2),
 ('uncivilized', 2),
 ('zebra', 2),
 ('redneck', 2),
 ('coloured', 2),
 ('shines', 2),
 ('bucks', 2),
 ('colored', 2),
 ('shine', 2),
 ('hoe', 2),
 ('nigga', 2),
 ('bananas', 2),
 ('oriental', 2),
 ('retards', 2),
 ('abo', 2),
 ('queer', 2),
 ('spikes', 2),
 ('apes', 1),
 ('spike', 1),
 ('monkey', 1),
 ('faggots', 1),
 ('peppe

In [116]:
Counter([b for a in df['slurs'] for b in a]).most_common()

[('bitch', 7851),
 ('hoes', 4458),
 ('bitches', 2982),
 ('pussy', 2056),
 ('hoe', 1823),
 ('nigga', 1115),
 ('trash', 1010),
 ('niggas', 733),
 ('faggot', 427),
 ('bird', 371),
 ('charlie', 283),
 ('retarded', 260),
 ('ghetto', 259),
 ('niggah', 253),
 ('yellow', 244),
 ('yankees', 230),
 ('cunt', 229),
 ('nigger', 225),
 ('fag', 220),
 ('birds', 219),
 ('ho', 187),
 ('colored', 174),
 ('nicca', 160),
 ('monkey', 153),
 ('niccas', 111),
 ('retard', 108),
 ('faggots', 108),
 ('nig', 101),
 ('nigguh', 94),
 ('white_trash', 94),
 ('niggers', 88),
 ('brownies', 87),
 ('redneck', 86),
 ('af', 85),
 ('queer', 83),
 ('mock', 78),
 ('dyke', 72),
 ('fags', 67),
 ('crackers', 65),
 ('jihadi', 64),
 ('oreo', 60),
 ('cracker', 60),
 ('yankee', 59),
 ('oreos', 54),
 ('tranny', 49),
 ('redskins', 49),
 ('coon', 48),
 ('teabagger', 46),
 ('sole', 46),
 ('skinny', 46),
 ('teabaggers', 46),
 ('twat', 44),
 ('cunts', 43),
 ('jihadis', 41),
 ('slope', 40),
 ('brownie', 39),
 ('retards', 39),
 ('hillbilly