In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from nltk import word_tokenize, pos_tag
from tqdm import tqdm, tqdm_notebook

### Load data

In [3]:
# Load the slur lexicon. The file is read without a header row and only its
# first column (column 0) is kept, as a NumPy array.
# `slurs` is the module-level lookup that pos_replace / unk_replace /
# slur_remove test tokens against.
# NOTE: `path` is rebound by the dataset-loading cell below.
path = '../data/'
fname = '{}hatebase_slurs.txt'.format(path)
# Alternative lexicon (swap in by uncommenting):
#fname = '{}hatebase+zeerak_exclude_slurs.txt'.format(path)
slurs = pd.read_csv(fname, header=None)[0].values

In [6]:
# Load the dataset splits from the directory named by `path`.
# `path` is rebound here (it previously pointed at the lexicon directory) and
# is reused by the final cell to write train/dev/test back to the same files.
path = '../data/davidson/'
# Alternative dataset directory (swap in by uncommenting):
# path = '../data/zeerak_naacl/'
# NOTE(review): `debug` is loaded but is not part of `all_datasets` below, so
# it receives no slur-masked columns and is not written back — confirm intended.
debug = pd.read_csv('{}debug.csv'.format(path), encoding='utf-8')
train = pd.read_csv('{}train.csv'.format(path), encoding='utf-8')
dev = pd.read_csv('{}dev.csv'.format(path), encoding='utf-8')
test = pd.read_csv('{}test.csv'.format(path), encoding='utf-8')

In [7]:
def safe_join(word_list):
    """Join tokens into one string, separated by single spaces.

    No space is inserted directly after a '<' token or directly before a
    '>' token, so a sequence such as ['<', 'UNK', '>'] comes back as
    '<UNK>' (presumably to undo the tokenizer splitting angle-bracket
    placeholders apart).

    Args:
        word_list: iterable of string tokens.

    Returns:
        The joined string; '' for an empty input.
    """
    pieces = []
    previous = None
    for position, token in enumerate(word_list):
        # A separator is needed unless this is the first token, the previous
        # token opened a bracket, or this token closes one.
        needs_space = position != 0 and previous != '<' and token != '>'
        pieces.append(' ' + token if needs_space else token)
        previous = token
    return ''.join(pieces)

def pos_replace(tweet):
    """Replace slurs with their POS tags.

    The tweet is tokenized with `word_tokenize` and tagged with `pos_tag`;
    every token found in the module-level `slurs` collection is swapped for
    its tag, all other tokens are kept as-is. Tokens are compared exactly as
    produced by the tokenizer (no case folding is applied here).

    Args:
        tweet: raw tweet text.

    Returns:
        The re-joined tweet string (via `safe_join`).
    """
    tagged_tokens = pos_tag(word_tokenize(tweet))
    replaced = []
    for token, tag in tagged_tokens:
        replaced.append(tag if token in slurs else token)
    return safe_join(replaced)

def unk_replace(tweet):
    """Replace slurs with <UNK>.

    The tweet is tokenized with `word_tokenize`; every token found in the
    module-level `slurs` collection becomes the literal string '<UNK>', all
    other tokens are kept as-is.

    Args:
        tweet: raw tweet text.

    Returns:
        The re-joined tweet string (via `safe_join`).
    """
    masked = []
    for token in word_tokenize(tweet):
        masked.append('<UNK>' if token in slurs else token)
    return safe_join(masked)

def slur_remove(tweet):
    """Remove slurs from the sentence.

    The tweet is tokenized with `word_tokenize`; tokens found in the
    module-level `slurs` collection are dropped and the remainder is
    re-joined.

    Args:
        tweet: raw tweet text.

    Returns:
        The re-joined tweet string (via `safe_join`) without slur tokens.
    """
    kept = []
    for token in word_tokenize(tweet):
        if token in slurs:
            continue
        kept.append(token)
    return safe_join(kept)
    

In [8]:
all_datasets = {'train': train, 'test': test, 'dev': dev}
new_cols = ['tweet_unk_slur', 'tweet_no_slur', 'tweet_pos_slur']

# Add three slur-masked variants of every tweet to each split. The frames are
# modified in place, so `train`, `test` and `dev` gain the columns in `new_cols`.
for split_name, frame in all_datasets.items():
    tqdm.write('Processing {}, length: {}'.format(split_name, len(frame)))
    # Create the target columns first (presumably so the multi-column
    # assignment below always has existing columns to write into).
    for column in new_cols:
        frame[column] = ''
    # One row per tweet; the order of the three values matches `new_cols`:
    # <UNK>-masked, slur-removed, POS-tag-masked.
    variant_rows = [
        [unk_replace(text), slur_remove(text), pos_replace(text)]
        for text in tqdm_notebook(frame['tweet'].values)
    ]
    frame[new_cols] = variant_rows

Processing train, length: 19856


A Jupyter Widget


Processing test, length: 2464


A Jupyter Widget


Processing dev, length: 2463


A Jupyter Widget




In [9]:
# Write the augmented splits back to disk.
# WARNING: `path` is the directory the splits were loaded from, so these calls
# overwrite the original train/dev/test CSVs with versions that carry the
# extra slur-masked columns.
# index=None is presumably meant to omit the row index from the files — TODO confirm.
train.to_csv('{}train.csv'.format(path), index=None, encoding='utf-8')
dev.to_csv('{}dev.csv'.format(path), index=None, encoding='utf-8')
test.to_csv('{}test.csv'.format(path), index=None, encoding='utf-8')