In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from nltk import word_tokenize, pos_tag
from tqdm import tqdm, tqdm_notebook

### Load the data

In [None]:
# Directory of the dataset to analyse. The Davidson directory is kept as a
# commented-out alternative; uncomment it to switch datasets.
#path = '../data/davidson/'
path = '../data/zeerak_naacl/'

# Read train.csv from the selected directory as UTF-8.
train = pd.read_csv(
    filepath_or_buffer='{}train.csv'.format(path),
    encoding='utf-8',
)

In [None]:
# Boolean row masks: rows whose 'none' column equals 1, and all other rows.
none_msk = train['none'].eq(1)
off_msk = ~none_msk

### Do Word Counts

In [None]:
def get_word_counts(df):
    """Count space-separated tokens across every tweet in ``df``.

    All values of ``df['tweet']`` are joined with single spaces and the
    result is split on the single-space character, so consecutive spaces
    yield empty-string tokens that are counted like any other token.

    Returns a ``Counter`` mapping token -> number of occurrences.
    """
    corpus = ' '.join(df['tweet'])
    counts = Counter()
    counts.update(corpus.split(' '))
    return counts

def normalize(df):
    """Add per-class relative-frequency columns to ``df``.

    For each of the 'offensive' and 'none' columns, every value is divided
    by that column's sum and stored in 'offensive_norm' / 'none_norm'.
    ``df`` is modified in place and also returned.
    """
    column_pairs = (
        ('offensive', 'offensive_norm'),
        ('none', 'none_norm'),
    )
    for source, target in column_pairs:
        counts = df[source]
        df[target] = counts / sum(counts)
    return df

def llr(wc1, wc2):
    """Return the negated natural logarithm of ``wc1 / wc2``.

    Accepts scalars as well as numeric numpy arrays / pandas Series, in
    which case the computation is element-wise. The result is negative
    where ``wc1 > wc2`` and positive where ``wc1 < wc2``.
    """
    return -np.log(wc1 / wc2)


def compute_llr(df):
    """Add a 'log_ratio' column to ``df`` and return the frame sorted by it.

    'log_ratio' is ``llr(offensive_norm, none_norm)`` for every row. The
    column is added to ``df`` in place; the returned frame is a copy sorted
    by 'log_ratio' in ascending order.
    """
    # Cast to float first: np.log cannot operate element-wise on
    # object-dtype columns.
    offensive = df['offensive_norm'].astype(float)
    none = df['none_norm'].astype(float)
    df['log_ratio'] = llr(offensive, none)
    return df.sort_values('log_ratio')

In [None]:
# Token counts per class: rows selected by off_msk, then rows selected by
# none_msk.
off_wc = get_word_counts(train.loc[off_msk])
non_wc = get_word_counts(train.loc[none_msk])

In [None]:
# One row per token, one count column per class. Both Counters are aligned
# on the union of their tokens in a single construction step.
wc_df = pd.DataFrame(
    {'offensive': pd.Series(off_wc), 'none': pd.Series(non_wc)},
    columns=['offensive', 'none'],
)

# A token absent from one class gets a count of 1 there, which keeps the
# ratio and its logarithm finite.
wc_df = wc_df.fillna(1)
wc_df['total'] = wc_df['offensive'] + wc_df['none']

wc_df = normalize(wc_df)
wc_df = compute_llr(wc_df)

# Scale each token's log ratio by its total count.
wc_df['weighted_ratio'] = wc_df['log_ratio'] * wc_df['total']

In [None]:
# The 100 tokens with the lowest weighted ratio (ascending sort).
wc_df.sort_values('weighted_ratio').head(100).index.values

In [None]:
# Terms to add to the slur exclusion list; each term appears exactly once.
exclude_terms = [
    'sexist', 'islam', 'women', 'muslims', 'notsexist',
    'mohammed', 'female', 'girls', 'men', 'woman',
    'man', 'prophet', 'religion', 'jews', 'quran', 'girl',
    'slave', 'hatred', 'feminists', 'feminist', 'females',
    'feminism', 'hate', 'rape', 'womenagainstfeminism',
    'questionsformen', 'slavery', 'murdering', 'bigotry',
    'equal', 'slaves', 'christians', 'hindus', 'israel',
    'terrorist', 'islamic', 'barbarity', 'blondes',
]

### Save the exclusion terms

The exclusion terms are saved together with the original Hatebase slurs.

In [None]:
# Hatebase slur list: a headerless file whose first column holds the terms.
path = '../data/'
fname = '{}hatebase_slurs.txt'.format(path)
slurs = pd.read_csv(fname, header=None).iloc[:, 0].values

In [None]:
# Exclusion terms first, followed by the loaded slurs, in a single array.
new_slurs = np.concatenate((exclude_terms, slurs))

In [None]:
# Write the combined terms one per line, without an index column or a
# header row.
new_slurs_df = pd.DataFrame(new_slurs)
new_slurs_df.to_csv('{}hatebase+zeerak_exclude_slurs.txt'.format(path),
                    index=False, header=False, encoding='utf-8')