In [1]:
# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell executes so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [39]:
import pandas as pd
import numpy as np
from collections import Counter
from nltk import word_tokenize, pos_tag
from tqdm import tqdm, tqdm_notebook

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

In [69]:
# Choose the dataset directory by leaving exactly one `path` line uncommented.
#path = '../data/davidson/'
path = '../data/zeerak_naacl/'
# path = '../data/wiki_talk/'
# debug = pd.read_csv('{}debug.csv'.format(path), encoding='utf-8')

# Each split is stored as `<split>.csv` inside the selected directory.
train, dev, test = (
    pd.read_csv(f'{path}{split_name}.csv', encoding='utf-8')
    for split_name in ('train', 'dev', 'test')
)

In [72]:
# Sanity check: the distinct label values currently present in the test split.
set(test['label'])

{'hate_speech', 'none'}

In [31]:
# Fix labels for davidson: fold 'offensive_language' into 'neither' and leave
# every other label value exactly as it is.
for split_df in (train, test):
    split_df['label'] = [
        'neither' if label == 'offensive_language' else label
        for label in split_df['label']
    ]

In [71]:
# Fix labels for zeerak: keep 'none' and collapse every other label into
# 'hate_speech', giving a binary task.
#
# The relabelling is guarded on the selected dataset directory. Run on any
# other dataset it would turn every label that is not literally 'none'
# (e.g. davidson's 'neither') into 'hate_speech', leaving a single class.
# With the guard, executing the notebook top to bottom is safe whichever
# `path` is active.
if 'zeerak' in path:
    new_train_labels = [l if l == 'none' else 'hate_speech'
                        for l in train['label']]
    new_test_labels = [l if l == 'none' else 'hate_speech'
                       for l in test['label']]
    train['label'] = new_train_labels
    test['label'] = new_test_labels

In [73]:
# Bag-of-words features: the vocabulary is learned from the training tweets
# only, and the same fitted vectorizer then encodes the test tweets.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train['tweet'])
X_test = vectorizer.transform(test['tweet'])

# Targets are the raw label strings as numpy arrays.
y_train = train['label'].values
y_test = test['label'].values

In [78]:
# Fit a logistic-regression baseline on the bag-of-words features and report
# support-weighted precision/recall/F1 plus accuracy for both splits.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Training-set metrics (fit quality, not generalisation).
# With average='weighted', recall is arithmetically equal to accuracy.
y_train_pred = clf.predict(X_train)
p, r, f1, s = precision_recall_fscore_support(y_train, y_train_pred, average='weighted')
acc = accuracy_score(y_train, y_train_pred)
print(f"Training\n\tPrecision: {p:.2f}\n\tRecall: {r:.2f}\n\tF1-Score: {f1:.2f}\n\tAccuracy: {acc:.2f}")

# Held-out test metrics. The heading previously read "Training" here as well,
# which made the two reports indistinguishable in the output.
y_test_pred = clf.predict(X_test)
p, r, f1, s = precision_recall_fscore_support(y_test, y_test_pred, average='weighted')
acc = accuracy_score(y_test, y_test_pred)
print(f"Test\n\tPrecision: {p:.2f}\n\tRecall: {r:.2f}\n\tF1-Score: {f1:.2f}\n\tAccuracy: {acc:.2f}")

Training
	Precision: 0.93
	Recall: 0.93
	F1-Score: 0.93
	Accuracy: 0.93
Training
	Precision: 0.84
	Recall: 0.84
	F1-Score: 0.84
	Accuracy: 0.84


In [79]:
# Empty frame that will hold one row per vocabulary word with its model weight.
coef_df = pd.DataFrame(columns=['word', 'coef'])

In [80]:
# One weight per feature column, taken from the first row of the coefficient
# matrix.
# NOTE(review): for a binary model sklearn sorts classes_ and coef_[0] scores
# classes_[1], so with labels {'hate_speech', 'none'} a positive weight would
# push towards 'none' -- confirm against clf.classes_ before interpreting signs.
coef_df['coef'] = clf.coef_[0]

# vocabulary_ maps word -> feature column index, so ordering the words by that
# index lines each word up with its coefficient row. A single column
# assignment replaces the former per-row `.loc` write inside a Python loop.
coef_df['word'] = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

A Jupyter Widget




In [81]:
# Order rows from the most negative to the most positive coefficient, then
# renumber them 0..n-1 so the row index reflects the rank.
coef_df = coef_df.sort_values(by='coef', ascending=True).reset_index(drop=True)

In [67]:
# Davidson output
#coef_df.to_csv('../output/davidson_log_reg_coef.csv', encoding='utf-8', index=False)

In [83]:
# Zeerak output: write the word/coefficient table to CSV without the row index.
# The filename is hard-coded, so run this cell only when `path` selects the
# zeerak data; otherwise another dataset's weights land in the zeerak file.
coef_df.to_csv('../output/zeerak_log_reg_coef.csv', encoding='utf-8', index=False)

### Error Analysis

In [83]:
# Locations of the two test sets and of the model output directories whose
# predictions are analysed below.
d_path, z_path = '../data/davidson/', '../data/zeerak_naacl/'
o2_path, o3_path = (
    '../output/davidson_2way_gradrev_0.5/',
    '../output/davidson-zeerak_davidson_2018-04-25T10-55/',
)

# Gold test data comes from CSV, saved predictions from pickle files.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
d_test = pd.read_csv(d_path + 'test.csv', encoding='utf-8')
d_pred = pd.read_pickle(o2_path + 'test_preds.pkl')

z_test = pd.read_csv(z_path + 'test.csv', encoding='utf-8')
z_pred = pd.read_pickle(o3_path + 'test_preds.pkl')

In [35]:
# Attach the saved predictions to each test set, keep only the columns needed
# for error analysis, binarise the gold labels to 0/1 and flag correct rows.
#
# The label column is rebuilt with a vectorised comparison and assigned as a
# whole column. Writing a Python list through `.loc[:, 'label']` sets values in
# place on recent pandas, which keeps the original object dtype and makes
# sklearn metrics reject the column as an "unknown" target type.
#
# NOTE: this cell must run on freshly loaded frames. Running it twice compares
# the already-binarised labels against the label strings.

# Davidson: 'hate_speech' is the positive class (1), everything else is 0.
d_test['pred'] = d_pred
d_test = d_test[['tweet', 'label', 'pred']].copy()
d_test['label'] = (d_test['label'] == 'hate_speech').astype(int)
d_test['correct'] = (d_test['label'] == d_test['pred']).astype(int)

# Zeerak: 'none' is the negative class (0), every other label is 1.
z_test['pred'] = z_pred
z_test = z_test[['tweet', 'label', 'pred']].copy()
z_test['label'] = (z_test['label'] != 'none').astype(int)
z_test['correct'] = (z_test['label'] == z_test['pred']).astype(int)

In [41]:
# Class balance of the binarised Davidson gold labels (0 vs 1).
Counter(d_test['label'])

Counter({0: 2318, 1: 146})

In [43]:
# Print the four confusion-matrix cells for each evaluated test set.
# ravel() flattens the 2x2 matrix row by row, giving tn, fp, fn, tp.
for heading, scored in (("Davidson 2-way:", d_test), ("Davidson-Zeerak:", z_test)):
    print(heading)
    tn, fp, fn, tp = confusion_matrix(scored.label, scored.pred).ravel()
    print(f"\ttn: {tn}\n\tfp: {fp}\n\tfn: {fn}\n\ttp: {tp}")

Davidson 2-way:
	tn: 2246
	fp: 72
	fn: 94
	tp: 52
Davidson-Zeerak:
	tn: 990
	fp: 95
	fn: 426
	tp: 59


In [None]:
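# Goal: pull out 10 examples each of false positives (fp) and false negatives (fn) per dataset.
# The cells below export every fp/fn row; the examples are then picked from those files.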

In [66]:
def get_error_samples(df):
    """Split the misclassified rows of `df` into false positives and false negatives.

    `df` must provide a `correct` column (0 marks a wrong prediction) and a
    `pred` column holding the predicted class (1 or 0).

    Returns a tuple `(fp_ex, fn_ex)` of independent copies that keep the
    original row index: wrong rows predicted 1, then wrong rows predicted 0.
    """
    wrong = df['correct'] == 0
    predicted_positive = df['pred'] == 1
    predicted_negative = df['pred'] == 0

    false_positives = df.loc[wrong & predicted_positive].copy()
    false_negatives = df.loc[wrong & predicted_negative].copy()
    return false_positives, false_negatives

In [67]:
# False-positive / false-negative frames for the Davidson and Zeerak test sets.
(d_fp, d_fn), (z_fp, z_fn) = (
    get_error_samples(scored) for scored in (d_test, z_test)
)

In [81]:
# Output directory and filename templates for the exported error samples.
# In each template the first `{}` takes the output directory and the second
# takes the error type ('fp' or 'fn').
out_path = '../output/error_analysis/'
d_fname = '{}davidson_grad_rev_{}.csv'
z_fname = '{}davidson-zeerak_{}.csv'

In [82]:
# Write each error frame to its own CSV (no index column), in the order
# Davidson fp, Davidson fn, Zeerak fp, Zeerak fn.
for errors, template, kind in (
    (d_fp, d_fname, 'fp'),
    (d_fn, d_fname, 'fn'),
    (z_fp, z_fname, 'fp'),
    (z_fn, z_fname, 'fn'),
):
    errors.to_csv(template.format(out_path, kind), encoding='utf-8', index=False)

In [73]:
# Inspect one Davidson false negative, looked up by its row label (662),
# which is the index the row had in the test frame.
d_fn.loc[662, 'tweet']

'every spic cop in # losangeles has raped or has thought about raping a white woman that s why those half_breeds get jobs as pigs'

In [64]:
# Inspect one Davidson false positive, looked up by its row label (2161),
# which is the index the row had in the test frame.
d_fp.loc[2161].tweet

'<MENTION> 1 2 3 4 how many niggers are in my store i knowwwww your stealing 😂😂😂 that vine still be having me weak'

In [61]:
# Zeerak false positives whose tweet contains the literal, case-sensitive
# substring 'africa'. The vectorised string match replaces a Python-level
# `in` test per row, and na=False treats a missing tweet as "no match"
# instead of raising on a non-string value.
msk = z_fp.tweet.str.contains('africa', regex=False, na=False)
z_fp[msk].tweet

1501    # mkr the fat south africans are feeling hopeful
Name: tweet, dtype: object