Open question: how should misspellings of curse words be handled?  

When building the vocabulary, consider punctuation: should the text be tokenized first? Right now the code just splits on whitespace. Moses probably has a tokenizer for this.

In [71]:
from time import time

import numpy as np
from scipy.sparse import csr_matrix, vstack

# Path to the OLID v1.0 training file (tab-separated); opened by the loading cell.
data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
# Path to the bad-word list consulted by the rule-based baseline.
bad_words_location = 'data/trimmed-bad-words.txt'
# Seed NumPy's global RNG; the shuffle and the weight initialisation both draw from it.
np.random.seed(1234) # help reproducibility

In [72]:
# Load the OLID training set.
# Column 1 of each row is the tweet text, column 2 is the label ('NOT' or anything else).
# y == 0 if not offensive
# y == 1 if offensive
# The encoding is given explicitly so that tweet text is decoded the same way
# on every platform instead of depending on the locale default.
with open(data, encoding='utf-8') as f:
    next(f, None)  # skip the first (header) line
    # Blank lines are skipped; they would otherwise produce a one-field row
    # and raise IndexError below.
    raw = [line.strip().split('\t') for line in f if line.strip()]
x_raw = [r[1] for r in raw]
y = np.array([0 if r[2] == 'NOT' else 1 for r in raw])

# Load the bad-word list, one entry per line. The first line is skipped, as in
# the original code (presumably a header -- TODO confirm against the file).
# rstrip('\n') is used instead of row[:-1] so that a final line without a
# trailing newline does not lose its last character.
with open(bad_words_location, encoding='utf-8') as f:
    bad_words = [row.rstrip('\n') for row in f.readlines()[1:]]

# TODO: parse the Jigsaw training data. Nothing is read from this file yet.
with open('data/jigsaw/train.csv') as f:
    pass
    # can't use .readlines() because that splits by '\n'. you need to read by character
    # (the standard-library csv module may be an option -- to be confirmed)
    # raw = [x.strip().split(',') for x in f.readlines()[1:-1]]

In [8]:
# Build vocab: every distinct whitespace-delimited token gets an integer id,
# assigned in order of first appearance.
id2word = {}
word2id = {}
for tweet in x_raw:
    for word in tweet.split():
        if word not in word2id:
            word_id = len(word2id)  # next unused id
            word2id[word] = word_id
            id2word[word_id] = word

# Build bag of words as (value, row, col) triplets.
# The value list is called `values` rather than `data` so that the dataset path
# stored in `data` is not overwritten and the loading cell can be re-run.
bias_col = len(word2id)  # one extra column, after all word columns, holds the bias
values = []
rows = []
cols = []
for row, tweet in enumerate(x_raw):
    # Per-tweet token counts keyed by word id.
    counts = {}
    for word in tweet.split():
        word_id = word2id[word]
        counts[word_id] = counts.get(word_id, 0) + 1

    for word_id, freq in counts.items():
        values.append(freq)
        rows.append(row)
        cols.append(word_id)

    # bias: constant 1 feature for every row
    values.append(1)
    rows.append(row)
    cols.append(bias_col)

# The shape is given explicitly instead of being inferred from the largest indices.
x = csr_matrix((values, (rows, cols)), shape=(len(x_raw), bias_col + 1))

In [5]:
# Shuffle x, y and x_raw together.
# One explicit permutation indexes all three, so features, labels and raw
# tweets stay aligned by construction. The previous version shuffled an index
# array and `y` separately while replaying the RNG state, and left `x_raw` in
# its original order, so anything computed from `x_raw` afterwards no longer
# lined up with `y`.
perm = np.random.permutation(x.shape[0])
x = x[perm, :]  # sparse matrices are reordered by row indexing
y = y[perm]
x_raw = [x_raw[j] for j in perm]

In [51]:
from sklearn import metrics

def sigmoid(x):
    """Element-wise logistic function of a NumPy array.

    Positive and non-positive entries are evaluated with different but
    algebraically equivalent formulas so that ``np.exp`` is only ever called
    on non-positive arguments.

    Returns a new float array with the same shape as ``x``.
    """
    positive = x > 0
    non_positive = x <= 0
    out = np.zeros(x.shape)
    # 1 / (1 + e^-x): the exponent is negative here, so exp cannot overflow.
    out[positive] = 1. / (1. + np.exp(-x[positive]))
    # e^x / (e^x + 1): same value, written so the exponent stays non-positive.
    out[non_positive] = np.exp(x[non_positive]) / (np.exp(x[non_positive]) + 1)
    return out

def cost(w, x, y):
    """ The cost function for logistic regression

    Returns the mean cross-entropy of labels ``y`` (0 or 1) under the
    logistic model with scores ``z = x @ w``.

    With h = sigmoid(z), the per-sample loss -y*log(h) - (1-y)*log(1-h)
    equals log(1 + e^z) - y*z. The second form is evaluated here because the
    first yields nan/inf once h rounds to exactly 0 or 1 (0 * log(0)), which
    happens for large |z|.
    """
    z = x @ w
    # np.logaddexp(0, z) == log(e^0 + e^z) == log(1 + e^z), computed stably.
    return np.average(np.logaddexp(0, z) - y * z)

def gradient_update(w, x, y):
    """ The gradient update for logistic regression

    Computes ``(sigmoid(x @ w) - y) @ x`` divided by the number of rows of
    ``x``, i.e. the gradient term averaged over the samples.
    """
    residual = sigmoid(x @ w) - y
    n_rows = x.shape[0]
    return (residual @ x) / n_rows

def gradient_descent(w, x, y, alpha, iterations, print_iterations):
    """ Batch gradient descent algorithm

    Parameters:
        w: initial weights. Updated with ``-=``, so a NumPy array passed in
            is modified in place; pass a copy to keep the original.
        x, y: training features and labels, forwarded to ``cost`` and
            ``gradient_update``.
        alpha: base learning rate. It is multiplied by the number of rows of
            ``x`` before use.
        iterations: number of update steps to run.
        print_iterations: print the current cost every this many steps.
            A falsy value (0 or None) disables printing instead of raising
            ZeroDivisionError on the modulo.

    Returns:
        The weights after ``iterations`` updates.
    """
    step = alpha * x.shape[0]
    for i in range(iterations):
        if print_iterations and i % print_iterations == 0:
            print('%d) cost: %f' % (i, cost(w, x, y)))
        w -= step * gradient_update(w, x, y)
    return w

def predict(w, x):
    """ Predict whether the label is 0 or 1 using learned logistic regression parameters

    Returns a pair ``(probabilities, labels)``: the sigmoid of the scores
    ``x @ w``, and an integer array that is 1 where the score is positive and
    0 elsewhere.
    """
    scores = x @ w
    # Multiplying the boolean mask by 1 turns True/False into 1/0.
    labels = 1 * (scores > 0)
    return sigmoid(scores), labels

def report(y, y_hat, m=('accuracy', 'precision', 'recall', 'f1')):
    """Compute the requested classification scores for predictions ``y_hat``.

    Parameters:
        y: true labels.
        y_hat: predicted labels.
        m: collection of metric names to compute, drawn from 'accuracy',
            'precision', 'recall' and 'f1'. Defaults to all four. The default
            is a tuple rather than a list so that no mutable object is shared
            between calls.

    Returns:
        A list of scores. The order is always accuracy, precision, recall, f1
        (restricted to the names present in ``m``), whatever order ``m``
        lists them in.
    """
    scorers = (
        ('accuracy', metrics.accuracy_score),
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score),
        ('f1', metrics.f1_score),
    )
    return [score(y, y_hat) for name, score in scorers if name in m]

In [75]:
# k-fold cross validation
# Each of the k contiguous chunks is held out once for validation while the
# model is trained from a fresh random initialisation on the remaining rows.
# Note: when the row count is not divisible by k, the trailing
# x.shape[0] % k rows are never part of a validation chunk.
k = 5
chunk_size = x.shape[0] // k
alpha = 0.00005
iterations = 10000
print_iterations = 1000
total_acc = 0
total_p = 0
total_r = 0
total_f1 = 0
for i in range(k):
    start = time()
    initial_w = np.random.rand(x.shape[1]) - 0.5  # uniform in [-0.5, 0.5)
    val_start = i * chunk_size
    val_end = (i + 1) * chunk_size
    x_train = vstack((x[:val_start], x[val_end:]))
    y_train = np.concatenate((y[:val_start], y[val_end:]))
    x_val = x[val_start:val_end]
    y_val = y[val_start:val_end]
    # gradient_descent updates its weight argument in place, hence the copy.
    w = gradient_descent(initial_w.copy(), x_train, y_train, alpha, iterations, print_iterations)
    _, labels = predict(w, x_train)
    a, p, r, f = report(y_train, labels)
    print('train acc:  %.4f%%' % (a * 100))
    _, labels = predict(w, x_val)
    a, p, r, f = report(y_val, labels)
    total_acc += a
    total_p += p
    total_r += r
    total_f1 += f
    print('val acc: %.4f%%' % (a * 100))
    print('took %.2fs' % (time() - start))
# The totals are fractions in [0, 1]; scale by 100 so the values match the
# '%' sign in the format string, as the per-fold lines above already do.
print('average validation accuracy:  %.4f%%' % (100 * total_acc / k))
print('average validation precision: %.4f%%' % (100 * total_p / k))
print('average validation recall:    %.4f%%' % (100 * total_r / k))
print('average validation f1:        %.4f%%' % (100 * total_f1 / k))

0) cost: 0.871537
1000) cost: 0.509568
2000) cost: 0.434725
3000) cost: 0.387201
4000) cost: 0.352640
5000) cost: 0.325592
6000) cost: 0.303499
7000) cost: 0.284921
8000) cost: 0.268967
9000) cost: 0.255045
train acc:  92.7587%
val acc: 72.1299%
took 25.09s
0) cost: 0.959881
1000) cost: 0.507994
2000) cost: 0.434019
3000) cost: 0.386944
4000) cost: 0.352466
5000) cost: 0.325416
6000) cost: 0.303280
7000) cost: 0.284640
8000) cost: 0.268614
9000) cost: 0.254616
train acc:  92.9569%
val acc: 71.7145%
took 25.17s
0) cost: 0.825470
1000) cost: 0.508856
2000) cost: 0.434320
3000) cost: 0.386777
4000) cost: 0.352123
5000) cost: 0.324980
6000) cost: 0.302817
7000) cost: 0.284194
8000) cost: 0.268213
9000) cost: 0.254276
train acc:  92.9286%
val acc: 72.3943%
took 24.16s
0) cost: 1.033680
1000) cost: 0.516967
2000) cost: 0.439761
3000) cost: 0.390963
4000) cost: 0.355320
5000) cost: 0.327794
6000) cost: 0.305348
7000) cost: 0.286468
8000) cost: 0.270254
9000) cost: 0.256107
train acc:  92.5982

In [73]:
# Rule-based baseline: a tweet is labelled offensive (1) when any of its
# whitespace-delimited tokens appears verbatim in the bad-word list,
# otherwise not offensive (0).
# The list is converted to a set once so each token lookup is a hash lookup
# rather than a scan of the whole list; the resulting labels are unchanged.
bad_word_set = set(bad_words)
predicted = [
    1 if any(word in bad_word_set for word in tweet.split()) else 0
    for tweet in x_raw
]
rule_based_labels = np.array(predicted)

In [74]:
# Class balance: share of offensive (y == 1) and non-offensive labels, in percent.
offensive_fraction = np.sum(y) / y.shape[0]
off = 100 * offensive_fraction
not_off = 100 * (1 - offensive_fraction)
print('Offensive\tNot offensive')
print('%.2f%%\t\t%.2f%%' % (off, not_off))

# One output line per score, in the order report() returns them.
metric_lines = ('accuracy:  %.4f', 'precision: %.4f', 'recall:    %.4f', 'f1:        %.4f')

# Baseline 1: label every tweet as not offensive.
labels = np.zeros(y.shape)
a, p, r, f = report(y, labels)
for template, score in zip(metric_lines, (a, p, r, f)):
    print(template % score)

# Baseline 2: the rule-based labels.
a, p, r, f = report(y, rule_based_labels)
print('\nOffensive if tweet contains a bad word:')
for template, score in zip(metric_lines, (a, p, r, f)):
    print(template % score)

Offensive	Not offensive
33.23%		66.77%
accuracy:  0.6677
precision: 0.0000
recall:    0.0000
f1:        0.0000

Offensive if tweet contains a bad word:
accuracy:  0.7214
precision: 0.7030
recall:    0.2798
f1:        0.4003
