In [None]:
import csv
from time import time

import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
from scipy.sparse import csr_matrix, vstack

# Single tokenizer instance reused for every text source loaded in this notebook.
tokenizer = TweetTokenizer()
# Relative paths of the input files read by the loading cell.
olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
kaggle_data = 'data/jigsaw/train.csv'
bad_words_location = 'data/trimmed-bad-words.txt'

# Seed NumPy's global RNG: the shuffling and the random weight
# initialisation further down both draw from it.
np.random.seed(1234) # help reproducibility

In [None]:
# Label convention used throughout the notebook:
#   y == 0 -> not offensive
#   y == 1 -> offensive
start = time()

# OLID training file: tab-separated with one header line. Field 1 of each row
# is tokenized as the text; field 2 is compared against 'NOT' for the label.
# An explicit encoding keeps decoding independent of the platform default.
with open(olid_data, encoding='utf-8') as f:
    next(f, None)  # skip the header line (no error on an empty file)
    raw = [line.strip().split('\t') for line in f if line.strip()]
x_raw = [tokenizer.tokenize(r[1]) for r in raw]
y = np.array([0 if r[2] == 'NOT' else 1 for r in raw])

# Jigsaw CSV: field 1 is tokenized as the text; a row is labelled 1 as soon as
# any field from index 2 onward differs from '0'.
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open(kaggle_data, encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader, None)  # skip the header row instead of tokenizing it and dropping it later
    kaggle_x_raw = []
    kaggle_y = []
    for r in reader:
        kaggle_x_raw.append(tokenizer.tokenize(r[1]))
        kaggle_y.append(0 if all(flag == '0' for flag in r[2:]) else 1)
kaggle_y = np.array(kaggle_y)

# Bad-word list: one entry per line; the first line is skipped, as before
# (presumably a header -- confirm against the data file).
# rstrip('\n') removes only the line terminator, so the final entry is not
# truncated when the file does not end with a newline.
with open(bad_words_location, encoding='utf-8') as f:
    next(f, None)
    bad_words = [line.rstrip('\n') for line in f]

print('Loaded data in %.2fs' % (time() - start))

In [None]:
def build_vocab(tokenized_docs):
    """Assign a consecutive integer id to every distinct token.

    Ids follow first-occurrence order, starting at 0.

    Args:
        tokenized_docs: iterable of token lists.

    Returns:
        (word2id, id2word): two dicts that are inverse mappings of each other.
    """
    word2id = {}
    for doc in tokenized_docs:
        for word in doc:
            if word not in word2id:
                word2id[word] = len(word2id)
    id2word = {word_id: word for word, word_id in word2id.items()}
    return word2id, id2word


def build_bow(tokenized_docs, word2id):
    """Build a sparse bag-of-words count matrix with a trailing bias column.

    Args:
        tokenized_docs: sequence of token lists, one per document.
        word2id: token -> column index mapping (see build_vocab).

    Returns:
        csr_matrix of shape (len(tokenized_docs), len(word2id) + 1). Entry
        (r, c) is the number of times token c occurs in document r; the last
        column is the constant 1 (bias feature). Tokens missing from word2id
        are skipped, so documents outside the vocabulary's corpus can be
        encoded without a KeyError.
    """
    bias_col = len(word2id)
    data, rows, cols = [], [], []
    for row, doc in enumerate(tokenized_docs):
        counts = {}
        for word in doc:
            word_id = word2id.get(word)
            if word_id is None:
                continue  # out-of-vocabulary token
            counts[word_id] = counts.get(word_id, 0) + 1

        cols.extend(counts.keys())
        data.extend(counts.values())
        rows.extend([row] * len(counts))

        # bias
        data.append(1)
        rows.append(row)
        cols.append(bias_col)

    # Explicit shape: the column count is fixed by the vocabulary rather than
    # inferred from the largest index, and an empty corpus still works.
    return csr_matrix((data, (rows, cols)),
                      shape=(len(tokenized_docs), bias_col + 1))


# Build vocab
start = time()
word2id, id2word = build_vocab(x_raw) # + kaggle_x_raw
print('Vocabulary built in %.2fs' % (time() - start))

# Build x bag of words
start = time()
x = build_bow(x_raw, word2id)
# normalize(x, norm='l1', axis=1, copy=False) # normalize by document (TF)
print('x BOW built in %.2fs' % (time() - start))

# Disabled code: the Kaggle bag-of-words construction is kept as a string
# literal bound to `kagged`, so none of it runs and `kaggle_x` is not created
# by this cell.
# NOTE(review): the stored code indexes word2id[word] directly, while the
# vocabulary loop in this cell iterates over x_raw only; re-enabling it as-is
# would presumably raise KeyError for Kaggle-only tokens -- confirm before use.
kagged = '''
# Build kaggle bag of words
start = time()
data = []
rows = []
cols = []
for i in range(len(kaggle_x_raw)):
    counts = {}
    for word in kaggle_x_raw[i]:
        if word2id[word] not in counts:
            counts[word2id[word]] = 1
        else:
            counts[word2id[word]] += 1
            
    for word_id, freq in counts.items():
        data.append(freq)
        rows.append(i)
        cols.append(word_id)
        
    # bias
    data.append(1)
    rows.append(i)
    cols.append(len(word2id))

kaggle_x = csr_matrix((data, (rows, cols)))
# normalize(kaggle_x, norm='l1', axis=1, copy=False) # normalize by document (TF)
print('kaggle_x BOW built in %.2fs' % (time() - start))
'''

In [None]:
def shuffle_together(x, y):
    """Shuffle the rows of x and the entries of y with one shared permutation.

    Args:
        x: 2-D array or sparse matrix supporting row fancy-indexing (x[idx, :]).
        y: 1-D NumPy array with one entry per row of x.

    Returns:
        (x_shuffled, y_shuffled): new objects in which row j of x_shuffled
        still corresponds to y_shuffled[j].

    Raises:
        ValueError: if x and y disagree on the number of samples. The previous
            implementation shuffled two separate index arrays from the same
            RNG state, which silently mis-paired samples in that case.
    """
    n_samples = x.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            'x and y must have the same number of samples (got %d and %d)'
            % (n_samples, y.shape[0]))
    # One draw from the global RNG; this consumes the random stream exactly
    # like shuffling np.arange(n_samples) once.
    perm = np.random.permutation(n_samples)
    return x[perm, :], y[perm] # row-index the (possibly sparse) matrix with the permutation

In [None]:
from sklearn import metrics as skmetrics

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)), element-wise.

    Args:
        x: array-like of real numbers (scalars and lists are accepted).

    Returns:
        Float ndarray with the same shape as x. NaN inputs propagate as NaN.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) is always <= 1, so neither branch can overflow.
    e = np.exp(-np.abs(x))
    return np.where(x > 0, 1. / (1. + e), e / (1. + e))

def cost(w, x, y, L):
    """ The cost function for logistic regression with L2 regularization

    J(w) = (sum_i [log(1 + exp(z_i)) - y_i * z_i] + L * sum(w**2)) / m, with
    z = x @ w and m = y.size. The bracketed term equals
    -y*log(h) - (1-y)*log(1-h) for h = sigmoid(z), but stays finite when h
    saturates at 0 or 1. The ridge term is added once (not once per sample),
    so gradient_update is the exact gradient of this function.

    Args:
        w: weight vector, shape (n_features,).
        x: design matrix (dense or sparse), shape (m, n_features).
        y: 0/1 labels, shape (m,).
        L: L2 regularization strength.

    Returns:
        Scalar cost.
    """
    z = x @ w
    data_loss = np.sum(np.logaddexp(0., z) - y * z)
    ridge = L * np.sum(w ** 2)
    return (data_loss + ridge) / y.size

def gradient_update(w, x, y, L):
    """ The gradient update for logistic regression with L2 regularization

    Returns dJ/dw = ((h - y) @ x + 2 * L * w) / m for the J defined in cost.
    The ridge contribution is the vector 2*L*w: each weight is pulled toward
    zero in proportion to its own value (not by the scalar sum of all weights).

    Args:
        w: weight vector, shape (n_features,).
        x: design matrix (dense or sparse), shape (m, n_features).
        y: 0/1 labels, shape (m,).
        L: L2 regularization strength.

    Returns:
        Gradient vector, shape (n_features,).
    """
    h = sigmoid(x @ w)
    g = (h - y) @ x + 2 * L * w
    return g / x.shape[0]

def gradient_descent(w, x, y, x_val, y_val, alpha, L, iterations, print_iterations):
    """ Batch gradient descent that keeps the weights with the best validation accuracy

    The loop always runs for `iterations` steps (it does not stop early); at
    every step it records the training cost and the train/validation accuracy
    of the current weights *before* applying the update, and remembers the
    weights that achieved the highest validation accuracy so far.

    Args:
        w: initial weight vector. It is copied; the caller's array is not modified.
        x, y: training design matrix and 0/1 labels.
        x_val, y_val: validation design matrix and 0/1 labels.
        alpha: learning rate. It is multiplied by the number of training rows,
            which cancels the 1/m factor applied inside gradient_update.
        L: L2 regularization strength forwarded to cost / gradient_update.
        iterations: number of update steps.
        print_iterations: print the cost every this many steps; 0 disables printing.

    Returns:
        (best_w, costs, train_accs, val_accs) where the last three are lists
        with one entry per iteration.
    """
    w = np.array(w, dtype=float)  # private float copy: updates must not leak into the caller's array
    best_w = w.copy()
    costs = []
    train_accs = []
    val_accs = []
    best_acc = 0
    step = alpha * x.shape[0]
    for i in range(iterations):
        costs.append(cost(w, x, y, L))
        _, y_hat = predict(w, x)
        train_accs.append(float(np.mean(y_hat == y)))
        _, y_hat = predict(w, x_val)
        val_accs.append(float(np.mean(y_hat == y_val)))

        # Guard against modulo-by-zero when printing is disabled.
        if print_iterations and i % print_iterations == 0:
            print('%d) cost: %f' % (i, costs[-1]))
        if val_accs[-1] > best_acc:
            best_w = w.copy()
            best_acc = val_accs[-1]
        w -= step * gradient_update(w, x, y, L)
    return best_w, costs, train_accs, val_accs

def predict(w, x):
    """ Score samples with the learned logistic regression weights.

    Returns a pair (probabilities, labels): the sigmoid of the linear score
    x @ w, and an integer array holding 1 where the score is positive
    (i.e. probability above 0.5) and 0 elsewhere.
    """
    scores = x @ w
    labels = (scores > 0).astype(int)  # boolean mask -> 0/1 integers
    return sigmoid(scores), labels

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1', 'auc']):
    """Compute the requested classification scores for predictions y_hat.

    The result list always follows the fixed order accuracy, precision,
    recall, f1, auc -- restricted to the names present in `metrics` -- no
    matter how `metrics` itself is ordered. 'acc' is accepted as an alias for
    'accuracy'. Every score is computed by the matching sklearn.metrics
    function called as f(y, y_hat).
    """
    scorers = (
        (('accuracy', 'acc'), skmetrics.accuracy_score),
        (('precision',), skmetrics.precision_score),
        (('recall',), skmetrics.recall_score),
        (('f1',), skmetrics.f1_score),
        (('auc',), skmetrics.roc_auc_score),
    )
    return [
        scorer(y, y_hat)
        for aliases, scorer in scorers
        if any(alias in metrics for alias in aliases)
    ]

In [None]:
def cross_validate(x, y, k, alpha, L, iterations=10000):
    """Run k-fold cross validation of the logistic regression model.

    Folds are contiguous slices of size x.shape[0] // k taken in the order the
    rows are given; when the row count is not a multiple of k, the trailing
    remainder rows are never used for validation (they stay in every training
    set). For each fold a fresh random weight vector is trained with
    gradient_descent, the train/validation accuracy curves are plotted, and
    the validation scores of the best weights are collected.

    Args:
        x: sparse design matrix, one row per sample.
        y: 0/1 label array aligned with x.
        k: number of folds.
        alpha: learning rate forwarded to gradient_descent.
        L: L2 regularization strength.
        iterations: gradient-descent steps per fold (default 10000, the value
            that used to be hard-coded).

    Returns:
        A list of 8 lists, each with one entry per fold, in this order:
        accuracy, precision, recall, f1, auc, costs, train_accs, val_accs
        (the last three entries are per-iteration curves).
    """
    # k-fold cross validation
    chunk_size = x.shape[0] // k
    # alpha = 0.0001
    # L = 0.008
    print_iterations = max(1, iterations // 5)  # never 0, so the progress modulo is always valid
    metrics = [[] for _ in range(8)] # acc, precision, recall, f1, auc, costs, train_accs, val_accs
    epochs = range(iterations)
    for i in range(k):
        start = time()
        lo, hi = i * chunk_size, (i + 1) * chunk_size
        initial_w = np.random.rand(x.shape[1]) - 0.5
        x_train = vstack((x[:lo], x[hi:]))
        y_train = np.concatenate((y[:lo], y[hi:]))
        x_val = x[lo:hi]
        y_val = y[lo:hi]
        x_train, y_train = shuffle_together(x_train, y_train)
        w, costs, train_accs, val_accs = gradient_descent(initial_w.copy(), x_train, y_train, x_val, y_val, alpha, L, iterations, print_iterations)
        plt.figure()
        plt.yticks([p / 100 for p in range(0, 100, 5)])
        plt.plot(epochs, train_accs, c='b', label='train')
        plt.plot(epochs, val_accs, c='g', label='validation')
        plt.ylim((0.5, 0.9))
        plt.xlabel('iteration')
        plt.ylabel('accuracy')
        plt.title('fold %d, L = %g' % (i, L))
        plt.legend()
        plt.show()
        _, labels = predict(w, x_train)
        [a] = report(y_train, labels, metrics=['acc'])
        print('train acc: %.4f%%' % (a * 100))
        _, labels = predict(w, x_val)
        a, p, r, f, auc = report(y_val, labels)
        for collected, value in zip(metrics, (a, p, r, f, auc, costs, train_accs, val_accs)):
            collected.append(value)
        print('val acc:   %.4f%%' % (a * 100))
        print('took %.2fs' % (time() - start))
    #plt.figure()
    #plt.yticks([p / 100 for p in range(0, 100, 5)])
    #plt.plot([e for e in range(iterations)], np.average(metrics[6], axis=0), c='b')
    #plt.plot([e for e in range(iterations)], np.average(metrics[7], axis=0), c='g')
    #plt.ylim((0.5, 0.9))
    #plt.show()
    print('avg val accuracy:   %.4f' % np.average(metrics[0]))
    print('avg val precision:  %.4f' % np.average(metrics[1]))
    print('avg val recall:     %.4f' % np.average(metrics[2]))
    print('avg val f1:         %.4f' % np.average(metrics[3]))
    print('avg val auc:        %.4f' % np.average(metrics[4]))
    return metrics
# Sweep the L2 strength: 5-fold cross validation at a fixed learning rate for
# each candidate, echoing the value after its run. `metrics` ends up holding
# the result of the last candidate only.
l2_candidates = (0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05)
for L in l2_candidates:
    metrics = cross_validate(x, y, k=5, alpha=0.00002, L=L)
    print(L)

In [None]:
# Rule-based baseline: a tweet is labelled offensive (1) as soon as any of its
# tokens appears in the bad-word list, otherwise not offensive (0).
# x_raw already holds token lists, so tokens are iterated directly (there is
# nothing to .split()), and `predicted` is built here rather than relying on
# an initialisation that was commented out.
bad_word_set = set(bad_words)  # set membership instead of scanning the list for every token
predicted = [
    1 if any(token in bad_word_set for token in tweet) else 0
    for tweet in x_raw
]
rule_based_labels = np.array(predicted)

In [None]:
# Class balance of the labels, in percent.
off = 100 * (np.sum(y) / y.shape[0])
not_off = 100 * (1 - (np.sum(y) / y.shape[0]))
print('Offensive\tNot offensive')
print('%.2f%%\t\t%.2f%%' % (off, not_off))

# report() returns five scores by default (AUC included), so the four that
# are printed here are requested explicitly; unpacking the default result
# into four names raises ValueError.
baseline_metrics = ['accuracy', 'precision', 'recall', 'f1']

# Baseline 1: predict "not offensive" for every tweet.
labels = np.zeros(y.shape, dtype=int)
a, p, r, f = report(y, labels, metrics=baseline_metrics)
print('accuracy:  %.4f' % a)
print('precision: %.4f' % p)
print('recall:    %.4f' % r)
print('f1:        %.4f' % f)

# Baseline 2: the rule-based bad-word labels.
a, p, r, f = report(y, rule_based_labels, metrics=baseline_metrics)
print('\nOffensive if tweet contains a bad word:')
print('accuracy:  %.4f' % a)
print('precision: %.4f' % p)
print('recall:    %.4f' % r)
print('f1:        %.4f' % f)

In [None]:
# Average number of '!' tokens per tweet, split by label
# (label 1 = offensive, anything else = not offensive).
off_marks = sum(tweet.count('!') for tweet, label in zip(x_raw, y) if label == 1)
unoff_marks = sum(tweet.count('!') for tweet, label in zip(x_raw, y) if label != 1)

# Number of tweets in each class.
off = y.sum()
unoff = y.size - off

print('average ! per offensive tweet: %.4f' % (off_marks / off))
print('average ! per unoffensive tweet: %.4f' % (unoff_marks / unoff))