Open question: how should misspellings of curse words be handled?  

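When building the vocabulary, consider punctuation: should the text be tokenized first? Right now the code just splits on whitespace, so punctuation stays attached to the neighbouring word. The Moses toolkit probably has a tokenizer that could be used for this.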

In [1]:
from time import time

import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# Load the OLID training set from its tab-separated file.
# The first line is skipped; every following line is stripped and split on tabs,
# with field 1 kept as the tweet text and field 2 as the label.
# y == 0 if not offensive (label 'NOT')
# y == 1 if offensive (any other label)
# The encoding is given explicitly so that non-ASCII characters in the text do not
# depend on the platform's default encoding.
with open('data/OLIDv1.0/olid-training-v1.0.tsv', encoding='utf-8') as f:
    next(f, None)  # skip the first (header) line
    raw = [line.strip().split('\t') for line in f]

# The file is closed at this point; the rest only works on the parsed rows.
x = [r[1] for r in raw]
y = np.array([0 if r[2] == 'NOT' else 1 for r in raw])

In [3]:
# Build vocab: every distinct whitespace-separated token gets an integer id,
# handed out in order of first appearance. Both directions of the mapping are kept.
word2id = {}
id2word = {}
i = 0  # next unassigned id; equals the vocabulary size once the loops finish
for tweet in x:
    for token in tweet.split():
        if token in word2id:
            continue  # already has an id
        word2id[token], id2word[i] = i, token
        i += 1

In [4]:
# Convert data to bag-of-words
# Each tweet becomes one sparse row of token counts; an extra constant-1 column
# (index len(word2id), i.e. one past the last word id) serves as the bias term.
# The matrix is assembled from (value, (row, col)) triplets.
start = time()
data, rows, cols = [], [], []
for i, tweet in enumerate(x):
    # token id -> number of occurrences within this tweet
    counts = {}
    for word in tweet.split():
        word_id = word2id[word]
        counts[word_id] = counts.get(word_id, 0) + 1

    # One triplet per distinct token of the tweet.
    data.extend(counts.values())
    rows.extend([i] * len(counts))
    cols.extend(counts.keys())

    # bias
    data.append(1)
    rows.append(i)
    cols.append(len(word2id))

# NOTE: x is rebound from the list of tweets to the sparse count matrix, so this
# cell cannot be re-run without first re-running the loading cell.
x = csr_matrix((data, (rows, cols)))
print('Took %.2fs' % (time() - start))

Took 0.51s


In [5]:
# NOTE(review): presumably the train/validation split fraction, but no visible cell
# reads this value — confirm whether it is still needed.
split = 0.5

In [6]:
def sigmoid(x):
    """Numerically stable element-wise logistic function 1 / (1 + exp(-x)).

    Parameters
    ----------
    x : array_like
        Input values (ndarray, list or scalar). Converted to a float64 array.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as ``x`` and values in [0, 1].
        NaN inputs yield NaN instead of being silently mapped to 0.
    """
    x = np.asarray(x, dtype=float)
    # exp is only ever evaluated on non-positive arguments, so it cannot overflow;
    # z lies in (0, 1] for finite x (and underflows harmlessly to 0 for huge |x|).
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
    return np.where(x >= 0, 1. / (1. + z), z / (1. + z))

In [7]:
def cost(w, x, y):
    """Average cross-entropy cost for logistic regression.

    Parameters
    ----------
    w : weight vector, one entry per column of ``x``.
    x : design matrix (dense or sparse), one row per sample.
    y : array of 0/1 labels, one per row of ``x``.

    Returns
    -------
    The mean over samples of ``-y*log(h) - (1 - y)*log(1 - h)`` with
    ``h = sigmoid(x @ w)``.
    """
    z = x @ w
    # With h = sigmoid(z), the per-sample loss simplifies to log(1 + exp(z)) - y*z.
    # Evaluating it through logaddexp avoids log(0) (and the resulting inf/NaN)
    # when the sigmoid saturates at exactly 0 or 1.
    return np.average(np.logaddexp(0, z) - y * z)

In [8]:
def gradient_update(w, x, y):
    """Gradient of the average logistic-regression cost with respect to w.

    Computes ``(sigmoid(x @ w) - y) @ x`` divided by the number of rows of ``x``.
    """
    n_samples = x.shape[0]
    residual = sigmoid(x @ w) - y  # predicted probability minus label, per sample
    return (residual @ x) / n_samples

In [9]:
def gradient_descent(w, x, y, alpha, iterations, print_iterations):
    """Batch gradient descent for logistic regression.

    Parameters
    ----------
    w : initial weight vector. It is copied, so the caller's array is left untouched.
    x : design matrix, one row per sample.
    y : array of 0/1 labels.
    alpha : learning rate. It is multiplied by the number of rows of ``x`` before
        use, which cancels the division by the row count in ``gradient_update``.
    iterations : number of update steps to run.
    print_iterations : the current cost is printed every this many iterations;
        0 or None disables printing.

    Returns
    -------
    numpy.ndarray
        The weight vector after ``iterations`` updates.
    """
    # Work on a float copy: the in-place update below must not modify the caller's
    # initial weights, and an integer array could not absorb float updates.
    w = np.array(w, dtype=float)
    step = alpha * x.shape[0]
    for i in range(iterations):
        # The truthiness guard avoids a modulo-by-zero when printing is disabled.
        if print_iterations and i % print_iterations == 0:
            print('%d) cost: %f' % (i, cost(w, x, y)))
        w -= step * gradient_update(w, x, y)
    return w

In [10]:
# Train the logistic-regression weights on the full bag-of-words matrix.
# The random generator is seeded so that Restart & Run All reproduces the same
# initial weights, and therefore the same printed costs and final model.
np.random.seed(0)
initial_w = np.random.rand(x.shape[1]) - 0.5  # uniform in [-0.5, 0.5), one weight per column
alpha = 0.00005          # learning rate (rescaled inside gradient_descent)
iterations = 10000       # number of gradient steps
print_iterations = 1000  # print the cost every this many steps
w = gradient_descent(initial_w, x, y, alpha, iterations, print_iterations)

0) cost: 0.950716
1000) cost: 0.508338
2000) cost: 0.436565
3000) cost: 0.390796
4000) cost: 0.357150
5000) cost: 0.330639
6000) cost: 0.308850
7000) cost: 0.290423
8000) cost: 0.274512
9000) cost: 0.260565


In [11]:
def predict(w, x):
    """Apply learned logistic-regression weights to the rows of x.

    Returns a pair ``(probabilities, labels)``: the sigmoid of the linear score
    ``x @ w`` for each row, and the corresponding 0/1 label, which is 1 exactly
    when the score is positive.
    """
    scores = x @ w
    labels = 1 * (scores > 0)  # boolean mask -> integer 0/1
    return sigmoid(scores), labels

In [13]:
# Score the model on the same x and y it was fitted on, so the number printed
# below is training-set accuracy.
prob, labels = predict(w, x)
n_correct = np.count_nonzero(labels == y)
acc = n_correct / float(y.size) * 100
print('accuracy: %.4f%%' % acc)

accuracy: 92.5076%
