Misspellings of curse words?  

When building vocab, consider punctuation. Tokenize it first? Right now it just splits on whitespace. Moses probably has something for this

In [7]:
from time import time
import csv
import numpy as np
from scipy.sparse import csr_matrix, vstack
data = 'data/kaggle/train.csv'
np.random.seed(1234) # help reproducibility

In [8]:
# y == 0 if not offensive
# y == 1 if offensive
%time
with open(data) as f:
    raw = csv.reader(f, delimiter=',')
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])
        y.append(0 if all(x == '0' for x in r[2:]) else 1)
    
y = np.array(y)
    

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 37.2 µs


In [9]:
# Build vocab


id2word = {}
word2id = {}
i = 0 # index of unique word
for tweet in x_raw:
    for word in tweet.split():
        if word not in word2id:
            word2id[word] = i
            id2word[i] = word
            i += 1

# Build bag of words
data = []
rows = []
cols = []
for i in range(1,len(x_raw)):
    counts = {}
    for word in x_raw[i].split():
        if word2id[word] not in counts:
            counts[word2id[word]] = 1
        else:
            counts[word2id[word]] += 1
            
    for word_id, freq in counts.items():
        data.append(freq)
        rows.append(i)
        cols.append(word_id)
        
    # bias
    data.append(1)
    rows.append(i)
    cols.append(len(word2id))
        
x = csr_matrix((data, (rows, cols)))

In [10]:
# Shuffle x and y together
state = np.random.get_state()
i = np.arange(x.shape[0])
np.random.shuffle(i)
x =  x[i, :] # shuffling a sparse matrix is weird
np.random.set_state(state)
np.random.shuffle(y)

In [14]:
def sigmoid(x):
    s = np.zeros(x.shape)
    s[x > 0] = 1. / (1. + np.exp(-x[x > 0])) # avoid overflow
    s[x <= 0] = np.exp(x[x <= 0]) / (np.exp(x[x <= 0]) + 1) # avoid underflow
    return s

def cost(w, x, y):
    """ The cost function for logistic regression """
    h = sigmoid(x @ w)
    print(w)
    cost = np.average(-y * np.log(h) - (1 - y) * np.log(1 - h))
    return cost

def gradient_update(w, x, y):
    """ The gradient update for logistic regression"""
    h = sigmoid(x @ w)
    g = (h - y) @ x
    
    g = g / x.shape[0] 
    return g

def gradient_descent(w, x, y, alpha, iterations, print_iterations):
    """ Batch gradient descent algorithm """
    alpha *= x.shape[0]
    for i in range(iterations):
        if i % print_iterations == 0:
            print('%d) cost: %f' % (i, cost(w, x, y)))
        w -= alpha * gradient_update(w, x, y)       
    return w

def predict(w, x):
    """ Predict whether the label is 0 or 1 using learned logistic regression parameters """
    h = x @ w
    probabilities = sigmoid(h)
    predicted = 1 * (h > 0) # converts truth values to 1 or 0
    return probabilities, 1 * predicted

In [12]:
initial_w = np.random.rand(x.shape[1]) - 0.5
alpha = 0.00005
iterations = 10
print_iterations = 1
start = time()
w = gradient_descent(initial_w, x, y, alpha, iterations, print_iterations)
print('took %.2fs' % (time() - start))

[0.92707452 0.77144196 0.20949831 ... 0.86670545 0.26233598 0.99999999]
0) cost: nan


  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


[9.03310046e-73 1.07755271e-06 8.87879633e-27 ... 6.72634937e-33
 6.60986381e-05 0.00000000e+00]
1) cost: nan
[3.45679246e-66 1.36610335e-05 9.16844906e-24 ... 3.32973114e-30
 1.97331733e-04 0.00000000e+00]
2) cost: nan
[1.30710980e-59 1.71419615e-04 9.34466544e-21 ... 1.63144411e-27
 5.83133068e-04 0.00000000e+00]
3) cost: nan
[4.84402646e-53 2.10720149e-03 9.33310513e-18 ... 7.84327374e-25
 1.69090901e-03 0.00000000e+00]
4) cost: nan
[1.72198074e-46 2.44643997e-02 8.94550575e-15 ... 3.63804295e-22
 4.73026278e-03 0.00000000e+00]
5) cost: nan
[4.86180403e-40 2.11429676e-01 7.26391620e-12 ... 1.58237998e-19
 1.23996069e-02 0.00000000e+00]
6) cost: nan
[8.71239914e-34 7.00745680e-01 4.38999672e-09 ... 6.13706967e-17
 2.90457157e-02 0.00000000e+00]
7) cost: nan
[8.52010257e-28 9.38916661e-01 1.60635467e-06 ... 1.88205374e-14
 5.70834780e-02 0.00000000e+00]
8) cost: nan
[2.84690057e-022 9.85406063e-001 2.65563499e-004 ... 3.67244343e-012
 8.94508431e-002 1.15206219e-294]
9) cost: nan
took

In [8]:
prob, labels = predict(w, x)
acc = labels[np.where(labels == y)].size / float(y.size) * 100
print('accuracy: %.4f%%' % acc)

accuracy: 92.4924%


In [9]:
# k-fold cross validation
k = 5
chunk_size = x.shape[0] // k
alpha = 0.0005
iterations = 1000
print_iterations = 1000
total_acc = 0
for i in range(k):
    start = time()
    initial_w = np.random.rand(x.shape[1]) - 0.5
    x_train = vstack((x[:i * chunk_size], x[(i + 1) * chunk_size:]))
    y_train = np.concatenate((y[:i * chunk_size], y[(i + 1) * chunk_size:]))
    x_val = x[i * chunk_size:(i + 1) * chunk_size]
    y_val = y[i * chunk_size:(i + 1) * chunk_size]
    w = gradient_descent(initial_w.copy(), x_train, y_train, alpha, iterations, print_iterations)
    _, labels = predict(w, x_train)
    train_acc = labels[np.where(labels == y_train)].size / float(y_train.size) * 100
    print('train acc: %.4f%%' % train_acc)
    _, labels = predict(w, x_val)
    val_acc = labels[np.where(labels == y_val)].size / float(y_val.size) * 100
    total_acc += val_acc
    print('val acc: %.4f%%' % val_acc)
    print('took %.2fs' % (time() - start))
print('avg acc: %.4f%%' % (total_acc / k))

0) cost: 1.399906
train acc: 83.1665%
val acc: 70.5438%
took 2.53s
0) cost: 0.921110
train acc: 84.9509%
val acc: 72.2432%
took 2.54s
0) cost: 0.982554
train acc: 69.9301%
val acc: 69.6752%
took 2.37s
0) cost: 0.827201
train acc: 69.7319%
val acc: 52.7190%
took 2.35s
0) cost: 0.951568
train acc: 70.3550%
val acc: 68.1647%
took 2.35s
avg acc: 66.6692%
