TODO: How should misspellings of curse words be handled?  

When building the vocabulary, consider punctuation: should it be tokenized first? Right now the text is just split on whitespace. Moses probably has something for this.

In [1]:
from time import time
from nltk.tokenize import TweetTokenizer
import numpy as np
from scipy.sparse import csr_matrix, vstack
import pandas as pd

np.random.seed(1234) # fixed seed so the np.random calls below (shuffling, weight initialisation) are reproducible

In [13]:
# Tweet-aware tokenizer: lower-cases text, shortens elongated character runs
# and strips @handles.
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
# Read the tab-separated training set with pandas. The separator is passed by
# keyword because `sep` is keyword-only in pandas >= 2.0 (a positional '\t'
# raises TypeError there).
data = pd.read_csv('data/OLIDv1.0/olid-training-v1.0.tsv', sep='\t')
X_raw = data['tweet'].values
# One list of tokens per tweet.
X = [tknzr.tokenize(x) for x in X_raw]
Y_raw = data['subtask_a'].values
# Binary target: 1.0 where subtask_a is 'OFF', 0.0 everywhere else.
y = (Y_raw == 'OFF').astype(float)
print(X[:5])

[['she', 'should', 'ask', 'a', 'few', 'native', 'americans', 'what', 'their', 'take', 'on', 'this', 'is', '.'], ['go', 'home', 'you', '’', 're', 'drunk', '!', '!', '!', '#maga', '#trump2020', '👊', '🇺', '🇸', '👊', 'url'], ['amazon', 'is', 'investigating', 'chinese', 'employees', 'who', 'are', 'selling', 'internal', 'data', 'to', 'third-party', 'sellers', 'looking', 'for', 'an', 'edge', 'in', 'the', 'competitive', 'marketplace', '.', 'url', '#amazon', '#maga', '#kag', '#china', '#tcot'], ['someone', "should'vetaken", '"', 'this', 'piece', 'of', 'shit', 'to', 'a', 'volcano', '.', '😂', '"'], ['obama', 'wanted', 'liberals', '&', 'illegals', 'to', 'move', 'into', 'red', 'states']]


In [14]:
# Build vocab: every distinct token gets the next free integer id, in order of
# first appearance; id2word is the inverse mapping.
id2word = {}
word2id = {}
for tweet in X:
    for word in tweet:
        if word in word2id:
            continue
        next_id = len(word2id)
        word2id[word] = next_id
        id2word[next_id] = word

# Build bag of words as (value, (row, col)) triplets for a sparse matrix:
# one row per tweet, one column per vocabulary id, plus a trailing bias column.
data = []
rows = []
cols = []
bias_col = len(word2id)  # first column index after the vocabulary
for i, tweet in enumerate(X):
    # Token-id -> number of occurrences in this tweet.
    counts = {}
    for word in tweet:
        word_id = word2id[word]
        counts[word_id] = counts.get(word_id, 0) + 1

    data.extend(counts.values())
    rows.extend([i] * len(counts))
    cols.extend(counts.keys())

    # bias: constant 1 in the last column of every row
    data.append(1)
    rows.append(i)
    cols.append(bias_col)

x = csr_matrix((data, (rows, cols)))

In [15]:
# Shuffle x and y together.
# A single permutation is drawn and used to index both containers, so rows and
# labels stay aligned by construction rather than by replaying the RNG state
# through two separate shuffle calls.
perm = np.random.permutation(x.shape[0])
x = x[perm, :]  # index-array row selection reorders the rows of the sparse matrix
y = y[perm]

In [16]:
def sigmoid(x):
    """ Numerically stable logistic function 1 / (1 + exp(-x)), element-wise.

    Accepts any array-like (ndarray, list, Python scalar) and returns a float
    ndarray of the same shape. NaN inputs produce NaN outputs.
    """
    z = np.asarray(x, dtype=float)
    # exp(-|z|) lies in [0, 1], so it can never overflow whatever the sign of z.
    e = np.exp(-np.abs(z))
    # z > 0 : 1 / (1 + exp(-z))
    # z <= 0: exp(z) / (exp(z) + 1)   (same value, written without exp(-z))
    return np.where(z > 0, 1. / (1. + e), e / (e + 1.))

def cost(w, x, y):
    """ The cost function for logistic regression

    Mean binary cross-entropy of the predictions sigmoid(x @ w) against the
    0/1 labels y.

    With z = x @ w the per-sample loss
        -y * log(sigmoid(z)) - (1 - y) * log(1 - sigmoid(z))
    is algebraically equal to
        log(1 + exp(z)) - y * z.
    The second form is evaluated with np.logaddexp, so it stays finite even
    when the sigmoid saturates to exactly 0 or 1 (where the first form
    produces log(0) and 0 * inf = NaN).
    """
    z = x @ w
    return np.average(np.logaddexp(0., z) - y * z)

def gradient_update(w, x, y):
    """ Gradient of the logistic-regression cost with respect to w.

    Computes (sigmoid(x @ w) - y) @ x and divides it by the number of rows
    of x.
    """
    residual = sigmoid(x @ w) - y
    return (residual @ x) / x.shape[0]

def gradient_descent(w, x, y, alpha, iterations, print_iterations):
    """ Batch gradient descent algorithm

    w                -- initial weights; copied, so the caller's array is left untouched
    x, y             -- design matrix and 0/1 labels
    alpha            -- base learning rate; it is multiplied by the number of
                        rows of x before use
    iterations       -- number of full-batch updates to perform
    print_iterations -- print the cost every this many iterations; a falsy
                        value (0 / None) disables printing

    Returns the learned weights as a new float array.
    """
    # Work on a private float copy: the in-place update below must not
    # overwrite the array the caller passed in.
    w = np.array(w, dtype=float)
    step = alpha * x.shape[0]
    for i in range(iterations):
        if print_iterations and i % print_iterations == 0:
            print('%d) cost: %f' % (i, cost(w, x, y)))
        w -= step * gradient_update(w, x, y)
    return w

def predict(w, x):
    """ Apply learned logistic regression parameters to x.

    Returns a pair (probabilities, labels): the sigmoid of the linear scores
    x @ w, and an integer array that is 1 where the score is positive and
    0 otherwise.
    """
    scores = x @ w
    probabilities = sigmoid(scores)
    labels = (scores > 0).astype(int)  # truth values -> 1 or 0
    return probabilities, labels

In [17]:
# Train on the full data set, starting from uniform random weights in [-0.5, 0.5).
initial_w = np.random.rand(x.shape[1]) - 0.5
alpha = 0.00005
iterations = 10000
print_iterations = 1000
start = time()
# Pass a copy (as the cross-validation cell does) so that `initial_w` still
# holds the starting point after training instead of the trained weights.
w = gradient_descent(initial_w.copy(), x, y, alpha, iterations, print_iterations)
print('took %.2fs' % (time() - start))

0) cost: 0.980257
1000) cost: 0.476621
2000) cost: 0.417246
3000) cost: 0.381414
4000) cost: 0.355580
5000) cost: 0.335371
6000) cost: 0.318786
7000) cost: 0.304740
8000) cost: 0.292577
9000) cost: 0.281868
took 30.56s


In [18]:
# Accuracy on the data the model was trained on: share of predicted labels
# that equal the true labels, as a percentage.
prob, labels = predict(w, x)
acc = np.count_nonzero(labels == y) / float(y.size) * 100
print('accuracy: %.4f%%' % acc)

accuracy: 89.9245%


In [19]:
# k-fold cross validation
# The rows are cut into k contiguous folds; each fold is held out once for
# validation while a fresh model is trained on the remaining rows.
k = 5
n_samples = x.shape[0]
chunk_size = n_samples // k
alpha = 0.00005
iterations = 10000
print_iterations = 1000
total_acc = 0
for i in range(k):
    start = time()
    initial_w = np.random.rand(x.shape[1]) - 0.5
    lo = i * chunk_size
    # The last fold runs to the end of the data so that the n_samples % k
    # leftover rows are validated too instead of only ever being trained on.
    hi = n_samples if i == k - 1 else lo + chunk_size
    x_train = vstack((x[:lo], x[hi:]))
    y_train = np.concatenate((y[:lo], y[hi:]))
    x_val = x[lo:hi]
    y_val = y[lo:hi]
    w = gradient_descent(initial_w.copy(), x_train, y_train, alpha, iterations, print_iterations)
    _, labels = predict(w, x_train)
    train_acc = np.count_nonzero(labels == y_train) / float(y_train.size) * 100
    print('train acc: %.4f%%' % train_acc)
    _, labels = predict(w, x_val)
    val_acc = np.count_nonzero(labels == y_val) / float(y_val.size) * 100
    total_acc += val_acc
    print('val acc: %.4f%%' % val_acc)
    print('took %.2fs' % (time() - start))
# Unweighted mean of the per-fold validation accuracies.
print('avg acc: %.4f%%' % (total_acc / k))

0) cost: 0.912833
1000) cost: 0.486686
2000) cost: 0.422745
3000) cost: 0.384108
4000) cost: 0.356291
5000) cost: 0.334582
6000) cost: 0.316818
7000) cost: 0.301819
8000) cost: 0.288869
9000) cost: 0.277499
train acc: 90.5778%
val acc: 75.5287%
took 22.36s
0) cost: 0.816793
1000) cost: 0.484767
2000) cost: 0.420443
3000) cost: 0.381815
4000) cost: 0.354089
5000) cost: 0.332492
6000) cost: 0.314837
7000) cost: 0.299934
8000) cost: 0.287065
9000) cost: 0.275765
train acc: 90.8421%
val acc: 76.6239%
took 23.39s
0) cost: 0.902031
1000) cost: 0.479834
2000) cost: 0.415932
3000) cost: 0.377474
4000) cost: 0.349852
5000) cost: 0.328334
6000) cost: 0.310757
7000) cost: 0.295942
8000) cost: 0.283170
9000) cost: 0.271969
train acc: 90.8610%
val acc: 75.3776%
took 24.81s
0) cost: 0.936558
1000) cost: 0.479576
2000) cost: 0.415974
3000) cost: 0.377646
4000) cost: 0.350050
5000) cost: 0.328479
6000) cost: 0.310803
7000) cost: 0.295866
8000) cost: 0.282963
9000) cost: 0.271635
train acc: 90.9649%
va