In [32]:
import csv
from time import time

import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
from sklearn import metrics as skmetrics

# Input file locations (relative paths).
# OLID v1.0 training file, read as tab-separated values by the loading cell.
olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
# Folder with the Kaggle/Jigsaw files; 'train.csv' inside it is read by the loading cell.
kaggle_data_folder = 'data/jigsaw/'
# Plain-text word list, one entry per line (its first line is skipped on load).
bad_words_data = 'data/trimmed-bad-words.txt'
# GloVe Twitter embeddings; swap the file name to change dimensionality.
glove_data = 'data/glove.twitter.27B/glove.twitter.27B.25d.txt' # 25, 50, 100, or 200 D

# Seed NumPy's global RNG, which shuffle_together() draws from.
np.random.seed(1234) # help reproducibility

In [2]:
# Label convention used throughout:
#   y == 0 if not offensive
#   y == 1 if offensive
start = time()

# OLID: tab-separated with one header row. Column 1 is used as the text and
# column 2 as the label ('NOT' -> 0, anything else -> 1).
# newline='' is what the csv module expects of the file objects it reads.
# NOTE(review): default csv quoting is kept here; if tweets contain unbalanced
# '"' characters rows could be merged -- confirm whether QUOTE_NONE is needed.
with open(olid_data, encoding='utf-8', newline='') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])
        y.append(0 if r[2] == 'NOT' else 1)
    y = np.array(y)

# Kaggle/Jigsaw: comma-separated with one header row. Column 1 is the text;
# a row is labelled 1 as soon as any of the columns from index 2 on is not '0'.
with open(kaggle_data_folder + 'train.csv', encoding='utf-8', newline='') as f:
    raw = csv.reader(f, delimiter=',')
    next(raw, None)  # skip the header row
    kaggle_x_raw = []
    kaggle_y = []
    for r in raw:
        kaggle_x_raw.append(r[1])
        kaggle_y.append(0 if all(flag == '0' for flag in r[2:]) else 1)
    kaggle_y = np.array(kaggle_y)

# Bad-word list: one entry per line, first line skipped (presumably a header
# -- confirm). Only the line terminator is stripped, so the final entry stays
# intact even when the file does not end with a newline.
with open(bad_words_data) as f:
    next(f, None)
    bad_words = [line.rstrip('\n') for line in f]

print('Loaded data in %.2fs' % (time() - start))

Loaded data in 1.92s


In [5]:
# Load the GloVe vectors into a dict: token -> 1-D float array.
# Each line is "<token> <v1> <v2> ...", split on whitespace.
start = time()
glove = {}
with open(glove_data, encoding='utf-8') as f:
    # Stream the file line by line instead of materialising every row first;
    # the embedding files are large and only the final dict is needed.
    for row in f:
        r = row.split()
        if not r:
            continue  # tolerate blank lines instead of failing on r[0]
        glove[r[0]] = np.array([float(v) for v in r[1:]])
print('Loaded GloVe in %.2fs' % (time() - start))
# On my mac, loads 25D in 30s, 50D in 100s, 100D in 630s

Loaded GloVe in 25.22s


In [46]:
def shuffle_together(x, y):
    """Shuffle ``x`` and ``y`` in unison along their first axis.

    A single random permutation (drawn from NumPy's global RNG) is applied to
    both inputs, so row ``j`` of the shuffled ``x`` still belongs to entry
    ``j`` of the shuffled ``y``.

    Parameters
    ----------
    x : array-like supporting ``x.shape`` and integer-array indexing
        (e.g. a NumPy array or a SciPy CSR matrix).
    y : array with the same length as ``x`` along axis 0.

    Returns
    -------
    tuple
        ``(x_shuffled, y_shuffled)``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length; the pairing would otherwise be
        silently broken.
    """
    n_samples = x.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            'x and y must have the same number of samples, got %d and %d'
            % (n_samples, y.shape[0]))
    order = np.random.permutation(n_samples)
    # Plain row indexing works for dense arrays (1-D or 2-D) and sparse matrices.
    return x[order], y[order]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1', 'auc']):
    """Score predictions ``y_hat`` against the true labels ``y``.

    Only the metrics named in ``metrics`` are computed. Scores come back in
    the fixed order accuracy, precision, recall, F1, AUC regardless of the
    order of ``metrics``; ``'acc'`` is accepted as an alias for
    ``'accuracy'``. F1 is computed with ``average='weighted'``; every other
    scorer is called with its default arguments.

    Returns
    -------
    list
        One score per requested metric.
    """
    # (accepted names, deferred scorer) pairs in output order.
    scorers = (
        (('accuracy', 'acc'), lambda: skmetrics.accuracy_score(y, y_hat)),
        (('precision',), lambda: skmetrics.precision_score(y, y_hat)),
        (('recall',), lambda: skmetrics.recall_score(y, y_hat)),
        (('f1',), lambda: skmetrics.f1_score(y, y_hat, average='weighted')),
        (('auc',), lambda: skmetrics.roc_auc_score(y, y_hat)),
    )
    return [score() for names, score in scorers
            if any(name in metrics for name in names)]

In [154]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared tokenizer instance used by bow() and sum_glove().
tokenizer = TweetTokenizer()
          
def bow():
    """Return the TF-IDF bag-of-words matrix for the tweets in ``x_raw``.

    The vectorizer is fitted on ``x_raw`` only (the Kaggle texts are not part
    of the vocabulary) and tokenises with the shared ``tokenizer``.
    """
    tfidf_options = {
        'tokenizer': tokenizer.tokenize,
        'strip_accents': 'unicode',
        'lowercase': True,
        'sublinear_tf': True,
        'min_df': 9,
        'stop_words': 'english',
    }
    tfidf = TfidfVectorizer(**tfidf_options)
    features = tfidf.fit_transform(x_raw)
    return features

def sum_glove():
    """Embed every tweet in ``x_raw`` as the sum of its tokens' GloVe vectors.

    Each tweet is tokenised with the shared ``tokenizer``; tokens missing from
    ``glove`` are ignored. Every row is then min-max scaled across its own
    dimensions so its values lie in [0, 1].

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(x_raw), embedding_dim)``. Tweets with no known
        token (or whose summed vector is constant) yield an all-zero row.
    """
    # Each tweet gets its own zero-initialised row; sharing one accumulator
    # across tweets would make every row alias the same running total.
    x = np.zeros((len(x_raw),) + glove['.'].shape)
    for row, tweet in enumerate(x_raw):
        for word in tokenizer.tokenize(tweet):
            if word in glove:
                x[row] += glove[word]
    # Per-row min-max scaling.
    x -= x.min(axis=1, keepdims=True)
    row_max = x.max(axis=1, keepdims=True)
    # A zero range would give 0/0 = NaN; dividing by 1 keeps such rows at zero.
    row_max[row_max == 0] = 1.0
    x /= row_max
    return x

# Feature matrix used by the evaluation cells: TF-IDF features from bow().
x = bow()
# Disabled: splitting is only needed when bow() is fitted on x_raw + kaggle_x_raw.
# kaggle_x = x[len(x_raw):]
# x = x[:len(x_raw)]

In [171]:
from sklearn.model_selection import KFold
from scipy.sparse import vstack

def test(model):
    k = 10
    kf = KFold(n_splits=k)
    average_acc = 0
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1', 'auc', 'time'))
    averages = np.array([0] * 6, dtype='float')
    for train_index, test_index in kf.split(x):
        start = time()
        # Split based on k-fold
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Append kaggle data to training data
        # x_train = vstack((x_train, kaggle_x))
        # y_train = np.concatenate((y_train, kaggle_y))
        x_train, y_train = shuffle_together(x_train, y_train)
        clf = model.fit(x_train, y_train)
        y_hat = clf.predict(x_test)
        vals = report(y_test, y_hat) + [time() - start]
        averages += vals
        print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs' % tuple(vals))
    print('average:')
    averages /= k
    print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs' % tuple(averages))

In [189]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Evaluate each candidate classifier with the cross-validation harness,
# printing its title first and a blank line between consecutive reports.
# Models are built lazily so each one is constructed right before its run.
experiments = (
    ('Logistic Regression', lambda: LogisticRegression(solver='lbfgs', max_iter=300)),
    ('SVM', lambda: SVC(kernel='linear', gamma='auto', C=1.8)),
)
for position, (title, make_model) in enumerate(experiments):
    if position > 0:
        print()
    print(title)
    test(make_model())

Logistic Regression
acc     p       r       f1      auc     time    
0.7462  0.7371  0.3834  0.7200  0.6570  0.15s
0.7711  0.7991  0.4221  0.7480  0.6844  0.16s
0.7636  0.8259  0.4304  0.7403  0.6899  0.15s
0.7598  0.6971  0.3625  0.7332  0.6472  0.14s
0.7644  0.7773  0.4055  0.7398  0.6739  0.15s
0.7734  0.8249  0.4059  0.7476  0.6814  0.21s
0.7795  0.8178  0.4089  0.7545  0.6827  0.12s
0.7598  0.8018  0.3938  0.7328  0.6717  0.12s
0.7719  0.8000  0.4119  0.7476  0.6806  0.17s
0.7613  0.7541  0.4182  0.7390  0.6752  0.20s
average:
0.7651  0.7835  0.4043  0.7403  0.6744  0.16s

SVM
acc     p       r       f1      auc     time    
0.7576  0.7076  0.4776  0.7433  0.6887  12.31s
0.7757  0.7310  0.5214  0.7645  0.7125  12.04s
0.7681  0.7702  0.5021  0.7540  0.7093  12.42s
0.7681  0.6729  0.4525  0.7531  0.6786  12.35s
0.7711  0.7446  0.4715  0.7550  0.6957  12.63s
0.7772  0.7417  0.5079  0.7645  0.7098  13.08s
0.7802  0.7354  0.5000  0.7671  0.7070  21.12s
0.7659  0.7432  0.4801  0.7505  0