In [1]:
from xgboost import XGBClassifier

import csv
from time import time

import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
from sklearn import metrics as skmetrics

olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'

np.random.seed(1234) # help reproducibility

# Read the tab-separated training file: field 1 of every row is kept as the
# raw text and field 2 is binarised into the label vector.
# y == 0 if not offensive (label field equals 'NOT')
# y == 1 if offensive (any other label value)
start = time()
# newline='' lets the csv module do its own newline handling (as its docs
# require); the explicit encoding keeps decoding independent of the locale.
with open(olid_data, newline='', encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row instead of slicing it off afterwards
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])
        y.append(0 if r[2] == 'NOT' else 1)
y = np.array(y)
# NOTE(review): this cell used to build bad_words from f.readlines() after the
# csv reader had already consumed the whole file, so the result was always an
# empty list. The name is kept (still empty) in case another cell uses it; if
# a word list is really needed it has to be read from its own file.
bad_words = []
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.10s


In [2]:
def shuffle_together(x, y):
    """Shuffle the rows of ``x`` and the entries of ``y`` in unison.

    A single random permutation is drawn from NumPy's global RNG (so
    ``np.random.seed`` controls it) and applied to both inputs, which keeps
    every row of ``x`` paired with its original label in ``y``.

    Args:
        x: 2-D array or sparse matrix supporting ``x[index_array, :]``.
        y: 1-D NumPy array with one entry per row of ``x``.

    Returns:
        Tuple ``(x_shuffled, y_shuffled)``.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same number of rows.
    """
    n_rows = x.shape[0]
    if n_rows != y.shape[0]:
        # Previously two independent shuffles were drawn in this case and the
        # pairs came back silently misaligned.
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (n_rows, y.shape[0])
        )
    order = np.random.permutation(n_rows)
    # Index by the permutation rather than shuffling in place:
    # shuffling a sparse matrix is weird
    return x[order, :], y[order]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1', 'auc']):
    """Score predictions ``y_hat`` against the true labels ``y``.

    Args:
        y: true labels.
        y_hat: predicted labels, passed unchanged to every scorer.
        metrics: names of the scores to compute. Recognised names are
            'accuracy' (alias 'acc'), 'precision', 'recall', 'f1' and 'auc';
            anything else is ignored.

    Returns:
        List of the requested scores, always ordered accuracy, precision,
        recall, f1, auc regardless of the order given in ``metrics``.

    Note:
        F1 is computed with ``average='weighted'`` whereas precision and
        recall use the scorer defaults, and the AUC is computed from
        ``y_hat`` exactly as given.
    """
    # (accepted names, deferred scorer) in the fixed output order.
    scorers = (
        (('accuracy', 'acc'), lambda: skmetrics.accuracy_score(y, y_hat)),
        (('precision',), lambda: skmetrics.precision_score(y, y_hat)),
        (('recall',), lambda: skmetrics.recall_score(y, y_hat)),
        (('f1',), lambda: skmetrics.f1_score(y, y_hat, average='weighted')),
        (('auc',), lambda: skmetrics.roc_auc_score(y, y_hat)),
    )
    return [
        compute()
        for names, compute in scorers
        if any(name in metrics for name in names)
    ]

In [3]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw texts into a TF-IDF matrix, tokenising with NLTK's
# TweetTokenizer instead of the vectorizer's built-in tokenizer.
tokenizer = TweetTokenizer()
tfidf_options = dict(
    tokenizer=tokenizer.tokenize,
    strip_accents='unicode',
    lowercase=True,
    sublinear_tf=True,
    min_df=9,
    stop_words='english',
)
vectorizer = TfidfVectorizer(**tfidf_options)
# x is the feature matrix read by the cross-validation helper further down.
x = vectorizer.fit_transform(x_raw)

In [13]:
from sklearn.model_selection import KFold

def test(model, k=10):
    """Cross-validate ``model`` on the module-level ``x`` / ``y`` and print scores.

    For each of the ``k`` folds produced by ``KFold(n_splits=k)`` the training
    part is shuffled with ``shuffle_together``, ``model`` is fitted on it and
    the held-out part is scored with ``report``. One line is printed per fold
    (accuracy, precision, recall, f1, auc, elapsed seconds), followed by the
    average over all folds.

    Args:
        model: estimator exposing ``fit(x, y)`` (returning the fitted
            estimator) and ``predict(x)``. The same object is refitted on
            every fold.
        k: number of folds; defaults to 10, the value that used to be
            hard-coded.

    Returns:
        None. Results are only printed.
    """
    kf = KFold(n_splits=k)
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1', 'auc', 'time'))
    # Running sums of the five metrics plus the per-fold wall time.
    averages = np.zeros(6)
    for train_index, test_index in kf.split(x):
        start = time()
        # Split based on k-fold
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, y_train = shuffle_together(x_train, y_train)
        clf = model.fit(x_train, y_train)
        y_hat = clf.predict(x_test)
        # The timing covers the split, shuffle, fit, predict and scoring.
        vals = report(y_test, y_hat) + [time() - start]
        averages += vals
        print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs' % tuple(vals))
    print('average:')
    averages /= k
    print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs' % tuple(averages))

# Notes from earlier runs (the second one was never filled in):
# md=3, lr=0.9, n=100 => 0.7452
# md=3, lr=,    n=500 =>
#
# Sweep every (max_depth, learning_rate, n_estimators) combination and
# cross-validate a fresh XGBClassifier for each one.
grid = [
    (md, lr, n)
    for md in [3]
    for lr in [0.05, 0.1, 0.5]
    for n in [500]
]
for md, lr, n in grid:
    print('md=%d, lr=%f, n=%d' % (md, lr,n))
    model = XGBClassifier(max_depth=md, learning_rate=lr, n_estimators=n)
    test(model)
    print()

md=3, lr=0.050000, n=500
acc     p       r       f1      auc     time    
0.7424  0.8070  0.3094  0.7025  0.6359  12.16s
0.7523  0.8528  0.3138  0.7120  0.6433  12.29s
0.7545  0.8599  0.3755  0.7231  0.6707  12.11s
0.7689  0.8133  0.3050  0.7301  0.6373  15.03s
0.7530  0.8043  0.3371  0.7180  0.6482  14.17s
0.7523  0.8343  0.3197  0.7136  0.6440  14.95s
0.7674  0.8614  0.3341  0.7309  0.6542  12.70s
0.7379  0.8261  0.2942  0.6944  0.6311  13.76s
0.7477  0.8084  0.3089  0.7080  0.6364  14.45s
0.7553  0.8222  0.3364  0.7197  0.6501  13.37s
average:
0.7532  0.8290  0.3234  0.7152  0.6451  13.50s

md=3, lr=0.100000, n=500
acc     p       r       f1      auc     time    
0.7492  0.7664  0.3677  0.7198  0.6554  12.61s
0.7621  0.8107  0.3770  0.7327  0.6664  12.57s
0.7644  0.8432  0.4198  0.7393  0.6882  13.30s
0.7704  0.7637  0.3475  0.7395  0.6505  12.54s
0.7636  0.8088  0.3759  0.7342  0.6659  12.95s
0.7606  0.8069  0.3696  0.7303  0.6627  12.40s
0.7711  0.8272  0.3692  0.7407  0.6662  12.