In [6]:
from xgboost import XGBClassifier

import csv
from time import time

import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
from sklearn import metrics as skmetrics

from google.colab import files

olid_data = 'olid-training-v1.0.tsv'

np.random.seed(1234) # help reproducibility

# Label encoding used throughout the notebook:
#   y == 0 if not offensive
#   y == 1 if offensive
start = time()

# newline='' is what the csv module asks for so that it can do its own
# line-ending handling; the explicit encoding keeps the read independent of
# the platform default.
with open(olid_data, newline='', encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row (None default tolerates an empty file)
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])                    # column 1: text
        y.append(0 if r[2] == 'NOT' else 1)   # column 2: 'NOT' -> 0, anything else -> 1
y = np.array(y)

# The original cell built `bad_words` with f.readlines() on this same handle
# after the csv reader had already consumed it, so the result was always an
# empty list. The name is kept (with that same value) in case later cells
# reference it.
# NOTE(review): if a separate bad-words lexicon file was intended, load it
# here from its own path.
bad_words = []
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.05s


In [0]:
def shuffle_together(x, y):
    """Shuffle the rows of ``x`` and the entries of ``y`` with one shared order.

    A single random permutation is drawn from NumPy's global random state and
    applied to both inputs, so row ``j`` of the returned ``x`` still belongs
    to entry ``j`` of the returned ``y``.

    Parameters:
        x: 2-D indexable with a ``shape`` attribute that supports
           ``x[index_array, :]``.
        y: 1-D array with a ``shape`` attribute that supports
           ``y[index_array]``.

    Returns:
        ``(x_shuffled, y_shuffled)``.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same number of rows.
            (Previously two independent index arrays were shuffled from a
            saved/restored RNG state, which only lines up when the lengths
            are equal and silently mis-paired the data otherwise.)
    """
    n_rows = x.shape[0]
    if y.shape[0] != n_rows:
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (n_rows, y.shape[0]))
    # One in-place shuffle of an index array: consumes the same random draws
    # as the previous implementation, so seeded runs keep the same order.
    order = np.arange(n_rows)
    np.random.shuffle(order)
    return x[order, :], y[order] # index rows explicitly; works for sparse matrices too

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1', 'auc']):
    """Compute the requested classification scores for ``y_hat`` against ``y``.

    Parameters:
        y: true labels, passed as the first argument to each scorer.
        y_hat: predictions, passed as the second argument to each scorer.
        metrics: container of metric names. Recognised names are
            'accuracy' (alias 'acc'), 'precision', 'recall', 'f1' and 'auc';
            anything else is ignored.

    Returns:
        A list of scores in the fixed order accuracy, precision, recall, f1,
        auc, regardless of the order of ``metrics``, containing only the
        requested ones.

    Notes:
        F1 is computed with ``average='weighted'``, whereas precision and
        recall are called with scikit-learn's default averaging, so the F1
        column is not the harmonic mean of the other two.
        AUC is computed from ``y_hat`` exactly as passed in.
    """
    # (accepted names, deferred scorer) pairs, listed in output order. The
    # lambdas delay each sklearn call until the metric is known to be wanted.
    scorers = (
        (('accuracy', 'acc'), lambda: skmetrics.accuracy_score(y, y_hat)),
        (('precision',), lambda: skmetrics.precision_score(y, y_hat)),
        (('recall',), lambda: skmetrics.recall_score(y, y_hat)),
        (('f1',), lambda: skmetrics.f1_score(y, y_hat, average='weighted')),
        (('auc',), lambda: skmetrics.roc_auc_score(y, y_hat)),
    )
    return [
        score()
        for names, score in scorers
        if any(name in metrics for name in names)
    ]

In [0]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw texts into a sparse TF-IDF matrix, tokenising with NLTK's
# TweetTokenizer instead of the vectoriser's built-in token pattern.
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    min_df=9,           # drop terms that occur in fewer than 9 documents
    sublinear_tf=True,  # use 1 + log(tf) rather than raw term counts
)
# NOTE(review): the vectoriser is fitted on every document in x_raw, and the
# k-fold splits in test() are taken from this already-fitted matrix, so the
# vocabulary and IDF weights have seen the held-out folds.
x = vectorizer.fit_transform(x_raw)

In [9]:
from sklearn.model_selection import KFold

def test(model, k=10):
    """Cross-validate ``model`` on the notebook-level ``x`` / ``y`` and print scores.

    For each of the ``k`` folds the training part is shuffled with
    ``shuffle_together``, ``model`` is fitted on it, predictions are made for
    the held-out part and one row of metrics from ``report`` is printed,
    followed by the elapsed time for that fold. After the last fold the
    per-column averages are printed.

    Parameters:
        model: estimator exposing ``fit(x, y)`` (whose return value exposes
            ``predict(x)``). The same instance is re-fitted on every fold.
        k: number of folds passed to ``KFold(n_splits=k)``. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        None; results are only printed.

    Notes:
        Reads the module-level ``x`` (feature matrix) and ``y`` (labels).
        ``KFold`` is created without a shuffle argument; only the training
        part of each fold is shuffled.
    """
    kf = KFold(n_splits=k)
    row_fmt = '%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs'
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1', 'auc', 'time'))
    # Running column sums: five metrics from report() plus the fold time.
    averages = np.zeros(6, dtype='float')
    for train_index, test_index in kf.split(x):
        start = time()
        # Split based on k-fold
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, y_train = shuffle_together(x_train, y_train)
        clf = model.fit(x_train, y_train)
        y_hat = clf.predict(x_test)
        # The timing covers shuffling, fitting, predicting and scoring.
        vals = report(y_test, y_hat) + [time() - start]
        averages += vals
        print(row_fmt % tuple(vals))
    print('average:')
    averages /= k
    print(row_fmt % tuple(averages))

# Notes from earlier runs (the metric is not stated):
#   md=3, lr=0.9, n=100 => 0.7452
#   md=3, lr=,    n=500 =>            (left unfinished)

# Every (max_depth, learning_rate, n_estimators) combination, with max_depth
# varying slowest and n_estimators fastest.
param_grid = [
    (md, lr, n)
    for md in [2, 3]
    for lr in [0.05, 0.1, 0.5, 0.9]
    for n in [500, 1000]
]
for md, lr, n in param_grid:
    print('md=%d, lr=%f, n=%d' % (md, lr, n))
    model = XGBClassifier(max_depth=md, learning_rate=lr, n_estimators=n)
    test(model)
    print()

md=1, lr=0.050000, n=100
acc     p       r       f1      auc     time    
0.7122  0.8824  0.1682  0.6390  0.5784  0.94s
0.7190  0.8989  0.1806  0.6491  0.5852  0.76s
0.7137  0.9279  0.2173  0.6501  0.6039  0.77s
0.7402  0.8256  0.1775  0.6751  0.5806  0.77s
0.7236  0.9011  0.1868  0.6555  0.5883  0.75s
0.7100  0.8800  0.1497  0.6320  0.5697  0.81s
0.7379  0.9451  0.2009  0.6736  0.5977  0.75s
0.7115  0.9268  0.1681  0.6368  0.5806  0.76s
0.7190  0.8652  0.1762  0.6491  0.5813  0.76s
0.7258  0.9231  0.1909  0.6583  0.5915  0.78s
average:
0.7213  0.8976  0.1816  0.6519  0.5857  0.79s

md=1, lr=0.050000, n=500
acc     p       r       f1      auc     time    
0.7221  0.8197  0.2242  0.6644  0.5996  3.61s
0.7356  0.9115  0.2325  0.6781  0.6106  3.60s
0.7349  0.9128  0.2869  0.6870  0.6358  3.68s
0.7470  0.7982  0.2175  0.6919  0.5968  3.62s
0.7447  0.8797  0.2665  0.6952  0.6242  3.69s
0.7387  0.8926  0.2449  0.6843  0.6151  3.59s
0.7492  0.9211  0.2453  0.6953  0.6176  3.59s
0.7251  0.8929