In [3]:
from xgboost import XGBClassifier

import csv
from time import time

import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
from sklearn import metrics as skmetrics

olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'

np.random.seed(1234)  # seed NumPy's global RNG so later shuffles are reproducible

# Label encoding used for y:
#   y == 0 if not offensive (column 2 is exactly 'NOT')
#   y == 1 if offensive     (any other value in column 2)
start = time()
# newline='' is what the csv module asks for so the reader handles line endings
# itself. The explicit encoding avoids depending on the platform default.
# NOTE(review): assumes the OLID TSV is UTF-8 encoded - confirm.
with open(olid_data, newline='', encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row instead of appending it and slicing it off
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])                   # column 1: tweet text
        y.append(0 if r[2] == 'NOT' else 1)  # column 2: label
    y = np.array(y)

# The previous code built bad_words from f.readlines() *after* csv.reader had
# already consumed the whole file, so the result was always an empty list (and
# it was reading the tweet file, not a word list). Keep the name defined with
# that same value so later cells still run.
# TODO: load the real bad-word list from its own file if it is needed.
bad_words = []
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.09s


In [4]:
def shuffle_together(x, y):
    """Shuffle the rows of ``x`` and the entries of ``y`` with one shared permutation.

    A single permutation is drawn from NumPy's global RNG and applied to both
    inputs, so row ``j`` of the returned ``x`` still corresponds to entry ``j``
    of the returned ``y``.

    Parameters
    ----------
    x : 2-D array-like or sparse matrix
        Indexed as ``x[order, :]``; must expose ``shape``.
    y : array-like
        Indexed as ``y[order]``; must expose ``shape``.

    Returns
    -------
    tuple
        ``(x_shuffled, y_shuffled)``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same number of rows. Previously this
        case silently produced two unrelated shuffles.
    """
    n_rows = x.shape[0]
    if y.shape[0] != n_rows:
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (n_rows, y.shape[0])
        )
    # One draw replaces the old "shuffle, restore RNG state, shuffle again" trick;
    # the global RNG is advanced by exactly one shuffle, as before.
    order = np.random.permutation(n_rows)
    return x[order, :], y[order]  # fancy row indexing also works for sparse matrices

def report(y, y_hat, metrics=('accuracy', 'precision', 'recall', 'f1', 'auc')):
    """Compute the requested classification metrics for one set of predictions.

    Parameters
    ----------
    y : array-like
        True labels.
    y_hat : array-like
        Predicted values, passed unchanged to every scorer.
    metrics : iterable of str
        Names of the metrics to compute. Recognised names are 'accuracy'
        (alias 'acc'), 'precision', 'recall', 'f1' and 'auc'. Unknown names
        are ignored. The default is an immutable tuple rather than a shared
        mutable list.

    Returns
    -------
    list
        The selected scores, always in the fixed order accuracy, precision,
        recall, f1, auc - regardless of the order of ``metrics``.

    Notes
    -----
    * F1 is macro-averaged, while precision and recall use the scorer defaults.
    * NOTE(review): 'auc' is computed from ``y_hat`` as given; callers in this
      file pass the output of ``predict`` (hard labels), not scores or
      probabilities - confirm that is intended.
    """
    results = []
    if 'accuracy' in metrics or 'acc' in metrics:
        results.append(skmetrics.accuracy_score(y, y_hat))
    if 'precision' in metrics:
        results.append(skmetrics.precision_score(y, y_hat))
    if 'recall' in metrics:
        results.append(skmetrics.recall_score(y, y_hat))
    if 'f1' in metrics:
        results.append(skmetrics.f1_score(y, y_hat, average='macro'))
    if 'auc' in metrics:
        results.append(skmetrics.roc_auc_score(y, y_hat))
    return results

In [5]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw tweets into a TF-IDF matrix, splitting text with NLTK's
# TweetTokenizer instead of the vectorizer's built-in token pattern.
# NOTE(review): the vectorizer is fitted on every tweet in x_raw, i.e. before
# test() splits x into folds, so vocabulary/IDF statistics are derived from the
# held-out folds as well - confirm this is acceptable for the reported scores.
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    min_df=9,
    sublinear_tf=True,
)
x = vectorizer.fit_transform(x_raw)

In [6]:
from sklearn.model_selection import KFold

def test(model, k=10):
    """Cross-validate ``model`` on the module-level ``x`` / ``y`` and print the scores.

    For each of the ``k`` folds produced by ``KFold(n_splits=k)`` (no shuffle
    argument is passed), the training part is shuffled with
    ``shuffle_together``, the model is fitted, and predictions on the held-out
    part are scored with ``report``. One row is printed per fold (accuracy,
    precision, recall, f1, auc, elapsed seconds), followed by the mean of
    each column.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(x, y)``; the object returned by ``fit`` must
        expose ``predict(x)``. The same instance is fitted again on every fold.
    k : int, optional
        Number of folds. Defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    None
        Results are only printed.
    """
    row_fmt = '%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs'
    kf = KFold(n_splits=k)
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1', 'auc', 'time'))
    averages = np.zeros(6, dtype='float')  # running column sums, divided by k at the end
    for train_index, test_index in kf.split(x):
        start = time()  # the timing covers shuffle + fit + predict + scoring
        # Split based on k-fold
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, y_train = shuffle_together(x_train, y_train)
        clf = model.fit(x_train, y_train)
        y_hat = clf.predict(x_test)
        vals = report(y_test, y_hat) + [time() - start]
        averages += vals
        print(row_fmt % tuple(vals))
    print('average:')
    averages /= k
    print(row_fmt % tuple(averages))

# Hyper-parameter sweep over max_depth (md), learning_rate (lr) and
# n_estimators (n). Scores noted from earlier runs (metric not stated):
#   md=3, lr=0.9, n=100 => 0.7452
#   md=3, lr=?,   n=500 => (not recorded)
for md, lr, n in ((d, r, t) for d in [3] for r in [0.9] for t in [100]):
    print('md=%d, lr=%f, n=%d' % (md, lr, n))
    model = XGBClassifier(n_estimators=n, learning_rate=lr, max_depth=md)
    test(model)
    print()

md=3, lr=0.900000, n=100
acc     p       r       f1      auc     time    
0.7515  0.6906  0.4753  0.6947  0.6836  2.97s
0.7621  0.7207  0.4718  0.7029  0.6899  2.97s
0.7598  0.7453  0.5000  0.7136  0.7024  2.89s
0.7636  0.6582  0.4525  0.6888  0.6754  2.98s
0.7621  0.7183  0.4647  0.7003  0.6871  3.03s
0.7591  0.6943  0.4943  0.7045  0.6928  3.04s
0.7644  0.7042  0.4673  0.7003  0.6868  3.07s
0.7492  0.7041  0.4580  0.6902  0.6791  2.86s
0.7689  0.7162  0.4966  0.7131  0.6998  3.13s
0.7515  0.6708  0.4955  0.6976  0.6872  2.98s
average:
0.7592  0.7023  0.4776  0.7006  0.6884  2.99s

