In [1]:
from xgboost import XGBClassifier

import csv
from time import time

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn import metrics as skmetrics

olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'

np.random.seed(1234) # help reproducibility

# Read the tab-separated training file. Column index 1 is used as the raw text
# and column index 2 as the label:
# y == 0 if not offensive (label is exactly 'NOT')
# y == 1 if offensive (any other label)
start = time()
# newline='' lets the csv module do its own line-ending handling, as its
# documentation recommends for files handed to csv.reader.
with open(olid_data, encoding='utf-8', newline='') as f:
    raw = csv.reader(f, delimiter='\t')
    # Skip the header row up front rather than labelling it and slicing it off.
    next(raw, None)
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])
        y.append(0 if r[2] == 'NOT' else 1)
y = np.array(y)
# NOTE(review): this used to be built from f.readlines() *after* the csv reader
# had already consumed the whole file, so it was always an empty list. The name
# is kept (still empty) in case other cells reference it; if a bad-word lexicon
# is really wanted it has to be read from its own file -- TODO confirm source.
bad_words = []
# NOTE(review): csv's default quoting treats a field that starts with '"' as a
# quoted field; if the texts can start with a double quote, consider
# quoting=csv.QUOTE_NONE and re-check the row count -- TODO confirm.
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.11s


In [2]:
def shuffle_together(x, y):
    """Shuffle the rows of ``x`` and the entries of ``y`` with one shared permutation.

    A single random ordering is drawn from the global NumPy RNG and applied to
    both inputs, so row ``j`` of the returned ``x`` still belongs to entry ``j``
    of the returned ``y``. The inputs themselves are not modified.

    Args:
        x: 2-D container supporting ``x.shape[0]`` and ``x[index_array, :]``
            (in this notebook, the sparse matrix produced by the vectorizer).
        y: 1-D array supporting ``y.shape[0]`` and ``y[index_array]``.

    Returns:
        Tuple ``(x_shuffled, y_shuffled)``.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same number of rows;
            shuffling them "together" would silently misalign the pairs.
    """
    n_rows = x.shape[0]
    if y.shape[0] != n_rows:
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (n_rows, y.shape[0])
        )
    # One permutation for both inputs. This draws exactly what the previous
    # "save RNG state / shuffle / restore / shuffle again" dance produced, and
    # leaves the global RNG in the same state afterwards.
    order = np.arange(n_rows)
    np.random.shuffle(order)
    # Row-index with the permutation instead of shuffling x in place, since x
    # may be a sparse matrix.
    return x[order, :], y[order]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1-weighted', 'f1-macro']):
    """Score predictions ``y_hat`` against labels ``y`` for the requested metrics.

    Args:
        y: true labels.
        y_hat: predicted labels.
        metrics: names of the scores to compute. Recognised names are
            'accuracy' (alias 'acc'), 'precision', 'recall', 'f1-weighted'
            and 'f1-macro'; unrecognised names are ignored.

    Returns:
        A list of scores. The order is always accuracy, precision, recall,
        weighted F1, macro F1 (skipping those not requested), regardless of
        the order of ``metrics``.
    """
    # (accepted names, scorer) in output order. The lambdas defer each
    # computation so that only requested scores are evaluated.
    scorers = (
        (('accuracy', 'acc'), lambda: skmetrics.accuracy_score(y, y_hat)),
        (('precision',), lambda: skmetrics.precision_score(y, y_hat)),
        (('recall',), lambda: skmetrics.recall_score(y, y_hat)),
        (('f1-weighted',), lambda: skmetrics.f1_score(y, y_hat, average='weighted')),
        (('f1-macro',), lambda: skmetrics.f1_score(y, y_hat, average='macro')),
    )
    return [
        compute()
        for names, compute in scorers
        if any(name in metrics for name in names)
    ]

In [3]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw texts into a TF-IDF document-term matrix, splitting text with
# NLTK's TweetTokenizer rather than the vectorizer's built-in tokenisation.
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    # Text normalisation.
    lowercase=True,
    strip_accents='unicode',
    # Vocabulary pruning: built-in English stop list, plus a minimum
    # document-frequency threshold of 9.
    stop_words='english',
    min_df=9,
    # Sub-linear term-frequency scaling.
    sublinear_tf=True,
)
# The vocabulary is fitted on the full x_raw corpus here.
x = vectorizer.fit_transform(x_raw)

In [4]:
from sklearn.model_selection import KFold
from xgboost import XGBClassifier

def test(model, k=10):
    """Cross-validate ``model`` on the module-level ``x`` / ``y`` and print the scores.

    The data is split with ``KFold(n_splits=k)``. For each fold the training
    part is shuffled with ``shuffle_together``, ``model`` is fitted on it, and
    the held-out part is scored with ``report``. One row is printed per fold
    (accuracy, precision, recall, weighted F1, macro F1, wall-clock seconds),
    followed by the average over all folds.

    Args:
        model: estimator exposing ``fit(x, y)`` (returning the fitted
            classifier) whose result exposes ``predict(x)``.
        k: number of folds. Defaults to 10.

    Returns:
        NumPy array of length 6 holding the per-fold averages, in the same
        order as the printed columns.
    """
    # Tuning notes from earlier runs (MD / LR / N appear to be max_depth /
    # learning_rate / n_estimators -- TODO confirm):
    #
    # MD  LR    N     F1-w    F1-m
    # 3   0.90  100   0.7452
    # 2   0.53  500   0.7510         # explored
    # 2   0.40  1000  0.7513  0.7075 # explored
    # 1   0.63  500   0.7514  0.7047 # explored
    # 1   0.63  1000  0.7547  0.7095 # explored
    # 1    .39  2000           .7086 # explored
    row_format = '%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs'
    kf = KFold(n_splits=k)
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1-w', 'f1-m', 'time'))
    averages = np.zeros(6, dtype='float')
    for train_index, test_index in kf.split(x):
        start = time()
        # Split based on k-fold
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, y_train = shuffle_together(x_train, y_train)
        clf = model.fit(x_train, y_train)
        y_hat = clf.predict(x_test)
        # Five metric scores followed by the elapsed time for this fold.
        vals = report(y_test, y_hat) + [time() - start]
        averages += vals
        print(row_format % tuple(vals))
    print('average:')
    averages /= k
    print(row_format % tuple(averages))
    return averages

# Hyper-parameter sweep: cross-validate one XGBClassifier per combination of
# max depth, learning rate and number of estimators.
max_depths = [1]
learning_rates = [step / 100 for step in range(31, 61)]  # 0.31 through 0.60
estimator_counts = [2000]

# Flattened grid; iteration order matches nested loops over depth, then
# learning rate, then estimator count.
grid = (
    (depth, rate, count)
    for depth in max_depths
    for rate in learning_rates
    for count in estimator_counts
)
for md, lr, n in grid:
    print('md=%d, lr=%f, n=%d' % (md, lr, n))
    model = XGBClassifier(max_depth=md, learning_rate=lr, n_estimators=n)
    test(model)
    print()

md=1, lr=0.310000, n=2000
acc     p       r       f1-w    f1-m    time    
0.7591  0.7292  0.4529  0.7415  0.6965  25.15s
0.7659  0.7491  0.4515  0.7475  0.7017  25.54s
0.7779  0.8169  0.4895  0.7613  0.7283  25.23s
0.7810  0.7148  0.4575  0.7648  0.7062  25.04s
0.7711  0.7519  0.4624  0.7538  0.7082  24.95s
0.7613  0.7323  0.4467  0.7430  0.6959  24.81s
0.7727  0.7378  0.4603  0.7557  0.7064  24.72s
0.7576  0.7435  0.4425  0.7383  0.6941  25.30s
0.7666  0.7222  0.4760  0.7517  0.7066  25.22s
0.7606  0.7085  0.4750  0.7460  0.7015  25.59s
average:
0.7674  0.7406  0.4614  0.7504  0.7045  25.16s

md=1, lr=0.320000, n=2000
acc     p       r       f1-w    f1-m    time    
0.7606  0.7312  0.4574  0.7434  0.6990  25.54s
0.7659  0.7454  0.4560  0.7481  0.7028  25.41s
0.7779  0.8125  0.4937  0.7618  0.7291  25.26s
0.7787  0.7034  0.4625  0.7635  0.7052  25.19s
0.7696  0.7428  0.4670  0.7531  0.7078  25.60s
0.7636  0.7370  0.4512  0.7456  0.6991  25.17s
0.7779  0.7481  0.4720  0.7618  0.7140  2

0.7659  0.7483  0.4735  0.7497  0.7088  26.23s
0.7644  0.7023  0.4966  0.7521  0.7089  25.45s
0.7613  0.6987  0.4955  0.7491  0.7066  25.64s
average:
0.7654  0.7209  0.4799  0.7509  0.7068  25.93s

md=1, lr=0.450000, n=2000
acc     p       r       f1-w    f1-m    time    
0.7568  0.7039  0.4798  0.7429  0.7005  25.53s
0.7606  0.7143  0.4740  0.7457  0.7020  25.30s
0.7749  0.7973  0.4979  0.7596  0.7272  25.60s
0.7711  0.6714  0.4750  0.7584  0.7011  25.44s
0.7651  0.7270  0.4670  0.7491  0.7036  25.06s
0.7659  0.7220  0.4830  0.7516  0.7083  25.46s
0.7704  0.7263  0.4650  0.7543  0.7054  25.89s
0.7628  0.7430  0.4668  0.7462  0.7046  25.79s
0.7659  0.7055  0.4989  0.7537  0.7107  25.30s
0.7591  0.6958  0.4886  0.7463  0.7031  25.37s
average:
0.7653  0.7206  0.4796  0.7508  0.7066  25.47s

md=1, lr=0.460000, n=2000
acc     p       r       f1-w    f1-m    time    
0.7636  0.7195  0.4888  0.7499  0.7086  25.41s
0.7576  0.7103  0.4650  0.7419  0.6972  25.36s
0.7742  0.8007  0.4916  0.7581 

0.7681  0.7720  0.5000  0.7537  0.7212  25.90s
0.7711  0.6667  0.4850  0.7595  0.7033  26.06s
0.7651  0.7162  0.4829  0.7510  0.7071  25.25s
0.7681  0.7248  0.4898  0.7544  0.7119  25.84s
0.7621  0.6840  0.4907  0.7500  0.7034  25.43s
0.7553  0.7119  0.4757  0.7406  0.6996  25.38s
0.7644  0.6923  0.5149  0.7540  0.7126  24.95s
0.7591  0.6764  0.5273  0.7504  0.7108  25.49s
average:
0.7608  0.6998  0.4913  0.7482  0.7050  25.53s

md=1, lr=0.590000, n=2000
acc     p       r       f1-w    f1-m    time    
0.7515  0.6893  0.4776  0.7380  0.6952  25.44s
0.7545  0.6916  0.4808  0.7412  0.6980  25.37s
0.7689  0.7745  0.5000  0.7544  0.7219  25.01s
0.7704  0.6633  0.4875  0.7591  0.7032  25.71s
0.7674  0.7251  0.4806  0.7528  0.7087  25.33s
0.7621  0.7059  0.4898  0.7490  0.7063  25.71s
0.7636  0.6885  0.4907  0.7514  0.7048  25.75s
0.7576  0.7162  0.4801  0.7432  0.7026  25.17s
0.7613  0.6897  0.5034  0.7501  0.7075  25.86s
0.7560  0.6736  0.5159  0.7466  0.7058  26.19s
average:
0.7613  0.701