In [3]:
from xgboost import XGBClassifier

import csv
from time import time

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn import metrics as skmetrics

olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'

np.random.seed(1234) # help reproducibility

# Load the tab-separated training file into:
#   x_raw -- list of the second column of every data row (the text to classify)
#   y     -- numpy array of labels derived from the third column:
#            y == 0 if not offensive (column value is exactly 'NOT')
#            y == 1 if offensive (any other value)
start = time()
with open(olid_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    # Skip the header row up front instead of parsing it and slicing it off
    # afterwards; the default keeps an empty file from raising StopIteration.
    next(raw, None)
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])
        y.append(0 if r[2] == 'NOT' else 1)
y = np.array(y)
# NOTE(review): the previous code built bad_words from f.readlines() *after*
# the csv reader had already consumed the whole file, so it was always an
# empty list. The name is kept (and kept empty) for compatibility; load the
# word list from its own file if it is actually needed.
bad_words = []
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.04s


In [4]:
def shuffle_together(x, y):
    """Shuffle the rows of `x` and the entries of `y` with the same permutation.

    A single random permutation of row indices is drawn from NumPy's global
    random state and applied to both inputs, so row i of the returned matrix
    still corresponds to entry i of the returned labels.

    Args:
        x: 2-D indexable with a `.shape` attribute (dense array or sparse
            matrix supporting `x[index_array, :]`).
        y: 1-D array with a `.shape` attribute and `y.shape[0] == x.shape[0]`.

    Returns:
        Tuple `(x_shuffled, y_shuffled)`.

    Raises:
        ValueError: if `x` and `y` do not have the same number of rows.
            Previously this case silently returned mismatched pairs.
    """
    n_rows = x.shape[0]
    if y.shape[0] != n_rows:
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (n_rows, y.shape[0])
        )
    # One permutation reused for both inputs; this consumes exactly the same
    # random numbers as shuffling two index arrays from a saved/restored state.
    order = np.arange(n_rows)
    np.random.shuffle(order)
    # Index with the permutation rather than shuffling in place:
    # shuffling a sparse matrix is weird
    return x[order, :], y[order]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1', 'auc']):
    """Compute the requested classification scores for predictions `y_hat`.

    Scores are always returned in the fixed order accuracy, precision,
    recall, f1, auc -- regardless of the order of names in `metrics` --
    and only for the names that are present in `metrics`.

    Args:
        y: true labels.
        y_hat: predicted labels, passed unchanged to every scorer
            (including the ROC AUC scorer).
        metrics: container of metric names to compute. 'acc' is accepted
            as an alias for 'accuracy'.

    Returns:
        List of scores, one per requested metric. Precision and recall use
        the scorer's default arguments; f1 is computed with average='macro'.
    """
    # (accepted names, deferred scorer) in output order. The lambdas defer
    # the sklearn call until the metric is known to be requested.
    scorers = (
        (('accuracy', 'acc'), lambda: skmetrics.accuracy_score(y, y_hat)),
        (('precision',), lambda: skmetrics.precision_score(y, y_hat)),
        (('recall',), lambda: skmetrics.recall_score(y, y_hat)),
        (('f1',), lambda: skmetrics.f1_score(y, y_hat, average='macro')),
        (('auc',), lambda: skmetrics.roc_auc_score(y, y_hat)),
    )
    return [
        score()
        for names, score in scorers
        if any(name in metrics for name in names)
    ]

In [7]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF feature matrix `x` from the raw texts in `x_raw`,
# splitting each text into tokens with NLTK's TweetTokenizer.
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(
    # tokenisation / text normalisation
    tokenizer=tokenizer.tokenize,
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    # term filtering and weighting
    min_df=9,
    sublinear_tf=True,
)
x = vectorizer.fit_transform(x_raw)

In [12]:
from sklearn.model_selection import KFold
from xgboost import XGBClassifier

def test(model, k=10):
    """Cross-validate `model` on the module-level `x` / `y` and print the scores.

    The data is split into `k` consecutive folds with `KFold`. For every fold
    the training part is shuffled with `shuffle_together`, the model is fitted
    on it, and the scores returned by `report` plus the elapsed wall-clock
    time are printed as one row. A final row holds the per-column averages.

    Args:
        model: estimator exposing `fit(x, y)` (returning the fitted
            estimator) and `predict(x)`.
        k: number of folds (default 10, the value that used to be hard-coded).

    Returns:
        None. Results are written to stdout only.
    """
    columns = ('acc', 'p', 'r', 'f1', 'auc', 'time')
    row_format = '%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs'
    kf = KFold(n_splits=k)
    print(len(columns) * '%-8s' % columns)
    averages = np.zeros(len(columns), dtype='float')
    for train_index, test_index in kf.split(x):
        start = time()
        # Split based on k-fold
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, y_train = shuffle_together(x_train, y_train)
        clf = model.fit(x_train, y_train)
        y_hat = clf.predict(x_test)
        # Fold time covers splitting, shuffling, fitting, predicting and scoring.
        vals = report(y_test, y_hat) + [time() - start]
        averages += vals
        print(row_format % tuple(vals))
    print('average:')
    averages /= k
    print(row_format % tuple(averages))

    # Tuning notes kept from earlier runs (previously a stray string literal
    # inside this function). Columns are presumably max_depth, learning_rate,
    # n_estimators and the averaged F1 -- TODO confirm.
    #
    # MD  LR    N     F1
    # 3   0.90  100   0.7452
    #
    # 2   0.53  500   0.7510 # explored
    # 2   0.40  1000  0.7513
    # 1   0.63  500   0.7514 # explored
    # 1   0.63  1000  0.7547 # explored

# Sweep learning rates 0.30 .. 0.50 in steps of 0.01 for a single max depth
# and a single number of estimators, cross-validating each configuration.
for md, lr, n in [(depth, step / 100, trees)
                  for depth in [2]
                  for step in range(30, 51)
                  for trees in [1000]]:
    print('md=%d, lr=%f, n=%d' % (md, lr,n))
    model = XGBClassifier(max_depth=md, learning_rate=lr, n_estimators=n)
    test(model)
    print()

md=2, lr=0.400000, n=500
acc     p       r       f1      auc     time    
0.7583  0.7351  0.4417  0.7393  0.6804  4.60s
0.7711  0.7734  0.4470  0.7515  0.6906  4.61s
0.7666  0.7978  0.4662  0.7481  0.7002  4.82s
0.7779  0.7208  0.4325  0.7590  0.6800  5.07s
0.7719  0.7665  0.4487  0.7527  0.6905  4.56s
0.7711  0.7614  0.4558  0.7528  0.6922  4.56s
0.7772  0.7692  0.4439  0.7575  0.6902  4.93s
0.7598  0.7557  0.4381  0.7397  0.6823  5.03s
0.7681  0.7372  0.4622  0.7513  0.6905  5.40s
0.7674  0.7292  0.4773  0.7523  0.6945  5.08s
average:
0.7690  0.7546  0.4513  0.7504  0.6891  4.87s

md=2, lr=0.410000, n=500
acc     p       r       f1      auc     time    
0.7545  0.7336  0.4260  0.7338  0.6737  5.07s
0.7696  0.7594  0.4560  0.7514  0.6917  5.08s
0.7749  0.8165  0.4789  0.7572  0.7095  5.07s
0.7787  0.7149  0.4450  0.7613  0.6841  5.05s
0.7696  0.7617  0.4442  0.7501  0.6876  5.01s
0.7560  0.7252  0.4308  0.7362  0.6746  4.93s
0.7727  0.7452  0.4509  0.7545  0.6886  4.98s
0.7583  0.7619

0.7530  0.7059  0.4574  0.7368  0.6803  4.49s
0.7659  0.7333  0.4718  0.7501  0.6928  4.45s
0.7666  0.7759  0.4895  0.7511  0.7053  4.42s
0.7742  0.6863  0.4650  0.7598  0.6865  4.47s
0.7636  0.7283  0.4579  0.7466  0.6866  4.44s
0.7628  0.7260  0.4626  0.7464  0.6877  4.42s
0.7734  0.7270  0.4790  0.7587  0.6965  4.46s
0.7545  0.7309  0.4447  0.7360  0.6799  4.43s
0.7689  0.7266  0.4805  0.7542  0.6957  5.17s
0.7538  0.6770  0.4955  0.7424  0.6889  4.45s
average:
0.7637  0.7217  0.4704  0.7482  0.6900  4.52s

md=2, lr=0.550000, n=500
acc     p       r       f1      auc     time    
0.7485  0.6969  0.4484  0.7317  0.6747  4.44s
0.7666  0.7393  0.4673  0.7502  0.6922  4.43s
0.7553  0.7622  0.4599  0.7372  0.6900  4.44s
0.7749  0.6875  0.4675  0.7608  0.6878  4.41s
0.7674  0.7298  0.4738  0.7519  0.6934  4.46s
0.7666  0.7374  0.4649  0.7500  0.6911  4.50s
0.7734  0.7336  0.4696  0.7575  0.6941  4.43s
0.7545  0.7361  0.4381  0.7350  0.6783  4.44s
0.7704  0.7254  0.4897  0.7566  0.6992  4.