In [2]:
from xgboost import XGBClassifier

import csv
from time import time

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn import metrics as skmetrics

olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'

np.random.seed(1234) # help reproducibility

# Load the tab-separated training file: column 1 of each row is kept as the
# raw text and column 2 is turned into the binary label.
# y == 0 if not offensive (label column equals 'NOT')
# y == 1 if offensive (any other label value)
start = time()
# newline='' lets the csv module do its own line-ending handling, as its
# documentation recommends for file objects passed to csv.reader.
with open(olid_data, encoding='utf-8', newline='') as f:
    # NOTE(review): the reader uses the default quoting rules, so a field that
    # starts with '"' is parsed as a quoted field. If the TSV is not CSV-quoted,
    # quoting=csv.QUOTE_NONE may be needed -- confirm against the expected row count.
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row instead of parsing it and slicing it off later
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])
        y.append(0 if r[2] == 'NOT' else 1)
y = np.array(y)
# This cell used to build `bad_words` from f.readlines() after the csv reader
# had already consumed the whole file, which always produced an empty list.
# The name is kept (empty) so that any later cell referring to it still runs.
bad_words = []
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.04s


In [3]:
def shuffle_together(x, y):
    """Shuffle the rows of ``x`` and the entries of ``y`` with one shared permutation.

    A single index array is shuffled with the global NumPy RNG and applied to
    both inputs, so row ``j`` of the returned ``x`` still belongs to entry
    ``j`` of the returned ``y``. The global RNG is advanced by exactly one
    ``np.random.shuffle`` call on ``x.shape[0]`` indices.

    Args:
        x: 2-D array-like supporting ``x[indices, :]`` row selection
            (works for NumPy arrays and for sparse matrices).
        y: 1-D array with one entry per row of ``x``.

    Returns:
        Tuple ``(x_shuffled, y_shuffled)``.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same number of rows;
            a shared permutation would be meaningless in that case.
    """
    n_rows = x.shape[0]
    if y.shape[0] != n_rows:
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (n_rows, y.shape[0])
        )
    order = np.arange(n_rows)
    np.random.shuffle(order)
    # Index with an explicit row selector so sparse matrices are handled too.
    return x[order, :], y[order]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1-weighted', 'f1-macro']):
    """Compute the requested classification scores for predictions ``y_hat``.

    Args:
        y: true labels.
        y_hat: predicted labels.
        metrics: container of metric names; recognised names are
            'accuracy' (alias 'acc'), 'precision', 'recall', 'f1-weighted'
            and 'f1-macro'. Unrecognised names are ignored.

    Returns:
        A list of scores. The order is always accuracy, precision, recall,
        weighted F1, macro F1 (skipping those not requested), regardless of
        the order of ``metrics``.
    """
    # (accepted names, scorer) pairs in output order. The scorers are wrapped
    # in lambdas so that nothing is computed unless the metric was requested.
    scorers = (
        (('accuracy', 'acc'), lambda: skmetrics.accuracy_score(y, y_hat)),
        (('precision',), lambda: skmetrics.precision_score(y, y_hat)),
        (('recall',), lambda: skmetrics.recall_score(y, y_hat)),
        (('f1-weighted',), lambda: skmetrics.f1_score(y, y_hat, average='weighted')),
        (('f1-macro',), lambda: skmetrics.f1_score(y, y_hat, average='macro')),
    )
    return [
        compute()
        for names, compute in scorers
        if any(name in metrics for name in names)
    ]

In [4]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenisation is delegated to NLTK's TweetTokenizer; its bound `tokenize`
# method is handed to the vectorizer as the tokenizer callable.
tokenizer = TweetTokenizer()

vectorizer = TfidfVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    tokenizer=tokenizer.tokenize,
    min_df=9,           # ignore terms found in fewer than 9 documents
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term frequency
)

# Fit the vocabulary / IDF weights on the raw texts and build the feature matrix.
x = vectorizer.fit_transform(x_raw)

In [None]:
from sklearn.model_selection import KFold
from xgboost import XGBClassifier

def test(model, k=10):
    """Cross-validate ``model`` with k-fold splits and print per-fold and mean scores.

    Reads the module-level feature matrix ``x`` and label array ``y``. For each
    fold the training split is shuffled with ``shuffle_together``, ``model`` is
    fitted on it and then used to predict the held-out split. The scores from
    ``report`` (called with its default metrics) and the wall-clock time of the
    fold are printed as one row; after the last fold the column averages are
    printed.

    Args:
        model: estimator whose ``fit(x, y)`` returns an object exposing
            ``predict(x)``. The same instance is fitted again on every fold.
        k: number of folds. Defaults to 10, the value that was previously
            hard-coded.

    Returns:
        None. Results are only printed.
    """
    row_format = '%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs'
    kf = KFold(n_splits=k)
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1-w', 'f1-m', 'time'))
    averages = np.zeros(6, dtype='float')
    for train_index, test_index in kf.split(x):
        start = time()
        # Split based on k-fold
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, y_train = shuffle_together(x_train, y_train)
        clf = model.fit(x_train, y_train)
        y_hat = clf.predict(x_test)
        # Five scores followed by the elapsed time (fit + predict + scoring).
        vals = report(y_test, y_hat) + [time() - start]
        averages += vals
        print(row_format % tuple(vals))
    print('average:')
    averages /= k
    print(row_format % tuple(averages))

# Results noted from earlier runs. The columns appear to be max_depth,
# learning_rate, n_estimators and the averaged weighted / macro F1 -- confirm.
#
# MD  LR    N     F1-w    F1-m
# 3   0.90  100   0.7452
# 2   0.53  500   0.7510         # explored
# 2   0.40  1000  0.7513  0.7075 # explored
# 1   0.63  500   0.7514  0.7047 # explored
# 1   0.63  1000  0.7547  0.7095 # explored
# 1   0.4   2000          0.7062

# Sweep over (max_depth, learning_rate, n_estimators) combinations and
# cross-validate a fresh XGBClassifier for each one. The learning rate runs
# from 0.31 to 0.60 in steps of 0.01; depth and tree count are fixed.
param_grid = (
    (md, step / 100, n)
    for md in [1]
    for step in range(31, 61)
    for n in [2000]
)
for md, lr, n in param_grid:
    print('md=%d, lr=%f, n=%d' % (md, lr,n))
    model = XGBClassifier(max_depth=md, learning_rate=lr, n_estimators=n)
    test(model)
    print()

md=1, lr=0.310000, n=2000
acc     p       r       f1-w    f1-m    time    
0.7591  0.7292  0.4529  0.7415  0.6965  10.27s
0.7659  0.7491  0.4515  0.7475  0.7017  10.13s
0.7779  0.8169  0.4895  0.7613  0.7283  10.26s
0.7810  0.7148  0.4575  0.7648  0.7062  10.20s
0.7711  0.7519  0.4624  0.7538  0.7082  10.14s
0.7613  0.7323  0.4467  0.7430  0.6959  10.17s
0.7727  0.7378  0.4603  0.7557  0.7064  10.17s
0.7576  0.7435  0.4425  0.7383  0.6941  10.21s
0.7666  0.7222  0.4760  0.7517  0.7066  10.17s
0.7606  0.7085  0.4750  0.7460  0.7015  10.17s
average:
0.7674  0.7406  0.4614  0.7504  0.7045  10.19s

md=1, lr=0.320000, n=2000
acc     p       r       f1-w    f1-m    time    
0.7606  0.7312  0.4574  0.7434  0.6990  10.19s
0.7659  0.7454  0.4560  0.7481  0.7028  10.21s
0.7779  0.8125  0.4937  0.7618  0.7291  10.18s
0.7787  0.7034  0.4625  0.7635  0.7052  10.15s
0.7696  0.7428  0.4670  0.7531  0.7078  10.24s
0.7636  0.7370  0.4512  0.7456  0.6991  10.16s
0.7779  0.7481  0.4720  0.7618  0.7140  1