In [1]:
import csv
import itertools
from time import time

import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from scipy.sparse import csr_matrix, vstack, hstack
from xgboost import XGBClassifier

# Input locations, relative to the notebook's working directory.
training_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
test_data = 'data/OLIDv1.0/testset-levela.tsv'
test_labels = 'data/OLIDv1.0/labels-levela.csv'
hashtags = 'data/olid_segmentations.tsv'

np.random.seed(1234) # help reproducibility


def _load_predictions(path):
    """Read one integer prediction per line from `path`.

    Returns an (n, 1) column array so it can be horizontally stacked
    next to a feature matrix.
    """
    with open(path) as f:
        column = np.array([int(line) for line in f])
    return np.expand_dims(column, axis=1)


start = time()

# Training set: column 1 is the tweet text, column 2 the level-A label.
# Labels are encoded as 0 for 'NOT' and 1 for anything else.
with open(training_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row
    x_train_raw = []
    y_train = []
    for r in raw:
        x_train_raw.append(r[1])
        y_train.append(0 if r[2] == 'NOT' else 1)

# Test set: column 0 is the numeric tweet id, column 1 the tweet text.
with open(test_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row
    test_ids = []
    x_test_raw = []
    for r in raw:
        test_ids.append(int(r[0]))
        x_test_raw.append(r[1])

# Test labels: every row is read (no header row is skipped for this file).
with open(test_labels, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter=',')
    y_test = []
    for r in raw:
        y_test.append(0 if r[1] == 'NOT' else 1)

# Predictions saved by other models, one integer per line, loaded as columns.
y_dlr_train = _load_predictions('predictions/david-lr-train.txt')
y_dlr_test = _load_predictions('predictions/david-lr-val.txt')

y_jlr_train = _load_predictions('predictions/jp-lr-train.txt')
y_jlr_test = _load_predictions('predictions/jp-lr-val.txt')

y_svm_train = _load_predictions('predictions/svm-train.txt')
y_svm_test = _load_predictions('predictions/svm-val.txt')

y_xgb_train = _load_predictions('predictions/xgb-train.txt')
y_xgb_test = _load_predictions('predictions/xgb-val.txt')

# Only test-side predictions are loaded for these two models.
y_cnn_test = _load_predictions('predictions/cnn-val.txt')
y_gru_test = _load_predictions('predictions/best-bigru.txt')

# Load hashtag segmentations: first tab-separated field maps to the second,
# both stripped and lower-cased.
segmentations = {}
with open(hashtags, encoding='utf-8') as f:  # `with` closes the file handle
    for line in f:
        terms = [x.strip().lower() for x in line.split('\t')]
        hashtag, segmentation = terms[0], terms[1]
        segmentations[hashtag] = segmentation
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.15s


In [2]:
def shuffle_together(x, y):
    """Reorder the rows of `x` and the entries of `y` with one shared permutation.

    The permutation is drawn from NumPy's global random state, so the result
    depends on any earlier `np.random.seed` call.

    Returns the pair (shuffled x, shuffled y).
    """
    order = np.arange(x.shape[0])
    np.random.shuffle(order)
    # Explicit row indexing (order, :) is used because x may be a sparse matrix.
    shuffled_x = x[order, :]
    shuffled_y = y[order]
    return shuffled_x, shuffled_y

def report(y, y_hat, verbose=False, plot=False, title=None):
    """Score predictions against true labels, optionally printing and plotting.

    Parameters
    ----------
    y : true labels (this notebook encodes 'NOT' as 0 and everything else as 1).
    y_hat : predicted labels, same length as `y`.
    verbose : when True, print a header row and the five scores.
    plot : when True, draw the confusion matrix, save it under 'graphs/'
        as '<title>.png' and show it.
    title : plot title and file stem; 'Confusion matrix' is used when None.

    Returns
    -------
    list of [accuracy, precision, recall, weighted F1, macro F1].
    """
    results = [
        metrics.accuracy_score(y, y_hat),
        metrics.precision_score(y, y_hat),
        metrics.recall_score(y, y_hat),
        metrics.f1_score(y, y_hat, average='weighted'),
        metrics.f1_score(y, y_hat, average='macro')
    ]
    if verbose:
        print(5 * '%-8s' % ('acc', 'p', 'r', 'f1w', 'f1m'))
        print('%.4f  %.4f  %.4f  %.4f  %.4f\n' % tuple(results))

    if plot:
        # A missing title used to crash on the string concatenation below.
        plot_title = 'Confusion matrix' if title is None else title
        cm = metrics.confusion_matrix(y, y_hat)
        np.set_printoptions(precision=2)
        plt.figure()
        # confusion_matrix orders rows/columns by sorted label value, so
        # index 0 is label 0 (inoffensive) and index 1 is label 1 (offensive).
        plot_confusion_matrix(cm, classes=('Inoffensive', 'Offensive'), title=plot_title)
        plt.savefig('graphs/' + plot_title + '.png', bbox_inches='tight', pad_inches=0.4)
        plt.show()

    return results
                                         
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Reds):
    """
    Draw a confusion matrix as an annotated heat map on the current pyplot figure.

    `classes` supplies the tick labels for both axes. With `normalize=True`
    each row is divided by its row sum and cells are printed with two decimals;
    otherwise cells are printed as integers.

    https://stackoverflow.com/questions/48817300/sklearn-plot-confusion-matrix-combined-across-trainingtest-sets
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes)
    plt.yticks(positions, classes, rotation=45)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text, the rest black.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [3]:
# Restart the timer; the elapsed time is printed at the end of this cell.
start = time()
# NLTK tweet tokenizer created with preserve_case=False; it is wrapped below
# so that known hashtags can be replaced by their segmented words.
tokenizer = TweetTokenizer(preserve_case=False)
class Wrapper:
    """Tokenizer adapter that expands known hashtags into their segmented words."""

    def __init__(self, tweet_tk, segmentations):
        """Store the wrapped tokenizer and the hashtag -> segmentation mapping."""
        self.tweet_tk = tweet_tk
        self.segmentations = segmentations

    def tokenize(self, x):
        """Tokenize `x` with the wrapped tokenizer, expanding known hashtags.

        A token starting with '#' whose remainder is a key of
        `self.segmentations` is replaced by the whitespace-split words of its
        segmentation; every other token is kept unchanged.
        """
        expanded = []
        for token in self.tweet_tk.tokenize(x):
            is_known_hashtag = token[0] == '#' and token[1:] in self.segmentations
            if is_known_hashtag:
                expanded.extend(self.segmentations[token[1:]].split())
            else:
                expanded.append(token)
        return expanded
# Tokenizer that also expands known hashtags into their segmented words.
tk = Wrapper(tokenizer, segmentations)

# TF-IDF features built on top of the custom tokenizer.
vectorizer = TfidfVectorizer(
    tokenizer=tk.tokenize,
    strip_accents='unicode',
    lowercase=True,
    sublinear_tf=True,
    min_df=9,
    stop_words='english',
)

# The vectorizer is fitted on the training tweets only and then applied
# unchanged to the test tweets.
x_train = vectorizer.fit_transform(x_train_raw)
x_test = vectorizer.transform(x_test_raw)
y_train, y_test = np.array(y_train), np.array(y_test)

# Names follow the order of the stacked prediction columns below; only the
# first four models have training-set predictions.
supplement_names = np.array(['jlr', 'svm', 'xgb', 'dlr', 'cnn', 'gru'])
training_supplements = np.array([y_jlr_train, y_svm_train, y_xgb_train, y_dlr_train])
testing_supplements = np.array([
    y_jlr_test, y_svm_test, y_xgb_test, y_dlr_test, y_cnn_test, y_gru_test,
])
print('Done preprocessing in %.2fs.' % (time() - start))

Done preprocessing in 2.37s.


In [None]:
def test(model, x, y, k=10):
    """Cross-validate `model` on (x, y) and print per-fold and average scores.

    Parameters
    ----------
    model : estimator exposing `fit(x, y)`; the object returned by `fit`
        must expose `predict(x)`. The same `model` object is fitted again
        on every fold.
    x : feature matrix indexable by arrays of row indices.
    y : label array indexable the same way as `x`.
    k : number of folds (default 10, the previously hard-coded value).

    Each fold's training portion is shuffled with `shuffle_together` before
    fitting. For every fold the five scores from `report` plus the elapsed
    time are printed, followed by their averages over the `k` folds.
    Nothing is returned.
    """
    kf = KFold(n_splits=k)
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1w', 'f1m', 'time'))
    averages = np.array([0] * 6, dtype='float')
    for train_index, test_index in kf.split(x):
        start = time()
        x_fit, x_held_out = x[train_index], x[test_index]
        y_fit, y_held_out = y[train_index], y[test_index]
        x_fit, y_fit = shuffle_together(x_fit, y_fit)
        clf = model.fit(x_fit, y_fit)
        y_hat = clf.predict(x_held_out)
        # Timing covers indexing, shuffling, fitting, predicting and scoring.
        vals = report(y_held_out, y_hat) + [time() - start]
        averages += vals
        print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs' % tuple(vals))
    print('average:')
    averages /= k
    print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs' % tuple(averages))

# Cross-validate an XGBoost model on the TF-IDF features augmented with every
# non-empty subset of the other models' training-set predictions.
N = len(training_supplements)
for r in range(1, N + 1):
    for combo in itertools.combinations(range(N), r):
        i = list(combo)
        print(supplement_names[i])
        # Each selected prediction column is appended to the right of x_train.
        training = csr_matrix(hstack([x_train, *training_supplements[i]]))
        model = XGBClassifier(max_depth=3, learning_rate=0.012, n_estimators=100)
        test(model, training, y_train)

['jlr']
acc     p       r       f1w     f1m     time    
0.7485  0.7467  0.3834  0.7219  0.6689  3.15s
0.7711  0.8017  0.4199  0.7476  0.6988  3.16s
0.7613  0.8086  0.4367  0.7393  0.7012  3.25s
0.7606  0.7005  0.3625  0.7338  0.6612  3.13s
0.7613  0.7662  0.4032  0.7368  0.6843  3.10s
0.7727  0.8182  0.4082  0.7473  0.6966  3.28s
0.7885  0.8304  0.4346  0.7662  0.7151  3.34s
0.7576  0.7911  0.3938  0.7309  0.6815  3.25s
0.7795  0.8194  0.4256  0.7563  0.7065  3.20s
0.7606  0.7531  0.4159  0.7380  0.6873  3.26s
average:
0.7662  0.7836  0.4084  0.7418  0.6901  3.21s
['svm']
acc     p       r       f1w     f1m     time    
0.7560  0.7071  0.4709  0.7411  0.6979  3.17s
0.7757  0.7370  0.5124  0.7635  0.7240  3.27s
0.7704  0.7724  0.5084  0.7567  0.7250  3.25s
0.7681  0.6679  0.4625  0.7543  0.6954  3.24s
0.7719  0.7438  0.4761  0.7562  0.7120  3.28s
0.7757  0.7400  0.5034  0.7626  0.7217  3.37s
0.7885  0.7483  0.5210  0.7767  0.7343  3.29s
0.7613  0.7345  0.4712  0.7454  0.7042  3.26s
0.7