In [18]:
import csv
import itertools
from time import time

import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn import metrics as skmetrics

# Input locations, relative to the notebook's working directory.
training_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
test_data = 'data/OLIDv1.0/testset-levela.tsv'
test_labels = 'data/OLIDv1.0/labels-levela.csv'
hashtags = 'data/olid_segmentations.tsv'

np.random.seed(1234) # help reproducibility

# Timer for the whole loading cell; read again when the cell reports its duration.
start = time()

# Training set (tab-separated). The first row is dropped as a header; column 1
# is kept as the raw text and column 2 is mapped to 0 for 'NOT', 1 otherwise.
with open(training_data, encoding='utf-8') as f:
    train_rows = list(csv.reader(f, delimiter='\t'))[1:]
x_train_raw = [row[1] for row in train_rows]
y_train = [0 if row[2] == 'NOT' else 1 for row in train_rows]

# Test set (tab-separated). The first row is dropped as a header; column 0 is
# parsed as an integer id and column 1 is kept as the raw text.
with open(test_data, encoding='utf-8') as f:
    test_rows = list(csv.reader(f, delimiter='\t'))[1:]
test_ids = [int(row[0]) for row in test_rows]
x_test_raw = [row[1] for row in test_rows]

# Test labels (comma-separated). Every row is used (no header row is skipped);
# column 1 is mapped to 0 for 'NOT', 1 otherwise.
with open(test_labels, encoding='utf-8') as f:
    y_test = [0 if row[1] == 'NOT' else 1 for row in csv.reader(f, delimiter=',')]

def _read_prediction_column(path):
    """Read one integer per line from ``path`` and return them as an (n, 1) array."""
    with open(path) as f:
        values = np.array([int(line) for line in f])
    return np.expand_dims(values, axis=1)


# Saved predictions of the base models, one integer per line. The *_train
# arrays are read from '*-train.txt' files; the *_test arrays are read from
# '*-val.txt' files, except the BiGRU one which comes from 'best-bigru.txt'.
# NOTE(review): presumably the '-val' files line up row-for-row with the test
# set loaded in this cell -- confirm.
y_dlr_test = _read_prediction_column('predictions/david-lr-val.txt')

y_jlr_train = _read_prediction_column('predictions/jp-lr-train.txt')
y_jlr_test = _read_prediction_column('predictions/jp-lr-val.txt')

y_svm_train = _read_prediction_column('predictions/svm-train.txt')
y_svm_test = _read_prediction_column('predictions/svm-val.txt')

y_xgb_train = _read_prediction_column('predictions/xgb-train.txt')
y_xgb_test = _read_prediction_column('predictions/xgb-val.txt')

y_cnn_test = _read_prediction_column('predictions/cnn-val.txt')

y_gru_test = _read_prediction_column('predictions/best-bigru.txt')

# Load hashtag segmentations: a tab-separated file whose first field is the
# hashtag and whose second field is its segmentation. Both fields are stripped
# and lower-cased before being stored.
segmentations = {}
# Use a context manager so the file handle is closed once the loop finishes.
with open(hashtags, encoding='utf-8') as f:
    for line in f:
        # A blank line carries no mapping; skip it instead of failing on terms[1].
        if not line.strip():
            continue
        terms = [x.strip().lower() for x in line.split('\t')]
        # A non-blank line without a second field still raises IndexError here.
        hashtag, segmentation = terms[0], terms[1]
        segmentations[hashtag] = segmentation
print('Loaded data in %.2fs' % (time() - start))

def report(y, y_hat, title):
    """Print summary metrics for a set of predictions and plot the confusion matrix.

    Prints accuracy, precision, recall, weighted F1 and macro F1, then draws
    the confusion matrix, saves it to 'graphs/<title>.png' and shows it.

    Args:
        y: True labels. The loaders in this notebook encode 'NOT' as 0 and
            every other label as 1.
        y_hat: Predicted labels, in the same encoding as ``y``.
        title: Plot title; also used as the file name of the saved figure.
    """
    # scikit-learn's metrics module is imported in this notebook as `skmetrics`.
    results = [
        skmetrics.accuracy_score(y, y_hat),
        skmetrics.precision_score(y, y_hat),
        skmetrics.recall_score(y, y_hat),
        skmetrics.f1_score(y, y_hat, average='weighted'),
        skmetrics.f1_score(y, y_hat, average='macro')
    ]
    print(5 * '%-8s' % ('acc', 'p', 'r', 'f1w', 'f1m'))
    print('%.4f  %.4f  %.4f  %.4f  %.4f\n' % tuple(results))

    cm = skmetrics.confusion_matrix(y, y_hat)
    np.set_printoptions(precision=2)
    plt.figure()
    # confusion_matrix orders rows/columns by sorted label value, so index 0 is
    # label 0 ('NOT', i.e. inoffensive) and index 1 is the offensive class.
    plot_confusion_matrix(cm, classes=('Inoffensive', 'Offensive'), title=title)
    plt.savefig('graphs/' + title + '.png', bbox_inches='tight', pad_inches=0.4)
    plt.show()
                                         
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Reds):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Each cell is labelled with its value; rows are labelled as true classes
    and columns as predicted classes. With `normalize=True` every row is
    divided by its row sum and the values are shown with two decimals,
    otherwise they are formatted as integers.

    cm: 2-D array of counts.
    classes: tick labels, used on both axes.
    title: title of the plot.
    cmap: colour map handed to `plt.imshow`.

    https://stackoverflow.com/questions/48817300/sklearn-plot-confusion-matrix-combined-across-trainingtest-sets
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes)
    plt.yticks(positions, classes, rotation=45)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'
    # Cells darker than half the maximum get white text so they stay readable.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_color = "white" if cm[row, col] > midpoint else "black"
            plt.text(col, row, format(cm[row, col], cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

Loaded data in 0.22s


In [29]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared NLTK tweet tokenizer; preserve_case=False asks it to lower-case tokens.
tokenizer = TweetTokenizer(preserve_case=False)
class Wrapper:
    """Tokenizer adapter that expands known hashtags into their segmented words."""

    def __init__(self, tweet_tk, segmentations):
        # tweet_tk: object exposing tokenize(text) -> iterable of string tokens.
        # segmentations: mapping from a hashtag without its leading '#' to a
        # whitespace-separated string of words.
        self.tweet_tk = tweet_tk
        self.segmentations = segmentations

    def tokenize(self, x):
        """Tokenize ``x``, replacing each '#tag' found in the mapping by its words.

        Tokens that are not hashtags, or whose tag is not in the mapping, are
        passed through unchanged.
        """
        expanded = []
        for tok in self.tweet_tk.tokenize(x):
            is_known_hashtag = tok[0] == '#' and tok[1:] in self.segmentations
            if is_known_hashtag:
                expanded.extend(self.segmentations[tok[1:]].split())
            else:
                expanded.append(tok)
        return expanded
tk = Wrapper(tokenizer, segmentations)

# TF-IDF configuration: tokens come from the hashtag-expanding wrapper above.
tfidf_settings = {
    'tokenizer': tk.tokenize,
    'strip_accents': 'unicode',
    'lowercase': True,
    'sublinear_tf': True,
    'min_df': 9,
    'stop_words': 'english',
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vectorizer is fitted on the training tweets only; the test tweets are
# transformed with that fitted vectorizer.
x_train = vectorizer.fit_transform(x_train_raw)
x_test = vectorizer.transform(x_test_raw)

# Base-model predictions grouped into one array per split. The first three
# entries of both arrays are in the same order (jlr, svm, xgb); the test array
# additionally holds dlr, cnn and gru predictions.
training_supplements = np.array([y_jlr_train, y_svm_train, y_xgb_train])
testing_supplements = np.array([y_jlr_test, y_svm_test, y_xgb_test, y_dlr_test, y_cnn_test, y_gru_test])
print('Done preprocessing')

Done preprocessing


In [None]:
from scipy.sparse import vstack
from xgboost import XGBClassifier

def test(model):
    """Cross-validate ``model`` and print per-fold and averaged metrics.

    For every fold the model is refitted on the shuffled training part and
    scored on the held-out part; accuracy, precision, recall, weighted F1,
    macro F1 and the elapsed time are printed per fold and then averaged.

    NOTE(review): relies on notebook globals that are not defined in the
    visible cells -- ``kf`` (a splitter exposing ``split``), ``x`` and ``y``
    (index-able features and labels), ``k`` (the number of folds) and
    ``shuffle_together`` -- confirm they exist before calling.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
    """
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1w', 'f1m', 'time'))
    row_format = '%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs'
    averages = np.array([0] * 6, dtype='float')
    # The up-front fit on the full training data was dropped: every fold refits
    # the model anyway, and that statement read x_train before the fold loop
    # assigned it as a local (UnboundLocalError).
    for train_index, test_index in kf.split(x):
        start = time()
        # Split based on k-fold. Fold-local names are deliberately distinct from
        # the notebook-level x_train / y_train / x_test / y_test.
        x_fold_train, x_fold_test = x[train_index], x[test_index]
        y_fold_train, y_fold_test = y[train_index], y[test_index]
        x_fold_train, y_fold_train = shuffle_together(x_fold_train, y_fold_train)
        clf = model.fit(x_fold_train, y_fold_train)
        y_hat = clf.predict(x_fold_test)
        # Metrics are computed here because report() needs a title, draws a plot
        # and returns nothing, so its result cannot be accumulated.
        vals = [
            skmetrics.accuracy_score(y_fold_test, y_hat),
            skmetrics.precision_score(y_fold_test, y_hat),
            skmetrics.recall_score(y_fold_test, y_hat),
            skmetrics.f1_score(y_fold_test, y_hat, average='weighted'),
            skmetrics.f1_score(y_fold_test, y_hat, average='macro'),
        ] + [time() - start]
        averages += vals
        print(row_format % tuple(vals))
    print('average:')
    averages /= k
    print(row_format % tuple(averages))


# Train one stacked XGBoost model per non-empty combination of base-model
# predictions and evaluate each on the held-out test set.
N = len(training_supplements)
# Short names of the base models, in the order they are stored in
# training_supplements (and in the first N rows of testing_supplements).
supplement_names = ('jlr', 'svm', 'xgb')
for r in range(1, N+1):
    for combo in itertools.combinations(range(N), r):
        i = list(combo)
        # tuple(...) is required: a tuple cannot be concatenated with a list/array.
        training = csr_matrix(hstack((x_train,) + tuple(training_supplements[i])))
        testing = csr_matrix(hstack((x_test,) + tuple(testing_supplements[i])))
        model = XGBClassifier(max_depth=3, learning_rate=0.012, n_estimators=100)
        clf = model.fit(training, y_train)
        # Predict on the stacked test matrix (same columns as `training`) and
        # score against the test labels, not the training labels.
        y_hat = clf.predict(testing)
        # Put the combination in the title so each saved figure gets its own file
        # instead of overwriting 'graphs/XGB Stack.png'.
        combo_label = '+'.join(supplement_names[j] for j in combo)
        report(y_test, y_hat, 'XGB Stack (%s)' % combo_label)

# Cross-validate XGBoost for every (max_depth, learning_rate) pair in the grids.
for md in [3]:
    for lr in [.012]:
        n = 100
        print('md=%d, lr=%f, n=%d' % (md, lr, n))
        # Build the model from the hyperparameters just printed rather than
        # reusing whatever `model` an earlier loop left behind.
        model = XGBClassifier(max_depth=md, learning_rate=lr, n_estimators=n)
        test(model)
        print()

In [39]:
# Sanity check: enumerate every non-empty combination of training supplements
# and show the shape of the selected sub-array together with the model names.
tr = np.array(['jlr', 'svm', 'xgb'])
te = np.array(['jlr', 'svm', 'xgb', 'dlr', 'cnn', 'gru'])
N = len(training_supplements)
for r in range(1, N+1):
    for combo in itertools.combinations(range(N), r):
        i = list(combo)
        print(training_supplements[i].shape)
        # The indices enumerate the training supplements, so label them with the
        # training names (`te` only matched because its first N entries agree).
        print(tr[i])

(1, 13240, 1)
['jlr']
(1, 13240, 1)
['svm']
(1, 13240, 1)
['xgb']
(2, 13240, 1)
['jlr' 'svm']
(2, 13240, 1)
['jlr' 'xgb']
(2, 13240, 1)
['svm' 'xgb']
(3, 13240, 1)
['jlr' 'svm' 'xgb']


In [40]:
# Scratch check: a tuple cannot be concatenated with a list, which is why the
# stacking cell wraps its supplements in tuple(...) before adding (x_train,).
# The error is caught and displayed so the cell no longer aborts a full run.
try:
    (1,) + [2, 3]
except TypeError as err:
    concat_error = 'TypeError: %s' % err
    print(concat_error)

TypeError: can only concatenate tuple (not "list") to tuple