In [2]:
import csv
from time import time

import numpy as np
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn import metrics as skmetrics

training_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
test_data = 'data/OLIDv1.0/testset-levela.tsv'
test_labels = 'data/OLIDv1.0/labels-levela.csv'
hashtags = 'data/olid_segmentations.tsv'

np.random.seed(1234) # help reproducibility

start = time()    
with open(test_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    test_ids = []
    x_test_raw = []
    for r in raw:
        test_ids.append(r[0])
        x_test_raw.append(r[1])
    test_ids = [int(i) for i in test_ids[1:]]
    x_test_raw = x_test_raw[1:]
        
with open(test_labels, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter=',')
    y = []
    for r in raw:
        y.append(0 if r[1] == 'NOT' else 1)

with open('predictions/david-lr-val.txt') as f:
    y_dlr = np.array([int(line) for line in f])
    y_dlr = np.expand_dims(y_lr, axis=1)
    
with open('predictions/jp-lr-val.txt') as f:
    y_jlr = np.array([int(line) for line in f])
    y_jlr = np.expand_dims(y_lr, axis=1)
    
with open('predictions/svm-val.txt') as f:
    y_svm = np.array([int(line) for line in f])
    y_svm = np.expand_dims(y_svm, axis=1)
   
with open('predictions/xgb-val.txt') as f:
    y_xgb = np.array([int(line) for line in f])
    y_xgb = np.expand_dims(y_xgb, axis=1)
    
with open('predictions/cnn-val.txt') as f:
    y_cnn = np.array([int(line) for line in f])
    y_cnn = np.expand_dims(y_xgb, axis=1)

# Load hashtag segmentations
segmentations = {}
for line in open(hashtags, encoding='utf-8'):
    terms = [x.strip().lower() for x in line.split('\t')]
    hashtag, segmentation = terms[0], terms[1]
    segmentations[hashtag] = segmentation
print('Loaded data in %.2fs' % (time() - start))

def shuffle_together(x, y):
    # Shuffle x and y together
    i = np.arange(x.shape[0])
    np.random.shuffle(i)
    return x[i, :], y[i] # shuffling a sparse matrix is weird

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1', 'auc']):
    results = []
    if 'accuracy' in metrics or 'acc' in metrics:
        results.append(skmetrics.accuracy_score(y, y_hat))
    if 'precision' in metrics:
        results.append(skmetrics.precision_score(y, y_hat))
    if 'recall' in metrics:
        results.append(skmetrics.recall_score(y, y_hat))
    if 'f1' in metrics:
        results.append(skmetrics.f1_score(y, y_hat, average='weighted'))
    if 'auc' in metrics:
        results.append(skmetrics.f1_score(y, y_hat, average='macro'))
    return results

Loaded data in 0.07s


In [15]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

tokenizer = TweetTokenizer(preserve_case=False)
class Wrapper:
    def __init__(self, tweet_tk, segmentations):
        self.tweet_tk = tweet_tk
        self.segmentations = segmentations
    
    def tokenize(self, x):
        tokens = []
        for token in self.tweet_tk.tokenize(x):
            if token[0] == '#' and token[1:] in self.segmentations:
                sequence = self.segmentations[token[1:]].split()
            else:
                sequence = [token]

            for word in sequence:
                tokens.append(word)
        return tokens
tk = Wrapper(tokenizer, segmentations)
          
vectorizer = TfidfVectorizer(tokenizer=tk.tokenize, 
                             strip_accents='unicode', 
                             lowercase=True,
                             sublinear_tf=True,
                             min_df=9,
                             stop_words='english'
                            )

x = vectorizer.fit_transform(x_raw)
x = csr_matrix(hstack((x, y_lr, y_svm, y_xgb)))

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from scipy.sparse import vstack
from xgboost import XGBClassifier

def test(model):
    k = 10
    kf = KFold(n_splits=k)
    average_acc = 0
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1w', 'f1m', 'time'))
    averages = np.array([0] * 6, dtype='float')
    for train_index, test_index in kf.split(x):
        start = time()
        # Split based on k-fold
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, y_train = shuffle_together(x_train, y_train)
        clf = model.fit(x_train, y_train)
        y_hat = clf.predict(x_test)
        vals = report(y_test, y_hat) + [time() - start]
        averages += vals
        print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs' % tuple(vals))
    print('average:')
    averages /= k
    print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs' % tuple(averages))
    
print('XGB')
# 3, 0.003 and 0.012, 100 = 0.7286
# 3, .01, 100  = .7282
# 4, .01, 100  = .7257
for md in [3]:
    for lr in [.012]:
        n = 100
        print('md=%d, lr=%f, n=%d' % (md, lr, n))
        model = XGBClassifier(max_depth=md, learning_rate=lr, n_estimators=n)
        test(model)
        print()

Logistic Regression

SVM

XGB
md=3, lr=0.012000, n=100
acc     p       r       f1w     f1m     time    
0.7628  0.6833  0.5516  0.7557  0.7200  1.65s
0.7742  0.7057  0.5576  0.7666  0.7309  1.68s
0.7742  0.7536  0.5485  0.7643  0.7357  1.69s
0.7742  0.6624  0.5150  0.7652  0.7126  1.66s
0.7772  0.7209  0.5353  0.7674  0.7289  2.01s
0.7847  0.7241  0.5714  0.7774  0.7427  1.90s
0.7847  0.7072  0.5701  0.7780  0.7397  1.79s
0.7674  0.7222  0.5177  0.7561  0.7193  1.72s
0.7689  0.6877  0.5492  0.7614  0.7232  1.74s
0.7734  0.6902  0.5773  0.7678  0.7328  1.61s
average:
0.7742  0.7057  0.5494  0.7660  0.7286  1.75s

