In [1]:
import csv
from time import time

import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Input file locations (relative to the notebook's working directory).
# Tab-separated training set; the loader below reads column 1 as the tweet text
# and column 2 as the label ('NOT' -> 0, anything else -> 1), skipping the first row.
training_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
# Tab-separated test set; column 0 is an integer id, column 1 the tweet text,
# and the first row is skipped.
test_data = 'data/OLIDv1.0/testset-levela.tsv'
# Comma-separated test labels; column 1 is the label and no row is skipped.
test_labels = 'data/OLIDv1.0/labels-levela.csv'
# Tab-separated hashtag file: first field is the hashtag, second its segmentation.
hashtags = 'data/olid_segmentations.tsv'

In [2]:
def shuffle_together(x, y, random_state=None):
    """Shuffle the rows of ``x`` and the entries of ``y`` with one shared permutation.

    Parameters
    ----------
    x : 2-D array or sparse matrix
        Must expose ``shape`` and support ``x[index_array, :]`` row indexing.
    y : 1-D array
        Must have as many entries as ``x`` has rows and support ``y[index_array]``.
    random_state : int or None, optional
        Seed for a dedicated generator, which makes the shuffle reproducible
        without touching global state. When None (the default) the global
        NumPy RNG is used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    tuple
        ``(x_shuffled, y_shuffled)`` with row ``k`` of ``x_shuffled`` still
        paired with entry ``k`` of ``y_shuffled``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same number of rows.
    """
    n_rows = x.shape[0]
    if len(y) != n_rows:
        raise ValueError(
            'x and y must have the same number of rows (got %d and %d)'
            % (n_rows, len(y))
        )

    # Use the module-level RNG unless the caller asked for a private, seeded one.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    order = np.arange(n_rows)
    rng.shuffle(order)

    # Index rows with an explicit column slice: the same expression then works
    # for sparse matrices as well as dense arrays.
    return x[order, :], y[order]

def report(y, y_hat, M=['accuracy', 'precision', 'recall', 'f1w', 'f1m']):
    """Compute the requested classification scores for predictions ``y_hat``.

    ``M`` selects which scores to compute. The returned list always follows the
    fixed order accuracy, precision, recall, weighted F1, macro F1 (regardless
    of the order of names in ``M``) and contains only the selected ones.
    """
    # (name, lazy scorer) pairs in output order; a scorer only runs if selected.
    scorers = (
        ('accuracy', lambda: metrics.accuracy_score(y, y_hat)),
        ('precision', lambda: metrics.precision_score(y, y_hat)),
        ('recall', lambda: metrics.recall_score(y, y_hat)),
        ('f1w', lambda: metrics.f1_score(y, y_hat, average='weighted')),
        ('f1m', lambda: metrics.f1_score(y, y_hat, average='macro')),
    )
    return [score() for key, score in scorers if key in M]

In [3]:
start = time()

# The csv module asks for files opened with newline='' so that it handles line
# endings (including newlines inside quoted fields) itself.
# NOTE(review): csv.reader's default quoting gives '"' special meaning; if tweets
# can contain stray double quotes, check whether quoting=csv.QUOTE_NONE is needed.

# Training set: column 1 is the tweet text, column 2 the label.
with open(training_data, encoding='utf-8', newline='') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row
    x_train_raw = []
    y_train = []
    for r in raw:
        if not r:  # tolerate blank lines
            continue
        x_train_raw.append(r[1])
        y_train.append(0 if r[2] == 'NOT' else 1)

# Test set: column 0 is the integer id, column 1 the tweet text.
with open(test_data, encoding='utf-8', newline='') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row
    test_ids = []
    x_test_raw = []
    for r in raw:
        if not r:
            continue
        test_ids.append(int(r[0]))
        x_test_raw.append(r[1])

# Test labels: column 1 is the label; no row is skipped here.
with open(test_labels, encoding='utf-8', newline='') as f:
    y_test = [0 if r[1] == 'NOT' else 1
              for r in csv.reader(f, delimiter=',') if r]

# Labels are matched to test tweets purely by position, so the counts must agree.
if len(y_test) != len(x_test_raw):
    raise ValueError('test set has %d tweets but %d labels'
                     % (len(x_test_raw), len(y_test)))

# Hashtag -> segmentation lookup, both lower-cased. The file is opened in a
# `with` block so the handle is closed once the loop finishes.
segmentations = {}
with open(hashtags, encoding='utf-8') as f:
    for line in f:
        terms = [field.strip().lower() for field in line.split('\t')]
        if len(terms) < 2:  # blank or malformed line: nothing to map
            continue
        hashtag, segmentation = terms[0], terms[1]
        segmentations[hashtag] = segmentation

print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.08s


In [4]:
# Base tweet tokenizer. preserve_case=False is presumably what lower-cases the
# tokens (per NLTK's option name), lining up with the lower-cased keys stored
# in `segmentations`.
tokenizer = TweetTokenizer(preserve_case=False)
class Wrapper:
    """Tokenizer adapter that expands known hashtags into their segmented words.

    Parameters
    ----------
    tweet_tk : object
        Any object with a ``tokenize(text)`` method returning a sequence of
        string tokens.
    segmentations : dict
        Maps a hashtag body (the text after ``#``) to a whitespace-separated
        segmentation string.
    """

    def __init__(self, tweet_tk, segmentations):
        self.tweet_tk = tweet_tk
        self.segmentations = segmentations

    def tokenize(self, x):
        """Tokenize ``x`` and replace each known hashtag by its segmented words.

        Tokens that are not hashtags, or whose body is not a key of
        ``segmentations``, are passed through unchanged.
        """
        tokens = []
        for token in self.tweet_tk.tokenize(x):
            # startswith() rather than token[0]: an empty token must not raise.
            if token.startswith('#') and token[1:] in self.segmentations:
                tokens.extend(self.segmentations[token[1:]].split())
            else:
                tokens.append(token)
        return tokens
# Hashtag-aware tokenizer handed to the TF-IDF vectorizer.
tk = Wrapper(tokenizer, segmentations)

vectorizer = TfidfVectorizer(tokenizer=tk.tokenize,
                             strip_accents='unicode',
                             lowercase=True,
                             sublinear_tf=True,
                             min_df=9,
                             stop_words='english'
                            )

# Fit the vocabulary on the training tweets only, then reuse it for the test set.
x_train = vectorizer.fit_transform(x_train_raw)
x_test = vectorizer.transform(x_test_raw)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Seed the global NumPy RNG right before shuffling so the training-row order is
# the same on every fresh Restart & Run All instead of changing between runs.
np.random.seed(42)
x_train, y_train = shuffle_together(x_train, y_train)

In [8]:
# Final validation
models = [
    LogisticRegression(solver='lbfgs', max_iter=300),
    SVC(kernel='linear', gamma='auto', C=1.8),
    XGBClassifier(max_depth=3, learning_rate=0.63, n_estimators=1000)
]

for model in models:
    name = str(model).split('(')[0] + '-val.txt'
    print(name)
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1w', 'f1m', 'time'))
    start = time()
    clf = model.fit(x_train, y_train)
    y_hat = clf.predict(x_test)
    with open(name, 'w') as results:
        for pred in y_hat:
            results.write('%d\n' % int(pred))
    vals = report(y_test, y_hat) + [time() - start]
    print('%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs\n' % tuple(vals))

LogisticRegression-val.txt
acc     p       r       f1w     f1m     time    
0.8128  0.8435  0.4042  0.7884  0.7143  0.15s

SVC-val.txt
acc     p       r       f1w     f1m     time    
0.8081  0.7622  0.4542  0.7908  0.7229  17.18s

XGBClassifier-val.txt
acc     p       r       f1w     f1m     time    
0.8000  0.6789  0.5375  0.7922  0.7333  28.65s

