In [26]:
import csv
from time import time

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn import metrics as skmetrics

# Input locations, all relative to the notebook's working directory.
# OLID training set: tab-separated; the loader reads column 1 as text and column 2 as the label.
olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
# Tab-separated "hashtag<TAB>segmentation" pairs used to expand hashtags during tokenization.
olid_hashtags = 'data/olid_segmentations.tsv'
# Only referenced by the disabled Kaggle loader in the data-loading cell.
kaggle_data_folder = 'data/jigsaw/'
# NOTE(review): not read anywhere in the cells visible here -- confirm it is still needed.
bad_words_data = 'data/trimmed-bad-words.txt'
# This path selects the 25d file; per the original note the available sizes are 25, 50, 100, or 200 D.
# NOTE(review): the cell that loads this file is not visible here.
glove_data = 'data/glove.twitter.27B/glove.twitter.27B.25d.txt' # 25, 50, 100, or 200 D

# Seed NumPy's global RNG (used by np.random.shuffle in shuffle_together) to help reproducibility.
np.random.seed(1234) # help reproducibility

In [27]:
# Label encoding:
#   y == 0 if not offensive (label column equals 'NOT')
#   y == 1 if offensive (any other label value)
start = time()

# Load the OLID training set. Column 1 holds the tweet text, column 2 the label.
# newline='' is what the csv module documentation asks for when opening files.
with open(olid_data, encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader, None)  # skip the header row instead of slicing it off afterwards
    x_raw = []
    labels = []
    for row in reader:
        x_raw.append(row[1])
        labels.append(0 if row[2] == 'NOT' else 1)
y = np.array(labels)

# Disabled: optional Kaggle (Jigsaw) training data. Kept for reference because
# bow() and test() contain matching commented-out hooks.
# with open(kaggle_data_folder + 'train.csv', encoding='utf-8') as f:
#     raw = csv.reader(f, delimiter=',')
#     kaggle_x_raw = []
#     kaggle_y = []
#     for r in raw:
#         kaggle_x_raw.append(r[1])
#         kaggle_y.append(0 if all(x == '0' for x in r[2:]) else 1)
#     kaggle_x_raw = kaggle_x_raw[1:]
#     kaggle_y = np.array(kaggle_y[1:])

# Load hashtag segmentations: each line is "hashtag<TAB>segmentation".
# Both fields are stripped and lower-cased before being stored.
segmentations = {}
with open(olid_hashtags, encoding='utf-8') as f:  # context manager closes the handle
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        terms = [field.strip().lower() for field in line.split('\t')]
        hashtag, segmentation = terms[0], terms[1]
        segmentations[hashtag] = segmentation

print('Loaded data in %.2fs' % (time() - start))

Loaded data in 0.05s


In [28]:
def shuffle_together(x, y):
    """Reorder `x` and `y` with one shared random row permutation.

    The permutation is drawn from NumPy's global random state via
    ``np.random.shuffle``.

    Returns:
        A 3-tuple ``(x_shuffled, y_shuffled, order)`` where ``order`` is the
        index array that was applied to both inputs.
    """
    order = np.arange(x.shape[0])
    np.random.shuffle(order)
    # Rows are selected by fancy indexing (original note: shuffling a sparse
    # matrix is weird, hence the explicit ``[order, :]`` form).
    shuffled_x = x[order, :]
    shuffled_y = y[order]
    return shuffled_x, shuffled_y, order

def report(y, y_hat, metrics=('accuracy', 'precision', 'recall', 'f1', 'auc')):
    """Compute the requested classification metrics for one set of predictions.

    Args:
        y: True labels.
        y_hat: Predicted labels.
        metrics: Container of metric names. Recognised names are:
            'accuracy' / 'acc'   -> accuracy_score
            'precision'          -> precision_score
            'recall'             -> recall_score
            'f1' / 'f1_weighted' -> f1_score(average='weighted')
            'auc' / 'f1_macro'   -> f1_score(average='macro')
            The 'auc' key is a legacy name: it has always returned the
            macro-averaged F1, NOT an area-under-curve score. It is kept so
            existing callers get the same numbers; prefer 'f1_macro'.

    Returns:
        A list of scores in the fixed order accuracy, precision, recall,
        weighted F1, macro F1, containing only the metrics that were requested.
    """
    def wanted(*names):
        # Uses the same `in` test as before, so any container type still works.
        return any(name in metrics for name in names)

    results = []
    if wanted('accuracy', 'acc'):
        results.append(skmetrics.accuracy_score(y, y_hat))
    if wanted('precision'):
        results.append(skmetrics.precision_score(y, y_hat))
    if wanted('recall'):
        results.append(skmetrics.recall_score(y, y_hat))
    if wanted('f1', 'f1_weighted'):
        results.append(skmetrics.f1_score(y, y_hat, average='weighted'))
    if wanted('auc', 'f1_macro'):
        # Macro-averaged F1 (printed as 'f1m' by test()); see the docstring note on 'auc'.
        results.append(skmetrics.f1_score(y, y_hat, average='macro'))
    return results

In [30]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared NLTK tweet tokenizer. Per the NLTK docs, preserve_case=False down-cases
# tokens (emoticons excepted), which lines up with the lower-cased keys stored in
# `segmentations`.
tokenizer = TweetTokenizer(preserve_case=False)
class Wrapper:
    """Tokenizer adapter that expands known hashtags into their segmented words.

    Args:
        tweet_tk: Object exposing ``tokenize(text)`` and returning an iterable
            of string tokens.
        segmentations: Mapping from a hashtag body (the text after '#') to a
            whitespace-separated segmentation string.
    """

    def __init__(self, tweet_tk, segmentations):
        self.tweet_tk = tweet_tk
        self.segmentations = segmentations

    def tokenize(self, x):
        """Tokenize `x`, replacing each known '#hashtag' by its segmented words.

        Tokens that are not hashtags, or whose body is not in
        ``self.segmentations``, are passed through unchanged.

        Returns:
            A flat list of tokens.
        """
        tokens = []
        for token in self.tweet_tk.tokenize(x):
            # startswith() is safe on an empty token, whereas token[0] raises IndexError.
            if token.startswith('#') and token[1:] in self.segmentations:
                tokens.extend(self.segmentations[token[1:]].split())
            else:
                tokens.append(token)
        return tokens
# Hashtag-expanding tokenizer; its tokenize method is what bow() passes to TfidfVectorizer.
tk = Wrapper(tokenizer, segmentations)
          
def bow():
    """Fit a TF-IDF vectorizer on `x_raw` and return the transformed matrix.

    The vocabulary is built from the OLID tweets only; the Kaggle texts are
    left out (see the disabled ``+ kaggle_x_raw`` on the return line).
    Tokenization is delegated to ``tk.tokenize``.
    """
    tfidf_options = {
        'tokenizer': tk.tokenize,
        'strip_accents': 'unicode',
        'lowercase': True,
        'sublinear_tf': True,
        'min_df': 9,
        'stop_words': 'english',
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    return vectorizer.fit_transform(x_raw)  # + kaggle_x_raw)

def sum_glove(tweets=None, embeddings=None, tokenize=None):
    """Represent each tweet as the min-max scaled sum of its word embeddings.

    Args:
        tweets: Iterable of raw tweet strings. Defaults to the global `x_raw`.
        embeddings: Container supporting ``word in embeddings`` and
            ``embeddings[word]`` (returning a NumPy vector). It must contain the
            key '.', whose shape fixes the embedding dimension. Defaults to the
            global `glove`.
        tokenize: Callable mapping a tweet string to an iterable of tokens.
            Defaults to ``tokenizer.tokenize``.

    Returns:
        A 2-D float array with one row per tweet. Each row is the sum (not the
        mean) of the embeddings of its known words, shifted by the row minimum
        and divided by the resulting row maximum. Rows with no spread (e.g.
        tweets with no known word) come back as all zeros.
    """
    if tweets is None:
        tweets = x_raw
    if embeddings is None:
        embeddings = glove
    if tokenize is None:
        tokenize = tokenizer.tokenize

    dim = embeddings['.'].shape
    rows = []
    for tweet in tweets:
        # A fresh accumulator per tweet: reusing one array would make every row
        # alias the same running total across the whole corpus.
        total = np.zeros(dim)
        for word in tokenize(tweet):
            if word in embeddings:
                total += embeddings[word]
        rows.append(total)

    if not rows:
        return np.zeros((0,) + tuple(dim))

    x = np.array(rows)
    x = x - x.min(axis=1, keepdims=True)
    row_max = x.max(axis=1, keepdims=True)
    # Avoid 0/0 -> NaN for rows whose values are all equal (now all zeros).
    return x / np.where(row_max == 0, 1.0, row_max)

# Feature matrix read (as a global) by test(): TF-IDF bag-of-words over the OLID tweets.
x = bow()
# Disabled: when bow() is also fit on kaggle_x_raw, split the rows back into
# the Kaggle part and the OLID part.
# kaggle_x = x[len(x_raw):]
# x = x[:len(x_raw)]

In [33]:
from sklearn.model_selection import KFold
from scipy.sparse import vstack

def test(model, k=10):
    """Cross-validate `model` on the global `x` / `y` and print per-fold metrics.

    For each of the `k` folds the training split is shuffled, the model is fit
    and used to predict the held-out split, and one row is printed with
    accuracy, precision, recall, weighted F1, macro F1 and elapsed seconds.
    The averages over all folds are printed at the end.

    Side effect: every prediction is written, one integer per line and in fold
    order, to '<model name>.txt' in the working directory, where the name is the
    text before the first '(' in ``str(model)``.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        k: Number of folds. Defaults to 10.
    """
    kf = KFold(n_splits=k)
    row_format = '%.4f  %.4f  %.4f  %.4f  %.4f  %.2fs'
    print(6 * '%-8s' % ('acc', 'p', 'r', 'f1w', 'f1m', 'time'))
    averages = np.zeros(6)
    name = str(model).split('(')[0] + '.txt'
    # The context manager closes the file even if fit/predict raises mid-run.
    with open(name, 'w') as results:
        for train_index, test_index in kf.split(x):
            start = time()
            # Split based on k-fold
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Append kaggle data to training data
            # x_train = vstack((x_train, kaggle_x))
            # y_train = np.concatenate((y_train, kaggle_y))
            x_train, y_train, _ = shuffle_together(x_train, y_train)
            clf = model.fit(x_train, y_train)
            y_hat = clf.predict(x_test)
            results.writelines('%d\n' % int(pred) for pred in y_hat)
            vals = report(y_test, y_hat) + [time() - start]
            averages += vals
            print(row_format % tuple(vals))
    print('average:')
    averages /= k
    print(row_format % tuple(averages))

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Run every classifier through the same cross-validation harness, printing its
# label first and a blank line between consecutive runs.
experiments = [
    ('Logistic Regression', LogisticRegression(solver='lbfgs', max_iter=300)),
    ('SVM', SVC(kernel='linear', gamma='auto', C=1.8)),
]
for position, (label, estimator) in enumerate(experiments):
    if position:
        print()
    print(label)
    test(estimator)

Logistic Regression
acc     p       r       f1w     f1m     time    
0.7470  0.7445  0.3789  0.7198  0.6663  0.13s
0.7704  0.8009  0.4176  0.7466  0.6975  0.12s
0.7613  0.8086  0.4367  0.7393  0.7012  0.12s
0.7598  0.7010  0.3575  0.7324  0.6590  0.12s
0.7591  0.7632  0.3964  0.7338  0.6804  0.12s
0.7711  0.8165  0.4036  0.7453  0.6939  0.13s
0.7863  0.8281  0.4276  0.7632  0.7112  0.12s
0.7568  0.7928  0.3894  0.7295  0.6796  0.12s
0.7787  0.8186  0.4233  0.7552  0.7052  0.14s
0.7606  0.7531  0.4159  0.7380  0.6873  0.12s
average:
0.7651  0.7827  0.4047  0.7403  0.6881  0.12s

SVM
acc     p       r       f1w     f1m     time    
0.7560  0.7071  0.4709  0.7411  0.6979  11.60s
0.7757  0.7370  0.5124  0.7635  0.7240  11.46s
0.7704  0.7724  0.5084  0.7567  0.7250  11.54s
0.7674  0.6667  0.4600  0.7533  0.6941  11.58s
0.7719  0.7438  0.4761  0.7562  0.7120  11.57s
0.7757  0.7400  0.5034  0.7626  0.7217  11.90s
0.7885  0.7483  0.5210  0.7767  0.7343  12.57s
0.7606  0.7368  0.4646  0.7439  0