In [2]:
import csv
from time import time

import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack

olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
kaggle_data_folder = 'data/jigsaw/'
bad_words_data = 'data/trimmed-bad-words.txt'
glove_data = 'data/glove.twitter.27B/glove.twitter.27B.25d.txt' # 25, 50, 100, or 200 D

np.random.seed(1234) # help reproducibility

In [3]:
# y == 0 if not offensive
# y == 1 if offensive
start = time()
with open(olid_data) as f:
    raw = csv.reader(f, delimiter='\t')
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])
        y.append(0 if r[2] == 'NOT' else 1)
    x_raw = x_raw[1:]
    y = np.array(y[1:])

with open(kaggle_data_folder + 'train.csv') as f:  
    raw = csv.reader(f, delimiter=',')
    kaggle_x_raw = []
    kaggle_y = []
    for r in raw:
        kaggle_x_raw.append(r[1])
        kaggle_y.append(0 if all(x == '0' for x in r[2:]) else 1)
    kaggle_x_raw = kaggle_x_raw[1:]
    kaggle_y = np.array(kaggle_y[1:])
     
with open(bad_words_data) as f:
    bad_words = [row[:-1] for row in f.readlines()[1:]]

print('Loaded data in %.2fs' % (time() - start))

Loaded data in 1.88s


In [5]:
start = time()
glove = {}
with open(glove_data) as f:
    raw = [row.split() for row in f.readlines()]
    for r in raw:
        glove[r[0]] = np.array([float(v) for v in r[1:]])
print('Loaded GloVe in %.2fs' % (time() - start))
# On my mac, loads 25D in 30s, 50D in 100s, 100D in 630s

Loaded GloVe in 25.22s


In [6]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

tokenizer = TweetTokenizer(preserve_case=False)
    
def bow():
    # Build vocabulary from OLID data only
    start = time()
    vocab = {}
    i = 0 # index of unique word
    for tweet in x_raw: 
        for word in tokenizer.tokenize(tweet):
            if word not in vocab:
                vocab[word] = i
                i += 1
    print('Vocabulary built in %.2fs' % (time() - start))

    vectorizer = TfidfVectorizer(tokenizer=tokenizer.tokenize, vocabulary=vocab, lowercase=True)
    return vectorizer.fit_transform(x_raw) # + kaggle_x_raw)

def sum_glove():
    x = []
    embedding = np.zeros(glove['.'].shape)
    for tweet in x_raw:
        tokens = tokenizer.tokenize(tweet)
        for word in tokens:
            if word in glove:
                embedding += glove[word]
        x.append(embedding)#/ len(tokens))
    x = np.array(x)
    x = x - np.min(x, axis=1).reshape(x.shape[0], 1)
    x = x / np.max(x, axis=1).reshape(x.shape[0], 1)
    return np.array(x)

x = sum_glove()
#kaggle_x = x[len(x_raw):]
#x = x[:len(x_raw)]

In [7]:
def shuffle_together(x, y):
    # Shuffle x and y together
    state = np.random.get_state()
    i = np.arange(x.shape[0])
    np.random.shuffle(i)
    np.random.set_state(state)
    k = np.arange(y.shape[0])
    np.random.shuffle(k)
    return x[i, :], y[k] # shuffling a sparse matrix is weird

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from scipy.sparse import vstack

k = 10
kf = KFold(n_splits=k)
average_acc = 0
for train_index, test_index in kf.split(x):
    # Split based on k-fold
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Append kaggle data to training data
    # x_train = vstack((x_train, kaggle_x))
    # y_train = np.concatenate((y_train, kaggle_y))
    x_train, y_train = shuffle_together(x_train, y_train)
    clf = LogisticRegression(solver='lbfgs', max_iter=300).fit(x_train, y_train)
    y_hat = clf.predict(x_test)
    acc = y_hat[np.where(y_hat == y_test)].size / y_test.size
    average_acc += acc
    print('acc: %.4f' % acc)
print('average acc: %.4f' % (average_acc / k))

acc: 0.6631
acc: 0.6654
acc: 0.6420
acc: 0.6979
acc: 0.6684
acc: 0.6669
acc: 0.6767
acc: 0.6586
acc: 0.6699
acc: 0.6677
average acc: 0.6677


In [None]:
m = [
    [
        [1, 2],
        [3, 4]
    ]
]

In [None]:
np.average(m[0], axis=0)

In [37]:
a = np.array([[1, 2], [3, 4]])

In [38]:
np.min(a, axis=1)

array([1, 3])

In [45]:
b = np.min(a, axis=1)

In [46]:
b.shape

(2,)

In [47]:
b.flatten().shape

(2,)

In [48]:
b.reshape(2, 1).shape

(2, 1)