In [1]:
import csv
import os
from time import time

# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import metrics as skmetrics
import torch
torch.backends.cudnn.enabled = False

use_segmentation = False
embedding_dim = 25
embedding_file = 'data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % embedding_dim
olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
olid_hashtags = 'data/olid_segmentations.tsv'
if torch.cuda.is_available():
    device = 0 
    print('CUDA available! Using device %d (%s)' % (device, torch.cuda.get_device_name(device)))
else:
    device = None
    print('CUDA unavailable! Using CPU.')

np.random.seed(1234) # helps reproducibility

CUDA available! Using device 0 (GeForce GTX 1070)


In [2]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    # https://discuss.pytorch.org/t/is-there-something-like-keras-utils-to-categorical-in-pytorch/5960
    return np.eye(num_classes, dtype='uint8')[y]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1-weighted', 'f1-macro']):
    results = []
    metrics = metrics.copy()
    while len(metrics) > 0:
        m = metrics.pop(0)
        if m == 'accuracy':
            results.append(skmetrics.accuracy_score(y, y_hat))
        elif m == 'precision':
            results.append(skmetrics.precision_score(y, y_hat))
        elif m == 'recall':
            results.append(skmetrics.recall_score(y, y_hat))
        elif m == 'f1-weighted':
            results.append(skmetrics.f1_score(y, y_hat, average='weighted'))
        elif m == 'f1-macro':
            results.append(skmetrics.f1_score(y, y_hat, average='macro'))
        else:
            print('Metric unknown: %s' % m)
    return results

# https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    '''
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    '''
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
    # Print New Line on Complete
    if iteration == total: 
        print()

In [3]:
start = time()
# Load tweets and labels
with open(olid_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    x_raw = []
    y = []
    for r in raw:
        x_raw.append(r[1])
        y.append(0 if r[2] == 'NOT' else 1)
    x_raw = x_raw[1:]
    y = np.array(y[1:])
    bad_words = [row[:-1] for row in f.readlines()[1:]]
    
# Load hashtag segmentations
segmentations = {}
for line in open(olid_hashtags, encoding='utf-8'):
    terms = [x.strip() for x in line.split('\t')]
    hashtag, segmentation = terms[0], terms[1]
    segmentations[hashtag] = segmentation

# Tokenize data
tokenizer = TweetTokenizer(preserve_case=False)  
x = []
vocab = {}
i = 1 # start from 1. 0 is pad.
for tweet in x_raw:
    example = []
    for token in tokenizer.tokenize(tweet):
        # if it's a hashtag, look up segmentaion
        if use_segmentation and token[0] == '#' and token[1:] in segmentations:
            sequence = segmentations[token[1:]].split()
        else:
            sequence = [token]
            
        for word in sequence:
            if word not in vocab:
                vocab[word] = i
                i += 1
            example.append(vocab[word])
    x.append(example)
    
# Randomly shuffle
# As the result of a 3 hour long bug hunt,
# we have to subtract 1 because cudnn throws a fit if it tries to access
# the example at the end of the file. Some illegal memory access error?
# This is wack and I'm mad.
i = np.arange(len(x) - 1)  
np.random.shuffle(i)
x = [torch.LongTensor(x[k]).to(device) for k in i]
y = torch.FloatTensor(to_categorical(y[i], 2)).to(device)

split = 0.7
split_index = int(len(x) * split)
x_train = x[:split_index]
y_train = y[:split_index]
x_val = x[split_index:]
y_val = y[split_index:]
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 3.72s


In [4]:
# Load embeddings
# On my mac, GloVe loads 25D in 30s, 50D in 100s, 100D in 630s
start = time()
embeddings = {}
with open(embedding_file, encoding='utf-8') as f:
    raw = [row.split() for row in f.readlines()]
    for r in raw:
        embeddings[r[0]] = np.array([float(v) for v in r[1:]])

# Create embedding weight matrix that corresponds to the ids we've already set for x
# https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
embedding_weights = np.zeros((len(vocab) + 1, embedding_dim)) # add 1 to account for pad
for word, i in vocab.items():
    try: 
        embedding_weights[i] = embeddings[word]
    except KeyError:
        embedding_weights[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
embedding_weights = torch.HalfTensor(embedding_weights)
print('Loaded embeddings in %.2fs' % (time() - start)) 

Loaded embeddings in 15.37s


In [5]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils.rnn as rnn

class GRU(nn.Module):
    def __init__(self, vocab_size, embeddings=None, dim_emb=10, n_classes=2, device=0):
        super(GRU, self).__init__()
        self.vocab_size = vocab_size
        self.n_classes = n_classes
        self.device = device
        
        gru_size = 256
        if embeddings is None:
            self.dim_emb = dim_emb
            self.embedding = nn.Embedding(self.vocab_size, self.dim_emb)
        else:
            self.dim_emb = embeddings.shape[1]
            self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.gru = nn.GRU(input_size=self.dim_emb,
                          hidden_size=gru_size,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=True
                         )
        #self.fc1 = nn.Linear(gru_size, fc_size)
        #self.fc2 = nn.Linear(fc_size, self.n_classes)
        self.fc = nn.Linear(2 * gru_size, self.n_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        x = self.embedding(x)
        out, h = self.gru(x)
        x = self.relu(out[:, -1])
        # x = self.relu(self.fc1(x))
        # x = self.relu(self.fc2(x))
        x = self.fc(x)
        y = self.softmax(x)
        return y
    
    def predict(self, x, one_hot=False, batch_size=32):
        if one_hot:
            y = torch.zeros((len(x), self.n_classes))
        else:
            y = torch.zeros((len(x)))
        for i in range(0, len(x), batch_size):
            batch = rnn.pack_sequence(x[i:i+batch_size], enforce_sorted=False)
            batch, _ = rnn.pad_packed_sequence(batch, batch_first=True)
            logits = self.forward(batch)
            preds = torch.max(logits, 1)[1]
            if one_hot:
                pass # broken
                # y[i:i+batch_size, preds] = 1
            else:
                y[i:i+batch_size] = preds
        return y
    
def train(x_train, y_train, x_val, y_val, vocab_size, batch_size, lr, epochs):
    print('Start Training!')
    mlp = GRU(vocab_size, None)
    if torch.cuda.is_available():
        mlp.cuda()
    optimizer = optim.Adam(mlp.parameters(), lr=lr)
    max_f1 = 0
    print('epoch time  t_loss   t_acc  v_acc  t_f1m  v_f1m  v_f1w')
    for epoch in range(epochs):
        # print('-------------')
        # print('Epoch %d' % epoch)
        start = time()
        total_loss = 0.0
        for i in range(0, len(x_train), batch_size):
            mlp.zero_grad()
            batch = rnn.pack_sequence(x_train[i:i+batch_size], enforce_sorted=False)
            batch, _ = rnn.pad_packed_sequence(batch, batch_first=True)
            probs = mlp.forward(batch)
            NLL = torch.neg(torch.log(probs))
            y_batch = y_train[i:i+batch_size]
            losses = torch.bmm(NLL.view(len(y_batch), 1, 2), y_batch.view(len(y_batch), 2, 1))
            loss = torch.sum(losses) / len(y_batch)
            total_loss += loss
            loss.backward()
            optimizer.step()
        metrics = ['accuracy', 'f1-weighted', 'f1-macro']
        t_y_hat = mlp.predict(x_train, False)
        t_dense = torch.max(y_train, 1)[1]
        t_acc, t_f1w, t_f1m = report(t_dense.cpu(), t_y_hat, metrics=metrics)
        v_y_hat = mlp.predict(x_val, False)
        v_dense = torch.max(y_val, 1)[1]
        v_acc, v_f1w, v_f1m = report(v_dense.cpu(), v_y_hat, metrics=metrics)
        print('\r%-5d %.2fs %08.4f %.4f %.4f %.4f %.4f %.4f' % 
              (epoch, time() - start, total_loss, t_acc, v_acc, t_f1m, v_f1m, v_f1w), end='')
        if v_f1m > max_f1:
             print('\tNew best macro f1! Saving model.')
             torch.save(mlp.state_dict(), 'models/best-f1%d.model' % epoch)
             max_f1 = v_f1m
    return mlp

In [None]:
# helpful for quick debugging. [0, len(x_train)) means no limit
lower = 0
upper = 512
batch_size = 512
lr = 0.0001
epochs = 1000
mlp = train(x_train[lower:upper], y_train[lower:upper], x_train[lower:upper], y_train[lower:upper], len(vocab), batch_size, lr, epochs)

Start Training!
epoch time  t_loss   t_acc  v_acc  t_f1m  v_f1m  v_f1w
0     1.21s 006.2383 0.3203 0.3203 0.2849 0.2849 0.2195	New best macro f1! Saving model.
1     1.34s 006.2383 0.3223 0.3223 0.2863 0.2863 0.2205	New best macro f1! Saving model.
22    1.22s 006.2381 0.3223 0.3223 0.2876 0.2876 0.2231	New best macro f1! Saving model.
25    1.18s 006.2380 0.3262 0.3262 0.2929 0.2929 0.2301	New best macro f1! Saving model.
40    1.14s 006.2379 0.3281 0.3281 0.2956 0.2956 0.2335	New best macro f1! Saving model.
46    1.14s 006.2379 0.3281 0.3281 0.2968 0.2968 0.2360	New best macro f1! Saving model.
53    1.16s 006.2378 0.3262 0.3262 0.2978 0.2978 0.2398	New best macro f1! Saving model.
54    1.15s 006.2378 0.3301 0.3301 0.3041 0.3041 0.2489	New best macro f1! Saving model.
55    1.16s 006.2378 0.3320 0.3320 0.3066 0.3066 0.2522	New best macro f1! Saving model.
56    1.16s 006.2378 0.3340 0.3340 0.3092 0.3092 0.2555	New best macro f1! Saving model.
58    1.13s 006.2378 0.3379 0.3379 0.31