In [1]:
import csv
import os
from time import time

import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import metrics as skmetrics
import torch
torch.backends.cudnn.enabled = False

embedding_dim = 25
embedding_file = 'data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % embedding_dim
training_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
test_data = 'data/OLIDv1.0/testset-levela.tsv'
test_labels = 'data/OLIDv1.0/labels-levela.csv'
hashtags = 'data/olid_segmentations.tsv'
if torch.cuda.is_available():
    device = 0 
    print('CUDA available! Using device %d (%s)' % (device, torch.cuda.get_device_name(device)))
else:
    device = None
    print('CUDA unavailable! Using CPU.')

np.random.seed(1234) # helps reproducibility

CUDA available! Using device 0 (GeForce GTX 1070)


In [2]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    # https://discuss.pytorch.org/t/is-there-something-like-keras-utils-to-categorical-in-pytorch/5960
    return np.eye(num_classes, dtype='uint8')[y]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1-weighted', 'f1-macro']):
    results = []
    metrics = metrics.copy()
    while len(metrics) > 0:
        m = metrics.pop(0)
        if m == 'accuracy':
            results.append(skmetrics.accuracy_score(y, y_hat))
        elif m == 'precision':
            results.append(skmetrics.precision_score(y, y_hat))
        elif m == 'recall':
            results.append(skmetrics.recall_score(y, y_hat))
        elif m == 'f1-weighted':
            results.append(skmetrics.f1_score(y, y_hat, average='weighted'))
        elif m == 'f1-macro':
            results.append(skmetrics.f1_score(y, y_hat, average='macro'))
        else:
            print('Metric unknown: %s' % m)
    return results

In [3]:
start = time()
with open(training_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    x_train_raw = []
    y_train = []
    for r in raw:
        x_train_raw.append(r[1])
        y_train.append(0 if r[2] == 'NOT' else 1)
    x_train_raw = x_train_raw[1:]
    y_train = y_train[1:]
    
with open(test_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    test_ids = []
    x_test_raw = []
    for r in raw:
        test_ids.append(r[0])
        x_test_raw.append(r[1])
    test_ids = [int(i) for i in test_ids[1:]]
    x_test_raw = x_test_raw[1:]
        
with open(test_labels, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter=',')
    y_test = []
    for r in raw:
        y_test.append(0 if r[1] == 'NOT' else 1)
    
segmentations = {}
for line in open(hashtags, encoding='utf-8'):
    terms = [x.strip().lower() for x in line.split('\t')]
    hashtag, segmentation = terms[0], terms[1]
    segmentations[hashtag] = segmentation
    
print('Loaded data in %.2fs' % (time() - start))

# Tokenize data
tokenizer = TweetTokenizer(preserve_case=False)  
x_train = []
vocab = {}
i = 2 # start from 2. 0 is unk, 1 is pad
for tweet in x_train_raw:
    example = []
    for token in tokenizer.tokenize(tweet):
        # if it's a hashtag, look up segmentaion
        if token[0] == '#' and token[1:] in segmentations:
            sequence = segmentations[token[1:]].split()
        else:
            sequence = [token]
            
        for word in sequence:
            if word not in vocab:
                vocab[word] = i
                i += 1
            example.append(vocab[word])
    x_train.append(example)

x_test = []
for tweet in x_test_raw:
    example = []
    for token in tokenizer.tokenize(tweet):
        # if it's a hashtag, look up segmentaion
        if token[0] == '#' and token[1:] in segmentations:
            sequence = segmentations[token[1:]].split()
        else:
            sequence = [token]
            
        for word in sequence:
            if word not in vocab:
                example.append(0)
            else:
                example.append(vocab[word])
    x_test.append(example)
        
y_train = np.array(y_train)
y_test = np.array(y_test)

# Randomly shuffle
i = np.arange(len(x_train))  
np.random.shuffle(i)
x_train = [torch.LongTensor(x_train[k]).to(device) for k in i]
y_train = torch.FloatTensor(to_categorical(y_train[i], 2)).to(device)

x_test = [torch.LongTensor(x_test[k]).to(device) for k in range(len(x_test))]
y_test = torch.FloatTensor(to_categorical(y_test, 2)).to(device)

# split = 0.7
# split_index = int(len(x) * split)
# x_train = x[:split_index]
# y_train = y[:split_index]
# x_val = x[split_index:]
# y_val = y[split_index:]
print('Preprocessed data in %.2fs' % (time() - start))

Loaded data in 0.05s
Preprocessed data in 3.83s


In [4]:
# Load embeddings
# On my mac, GloVe loads 25D in 30s, 50D in 100s, 100D in 630s
start = time()
embeddings = {}
with open(embedding_file, encoding='utf-8') as f:
    raw = [row.split() for row in f.readlines()]
    for r in raw:
        embeddings[r[0]] = np.array([float(v) for v in r[1:]])

# Create embedding weight matrix that corresponds to the ids we've already set for x
# https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
embedding_weights = np.zeros((len(vocab) + 2, embedding_dim)) # add 1 to account for unk and pad
for word, i in vocab.items():
    try: 
        embedding_weights[i] = embeddings[word]
    except KeyError:
        embedding_weights[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
embedding_weights = torch.FloatTensor(embedding_weights)
print('Loaded embeddings in %.2fs' % (time() - start)) 

Loaded embeddings in 15.89s


In [5]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils.rnn as rnn

class GRU(nn.Module):
    def __init__(self, vocab_size, embeddings=None, dim_emb=25, n_classes=2, device=0):
        super(GRU, self).__init__()
        self.vocab_size = vocab_size
        self.n_classes = n_classes
        self.device = device
        
        bidirectional = False
        gru_size = 32
        if embeddings is None:
            self.dim_emb = dim_emb
            self.embedding = nn.Embedding(self.vocab_size, self.dim_emb)
        else:
            self.dim_emb = embeddings.shape[1]
            self.embedding = nn.Embedding.from_pretrained(embeddings) # defaults to frozen
        self.gru = nn.GRU(input_size=self.dim_emb,
                          hidden_size=gru_size,
                          num_layers=2,
                          batch_first=True,
                          bidirectional=bidirectional,
                          dropout=0.2
                         )
        if bidirectional:
            self.fc1 = nn.Linear(2 * gru_size, self.n_classes)
        else:
            self.fc1 = nn.Linear(gru_size, self.n_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        x = self.embedding(x)
        out, h = self.gru(x)
        x = h.permute(1, 0, 2)
        x = torch.flatten(x, 1)
        x = self.relu(out[:, -1])
        y = self.softmax(self.fc1(x))
        return y
    
    def predict(self, x, one_hot=False, batch_size=32):
        if one_hot:
            y = torch.zeros((len(x), self.n_classes))
        else:
            y = torch.zeros((len(x)))
        for i in range(0, len(x), batch_size):
            batch = rnn.pack_sequence(x[i:i+batch_size], enforce_sorted=False)
            batch, _ = rnn.pad_packed_sequence(batch, batch_first=True)
            logits = self.forward(batch)
            preds = torch.max(logits, 1)[1]
            if one_hot:
                pass # broken
                # y[i:i+batch_size, preds] = 1
            else:
                y[i:i+batch_size] = preds
        return y

history = []
def train(x_train, y_train, x_val, y_val, vocab_size, batch_size, lr, epochs):
    print('Start Training!')
    global history
    mlp = GRU(vocab_size, embedding_weights)
    if torch.cuda.is_available():
        mlp.cuda()
    optimizer = optim.Adam(mlp.parameters(), lr=lr)
    max_f1 = 0
    print('epoch time   t_loss   t_acc  v_acc  t_f1m  v_f1m  v_f1w')
    for epoch in range(epochs):
        start = time()
        total_loss = 0.0
        for i in range(0, len(x_train), batch_size):
            mlp.zero_grad()
            batch = rnn.pack_sequence(x_train[i:i+batch_size], enforce_sorted=False)
            batch, _ = rnn.pad_packed_sequence(batch, batch_first=True)
            probs = mlp.forward(batch)
            NLL = torch.neg(torch.log(probs))
            y_batch = y_train[i:i+batch_size]
            losses = torch.bmm(NLL.view(len(y_batch), 1, 2), y_batch.view(len(y_batch), 2, 1))
            loss = torch.sum(losses) / len(y_batch)
            total_loss += loss
            loss.backward()
            optimizer.step()
        metrics = ['accuracy', 'f1-weighted', 'f1-macro']
        t_y_hat = mlp.predict(x_train, False, 1024)
        t_dense = torch.max(y_train, 1)[1]
        t_acc, t_f1w, t_f1m = report(t_dense.cpu(), t_y_hat, metrics=metrics)
        v_y_hat = mlp.predict(x_val, False, 1024)
        v_dense = torch.max(y_val, 1)[1]
        v_acc, v_f1w, v_f1m = report(v_dense.cpu(), v_y_hat, metrics=metrics)
        vals = (epoch, time() - start, total_loss.item(), t_acc, v_acc, t_f1m, v_f1m, v_f1w)
        history.append(vals)
        print('\r%-5d %5.2fs %08.4f %.4f %.4f %.4f %.4f %.4f' % vals, end='')
        if v_f1m > max_f1:
             print('\tNew best macro f1! Saving model and predictions.')
             torch.save(mlp.state_dict(), 'models/best-bigru.model')
             with open('predictions/best-bigru.txt', 'w') as f:
                for pred in v_dense:
                    f.write('%d\n' % pred)
             max_f1 = v_f1m
    return mlp

In [None]:
# helpful for quick debugging. [0, len(x_train)) means no limit
lower = 0
upper = len(x_train)
batch_size = 32
lr = 3e-4
epochs = 10000
mlp = train(x_train[lower:upper], y_train[lower:upper], x_test, y_test, len(vocab), batch_size, lr, epochs)

Start Training!
epoch time   t_loss   t_acc  v_acc  t_f1m  v_f1m  v_f1w
0     52.46s 1434.5070 0.4921 0.4988 0.4773 0.4728 0.5246	New best macro f1! Saving model and predictions.
1     52.54s 1434.2445 0.5087 0.5570 0.4907 0.5097 0.5770	New best macro f1! Saving model and predictions.
2     50.64s 1428.2655 0.6637 0.7244 0.6437 0.6708 0.7295	New best macro f1! Saving model and predictions.
3     50.40s 1405.1337 0.7007 0.7279 0.6863 0.6908 0.7381	New best macro f1! Saving model and predictions.
4     50.25s 1386.3075 0.7183 0.7244 0.7043 0.6940 0.7366	New best macro f1! Saving model and predictions.
5     50.17s 1380.5710 0.7295 0.7372 0.7147 0.7070 0.7486	New best macro f1! Saving model and predictions.
7     50.48s 1375.1205 0.7355 0.7395 0.7202 0.7096 0.7508	New best macro f1! Saving model and predictions.
10    50.27s 1371.1011 0.7390 0.7500 0.7250 0.7211 0.7608	New best macro f1! Saving model and predictions.
15    50.23s 1366.3676 0.7443 0.7500 0.7303 0.7215 0.7609	New best macro