In [1]:
import csv
import os
from time import time

import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import metrics as skmetrics
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils.rnn as rnn

torch.backends.cudnn.enabled = False
embedding_dim = 25

embedding_file = 'data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % embedding_dim
training_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
test_data = 'data/OLIDv1.0/testset-levela.tsv'
test_labels = 'data/OLIDv1.0/labels-levela.csv'
hashtags = 'data/olid_segmentations.tsv'

if torch.cuda.is_available():
    device = 0 
    print('CUDA available! Using device %d (%s)' % (device, torch.cuda.get_device_name(device)))
else:
    device = None
    print('CUDA unavailable! Using CPU.')

np.random.seed(1234) # helps reproducibility

CUDA available! Using device 0 (GeForce GTX 1070)


In [2]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    # https://discuss.pytorch.org/t/is-there-something-like-keras-utils-to-categorical-in-pytorch/5960
    return np.eye(num_classes, dtype='uint8')[y]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1-weighted', 'f1-macro']):
    results = []
    metrics = metrics.copy()
    while len(metrics) > 0:
        m = metrics.pop(0)
        if m == 'accuracy':
            results.append(skmetrics.accuracy_score(y, y_hat))
        elif m == 'precision':
            results.append(skmetrics.precision_score(y, y_hat))
        elif m == 'recall':
            results.append(skmetrics.recall_score(y, y_hat))
        elif m == 'f1-weighted':
            results.append(skmetrics.f1_score(y, y_hat, average='weighted'))
        elif m == 'f1-macro':
            results.append(skmetrics.f1_score(y, y_hat, average='macro'))
        else:
            print('Metric unknown: %s' % m)
    return results

In [19]:
start = time()
with open(training_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    x_train_raw = []
    y_train = []
    for r in raw:
        x_train_raw.append(r[1])
        y_train.append(0 if r[2] == 'NOT' else 1)
    x_train_raw = x_train_raw[1:]
    y_train = y_train[1:]
    
with open(test_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    test_ids = []
    x_test_raw = []
    for r in raw:
        test_ids.append(r[0])
        x_test_raw.append(r[1])
    test_ids = [int(i) for i in test_ids[1:]]
    x_test_raw = x_test_raw[1:]
        
with open(test_labels, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter=',')
    y_test = []
    for r in raw:
        y_test.append(0 if r[1] == 'NOT' else 1)
    
segmentations = {}
for line in open(hashtags, encoding='utf-8'):
    terms = [x.strip().lower() for x in line.split('\t')]
    hashtag, segmentation = terms[0], terms[1]
    segmentations[hashtag] = segmentation
    
print('Loaded data in %.2fs' % (time() - start))

# Tokenize data
tokenizer = TweetTokenizer(preserve_case=False)  
x_train = []
vocab = {}
i = 2 # start from 2. 0 is pad, 1 is unk
for tweet in x_train_raw:
    example = []
    for token in tokenizer.tokenize(tweet):
        # if it's a hashtag, look up segmentaion
        if token[0] == '#' and token[1:] in segmentations:
            sequence = segmentations[token[1:]].split()
        else:
            sequence = [token]
            
        for word in sequence:
            if word not in vocab:
                vocab[word] = i
                i += 1
            example.append(vocab[word])
    x_train.append(example)

x_test = []
for tweet in x_test_raw:
    example = []
    for token in tokenizer.tokenize(tweet):
        # if it's a hashtag, look up segmentaion
        if token[0] == '#' and token[1:] in segmentations:
            sequence = segmentations[token[1:]].split()
        else:
            sequence = [token]
            
        for word in sequence:
            if word not in vocab:
                example.append(1) # <unk>
            else:
                example.append(vocab[word])
    x_test.append(example)
        
y_train = np.array(y_train)
y_test = np.array(y_test)

# Randomly shuffle
i = np.arange(len(x_train))  
np.random.shuffle(i)
x_train = rnn.pad_sequence([torch.LongTensor(x_train[k]).to(device) for k in i], batch_first=True)
y_train = torch.FloatTensor(to_categorical(y_train[i], 2)).to(device)

x_test = rnn.pad_sequence([torch.LongTensor(x_test[k]).to(device) for k in range(len(x_test))], batch_first=True)
y_test = torch.FloatTensor(to_categorical(y_test, 2)).to(device)

print('Preprocessed data in %.2fs' % (time() - start))

Loaded data in 0.05s
Preprocessed data in 2.82s


In [4]:
# Load embeddings
# On my mac, GloVe loads 25D in 30s, 50D in 100s, 100D in 630s
start = time()
embeddings = {}
with open(embedding_file, encoding='utf-8') as f:
    raw = [row.split() for row in f.readlines()]
    for r in raw:
        embeddings[r[0]] = np.array([float(v) for v in r[1:]])

# Create embedding weight matrix that corresponds to the ids we've already set for x
# https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
embedding_weights = np.zeros((len(vocab) + 2, embedding_dim)) # add 1 to account for unk and pad
for word, i in vocab.items():
    try: 
        embedding_weights[i] = embeddings[word]
    except KeyError:
        embedding_weights[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
embedding_weights = torch.FloatTensor(embedding_weights)
print('Loaded embeddings in %.2fs' % (time() - start)) 

Loaded embeddings in 15.28s


In [24]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embeddings=None, dim_emb=25, n_classes=2, device=0):
        super(GRU, self).__init__()
        self.vocab_size = vocab_size
        self.n_classes = n_classes
        self.device = device
        
        bidirectional = False
        gru_size = 32
        if embeddings is None:
            self.dim_emb = dim_emb
            self.embedding = nn.Embedding(self.vocab_size, self.dim_emb)
        else:
            self.dim_emb = embeddings.shape[1]
            self.embedding = nn.Embedding.from_pretrained(embeddings) # defaults to frozen
        self.gru = nn.GRU(input_size=self.dim_emb,
                          hidden_size=gru_size,
                          num_layers=2,
                          batch_first=True,
                          bidirectional=bidirectional,
                          dropout=0.2
                         )
        if bidirectional:
            self.fc1 = nn.Linear(2 * gru_size, self.n_classes)
        else:
            self.fc1 = nn.Linear(gru_size, self.n_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        x = self.embedding(x)
        out, h = self.gru(x)
        x = h.permute(1, 0, 2)
        x = torch.flatten(x, 1)
        x = self.relu(out[:, -1])
        y = self.softmax(self.fc1(x))
        return y
    
    def predict(self, x, batch_size=32):
        y = torch.zeros((len(x)))
        for i in range(0, len(x), batch_size):
            logits = self.forward(x[i:i+batch_size])
            preds = torch.max(logits, 1)[1]
            y[i:i+batch_size] = preds
        return y

history = []
def train(x_train, y_train, x_val, y_val, vocab_size, batch_size, lr, epochs):
    print('Start Training!')
    global history
    mlp = GRU(vocab_size, embedding_weights)
    if torch.cuda.is_available():
        mlp.cuda()
    optimizer = optim.Adam(mlp.parameters(), lr=lr)
    t_dense = torch.max(y_train, 1)[1].cpu()
    v_dense = torch.max(y_val, 1)[1].cpu()
    max_f1 = 0
    print('epoch time   t_loss   t_acc  v_acc  t_f1m  v_f1m  v_f1w')
    for epoch in range(epochs):
        start = time()
        total_loss = 0.0
        for i in range(0, len(x_train), batch_size):
            mlp.zero_grad()
            probs = mlp.forward(x_train[i:i+batch_size])
            NLL = torch.neg(torch.log(probs))
            y_batch = y_train[i:i+batch_size]
            losses = torch.bmm(NLL.view(len(y_batch), 1, 2), y_batch.view(len(y_batch), 2, 1))
            loss = torch.sum(losses) / len(y_batch)
            total_loss += loss
            loss.backward()
            optimizer.step()
        metrics = ['accuracy', 'f1-weighted', 'f1-macro']
        t_y_hat = mlp.predict(x_train, 1024)
        t_acc, t_f1w, t_f1m = report(t_dense, t_y_hat, metrics=metrics)
        v_y_hat = mlp.predict(x_val, 1024)
        v_acc, v_f1w, v_f1m = report(v_dense, v_y_hat, metrics=metrics)
        vals = (epoch, time() - start, total_loss.item(), t_acc, v_acc, t_f1m, v_f1m, v_f1w)
        history.append(vals)
        print('\r%-5d %5.2fs %08.4f %.4f %.4f %.4f %.4f %.4f' % vals, end='')
        if v_f1m > max_f1:
            print('\tNew best macro f1! Saving model and predictions.')
            torch.save(mlp.state_dict(), 'models/best-bigru.model')
            with open('predictions/best-bigru.txt', 'w') as f:
                for pred in v_y_hat:
                    f.write('%d\n' % pred)
            max_f1 = v_f1m
    return mlp

In [25]:
# helpful for quick debugging. [0, len(x_train)) means no limit
lower = 0
upper = 300 # len(x_train)
batch_size = 32
lr = 3e-4
epochs = 10000
mlp = train(x_train[lower:upper], y_train[lower:upper], x_test, y_test, len(vocab), batch_size, lr, epochs)

Start Training!
epoch time   t_loss   t_acc  v_acc  t_f1m  v_f1m  v_f1w


TypeError: can't convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [None]:
model = GRU(len(vocab), embedding_weights)
model.cuda()
model.load_state_dict(torch.load('models/best-bigru-7743-32-False-2-0.2.model'))

In [None]:
metrics = ['accuracy', 'f1-weighted', 'f1-macro']

def predict(self, x):
    y = torch.zeros((len(x)))
    for i in range(0, len(x)):
        batch = x[i].view(1, x[i].shape[0])
        logits = self.forward(batch)
        preds = torch.max(logits, 1)[1]
        y[i:i+batch_size] = preds
    return y

# y_hat = model.predict(x_train)
# dense = torch.max(y_train, 1)[1].cpu()
# print(report(dense, y_hat, metrics=metrics))

# y_hat = predict(model, x_test)
# dense = torch.max(y_test, 1)[1].cpu()
# print(report(dense, y_hat, metrics=metrics))

maxlen = 0
for x in x_train:
    if len(x) > maxlen:
        maxlen = len(x)
        print(len(x))

# y_hat = predict(model, x_test)
# dense = torch.max(y_test, 1)[1].cpu()
# print(report(dense, y_hat, metrics=metrics))
        
# with open('models/history.csv', 'w') as f:
#     f.write('epoch,time,loss,train_acc,val_acc,train_f1m,val_f1m,val_f1w\n')
#     for i in range(len(history)):
#         f.write('%d,%.2f,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f\n' % history[i])