In [137]:
import csv
from time import time

import numpy as np
import torch
from nltk.tokenize import TweetTokenizer
from sklearn import metrics as skmetrics

# --- Notebook configuration -------------------------------------------------
use_segmentation = False  # when True, hashtags are replaced by their word segmentation
embedding_dim = 25
embedding_file = 'data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % embedding_dim
olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
olid_hashtags = 'data/olid_segmentations.tsv'

# `device` is handed to Tensor.to() by later cells: CUDA device index 0 when a
# GPU is present, otherwise None.
if torch.cuda.is_available():
    device = 0
    print('CUDA available! Using device %d (%s)' % (device, torch.cuda.get_device_name(device)))
else:
    device = None
    print('CUDA unavailable! Using CPU.')

# Seed every RNG the notebook draws from. NumPy drives the data shuffle and the
# random vectors for out-of-vocabulary words; torch drives the initial weights
# of the model's layers, so it has to be seeded as well for reproducible runs.
np.random.seed(1234)
torch.manual_seed(1234)

CUDA unavailable! Using CPU.


In [138]:
def to_categorical(y, num_classes):
    """One-hot encode integer class labels.

    Every label in ``y`` picks the matching row of a ``num_classes`` x
    ``num_classes`` identity matrix, so the result has dtype ``uint8`` and a
    trailing axis of length ``num_classes``.
    """
    # Recipe from
    # https://discuss.pytorch.org/t/is-there-something-like-keras-utils-to-categorical-in-pytorch/5960
    one_hot_rows = np.identity(num_classes, dtype=np.uint8)
    return one_hot_rows[y]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1-weighted', 'f1-macro']):
    """Score predictions ``y_hat`` against reference labels ``y``.

    Returns a list holding one score per recognised name in ``metrics``, in
    the order the names were requested. A name that is not recognised is
    reported on stdout and contributes no entry to the result. The
    ``metrics`` list passed in is left untouched.
    """
    # Each scorer is wrapped in a lambda so that only the requested metrics
    # are actually computed.
    scorers = {
        'accuracy': lambda: skmetrics.accuracy_score(y, y_hat),
        'precision': lambda: skmetrics.precision_score(y, y_hat),
        'recall': lambda: skmetrics.recall_score(y, y_hat),
        'f1-weighted': lambda: skmetrics.f1_score(y, y_hat, average='weighted'),
        'f1-macro': lambda: skmetrics.f1_score(y, y_hat, average='macro'),
    }

    results = []
    for name in list(metrics):
        scorer = scorers.get(name)
        if scorer is None:
            print('Metric unknown: %s' % name)
            continue
        results.append(scorer())
    return results

# https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    '''
    Redraw a one-line text progress bar; intended to be called once per loop step.
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : text printed before the bar (Str)
        suffix      - Optional  : text printed after the percentage (Str)
        decimals    - Optional  : number of decimals shown in the percentage (Int)
        length      - Optional  : width of the bar in characters (Int)
        fill        - Optional  : character used for the completed part (Str)
    '''
    pct_text = '{:.{places}f}'.format(100 * (iteration / float(total)), places=decimals)
    n_filled = int(length * iteration // total)
    n_empty = length - n_filled
    segments = (prefix, fill * n_filled + '-' * n_empty, pct_text, suffix)
    # Carriage returns on both ends keep the cursor on the same line, so the
    # next call overwrites this one.
    print('\r%s |%s| %s%% %s' % segments, end='\r')
    # Move to a fresh line once the bar is complete.
    if iteration == total:
        print()

In [153]:
start = time()

# Load tweets and labels. Column 1 of each row is the tweet text and column 2
# the label; every label other than 'NOT' becomes class 1.
x_raw = []
labels = []
with open(olid_data, encoding='utf-8') as f:
    rows = csv.reader(f, delimiter='\t')
    next(rows, None)  # drop the first row (presumably the column header)
    for r in rows:
        x_raw.append(r[1])
        labels.append(0 if r[2] == 'NOT' else 1)
y = np.array(labels)

# Load hashtag segmentations: "<hashtag>\t<segmentation>" per line.
segmentations = {}
with open(olid_hashtags, encoding='utf-8') as f:
    for line in f:
        terms = [t.strip() for t in line.split('\t')]
        if len(terms) < 2:
            continue  # blank or malformed line: nothing to map
        segmentations[terms[0]] = terms[1]

# Tokenize data and map every distinct token to an integer id.
tokenizer = TweetTokenizer(preserve_case=False)
x = []
vocab = {}
for tweet in x_raw:
    example = []
    for token in tokenizer.tokenize(tweet):
        # If it's a hashtag with a known segmentation, use the segmented words.
        if use_segmentation and token.startswith('#') and token[1:] in segmentations:
            sequence = segmentations[token[1:]].split()
        else:
            sequence = [token]

        for word in sequence:
            if word not in vocab:
                vocab[word] = len(vocab)  # next free id
            example.append(vocab[word])
    x.append(example)

# Randomly shuffle examples and labels with the same permutation.
order = np.arange(len(x))
np.random.shuffle(order)
x = [torch.LongTensor(x[k]).to(device) for k in order]
y = torch.IntTensor(y[order]).to(device)

# Hold out the last 30% of the shuffled data for validation.
split = 0.7
split_index = int(len(x) * split)
x_train = x[:split_index]
y_train = y[:split_index]
x_val = x[split_index:]
y_val = y[split_index:]
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 2.27s


In [149]:
# Load embeddings
# On my mac, GloVe loads 25D in 30s, 50D in 100s, 100D in 630s
start = time()
embeddings = {}
with open(embedding_file, encoding='utf-8') as f:
    # Stream the file line by line instead of materialising it twice in memory.
    for line in f:
        fields = line.split()
        if len(fields) != embedding_dim + 1:
            # A row without exactly one token plus `embedding_dim` values would
            # produce a vector of the wrong size; skip it.
            continue
        embeddings[fields[0]] = np.array([float(v) for v in fields[1:]])
print('Loaded embeddings in %.2fs' % (time() - start))

# Create embedding weight matrix that corresponds to the ids we've already set for x
# https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
start = time()
embedding_weights = np.zeros((len(vocab), embedding_dim))
for word, idx in vocab.items():
    vector = embeddings.get(word)
    if vector is None:
        # Word has no pretrained vector: fall back to a random one.
        vector = np.random.normal(scale=0.6, size=(embedding_dim,))
    embedding_weights[idx] = vector
embedding_weights = torch.FloatTensor(embedding_weights).to(device)
print('Build embedding matrix in %.2fs' % (time() - start))

Loaded embeddings in 54.49s
Build embedding matrix in 0.09s


In [164]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class GRU(nn.Module):
    """Classifier for one tokenised tweet at a time.

    Pipeline: embedding -> width-1 convolution -> max pooling over time ->
    ReLU -> linear layer -> softmax.

    NOTE(review): despite the class name, no recurrent (GRU) layer is used;
    the name is kept so existing callers keep working.
    """

    def __init__(self, vocab_size, embeddings=None, dim_emb=10, n_classes=2, device=0):
        """Build the layers.

        vocab_size -- number of rows of the embedding table
        embeddings -- optional 2-D tensor of pretrained vectors; when given,
                      its second dimension overrides ``dim_emb``
        dim_emb    -- embedding size used when ``embeddings`` is None
        n_classes  -- number of output classes
        device     -- stored on the instance; not otherwise used by this class
        """
        super(GRU, self).__init__()
        self.vocab_size = vocab_size
        self.n_classes = n_classes
        self.device = device

        n_filters = 200
        if embeddings is None:
            self.dim_emb = dim_emb
            self.embedding = nn.Embedding(self.vocab_size, self.dim_emb)
        else:
            self.dim_emb = embeddings.shape[1]
            self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.conv1_1 = nn.Conv1d(self.dim_emb, n_filters, 1)
        self.pool = nn.AdaptiveMaxPool1d(1)  # max pooling over time
        self.act = nn.ReLU()
        self.fc = nn.Linear(n_filters, self.n_classes)
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x, train=False):
        """Return class probabilities for one example.

        x     -- 1-D tensor of token ids
        train -- accepted for compatibility; not used
        """
        x = self.embedding(x)    # (length, dim_emb)
        # conv input: (batch_size=1, dim_emb, length)
        x = x.t().unsqueeze(0)
        x = self.conv1_1(x)      # conv output: (1, n_filters, length)
        x = self.pool(x)         # (1, n_filters, 1)
        x = x.reshape(-1)        # (n_filters,)
        x = self.act(x)
        x = self.fc(x)           # (n_classes,)
        return self.softmax(x)

    def predict(self, x, one_hot=False):
        """Predict a class for every example in the sequence ``x``.

        Returns a NumPy array: shape ``(len(x),)`` holding class indices, or
        shape ``(len(x), n_classes)`` with a single 1 per row when
        ``one_hot`` is True.
        """
        n_examples = len(x)
        if one_hot:
            y = np.zeros((n_examples, self.n_classes))
        else:
            y = np.zeros(n_examples)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            for i in range(n_examples):
                probs = self.forward(x[i], train=False)
                # .item() gives a plain Python int, so writing into the NumPy
                # array does not depend on which device the tensor lives on.
                pred = torch.argmax(probs).item()
                if one_hot:
                    y[i, pred] = 1
                else:
                    y[i] = pred
        return y
    
def _labels_to_numpy(labels):
    """Return ``labels`` as a CPU NumPy array so scikit-learn can consume them."""
    if torch.is_tensor(labels):
        return labels.detach().cpu().numpy()
    return np.asarray(labels)


def train(x_train, y_train, x_val, y_val, vocab_size, epochs):
    """Train a ``GRU`` model one example at a time and return it.

    After every epoch the summed loss, the elapsed time and accuracy /
    weighted-F1 / macro-F1 on both splits are printed. Whenever validation
    macro-F1 improves, the model's ``state_dict`` is written to
    ``models/best-epoch<N>.model``.

    x_train, x_val -- sequences of 1-D token-id tensors
    y_train, y_val -- integer class labels aligned with the examples
    vocab_size     -- number of rows of the embedding table
    epochs         -- number of passes over ``x_train``

    Relies on the module-level ``embedding_weights``, ``GRU``, ``report`` and
    ``printProgressBar``.
    """
    import os  # local import: only needed to create the checkpoint directory

    print('Start Training!')
    mlp = GRU(vocab_size, embedding_weights)
    if torch.cuda.is_available():
        mlp.cuda()
    optimizer = optim.Adam(mlp.parameters(), lr=0.001)

    n_train = len(x_train)
    metrics = ['accuracy', 'f1-weighted', 'f1-macro']
    y_train_np = _labels_to_numpy(y_train)
    y_val_np = _labels_to_numpy(y_val)
    max_f1 = 0

    for epoch in range(epochs):
        print('-------------')
        print('Epoch %d' % epoch)
        start = time()
        total_loss = 0.0
        # One optimisation step per example (batch size 1).
        for i in range(n_train):
            mlp.zero_grad()
            probs = mlp(x_train[i])
            # Negative log-likelihood of the true class. Indexing the target
            # probability avoids the inf * 0 = NaN of a one-hot dot product
            # and needs no helper tensor on a particular device.
            target = int(y_train[i])
            loss = -torch.log(probs[target])
            # .item() keeps a plain float; summing the tensor itself would keep
            # every step's autograd history alive for the whole epoch.
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            done = i + 1
            # Refresh every second example and always on the last one, so the
            # bar ends at exactly 100% for odd-sized training sets too.
            if done % 2 == 0 or done == n_train:
                p = '%d/%d' % (done, n_train)
                printProgressBar(done, n_train, prefix=p, length=60)
        print('loss: %.4f' % total_loss)
        print('time: %.2fs' % (time() - start))

        t_acc, t_f1w, t_f1m = report(y_train_np, mlp.predict(x_train), metrics=metrics)
        v_acc, v_f1w, v_f1m = report(y_val_np, mlp.predict(x_val), metrics=metrics)
        print('train_acc: %.4f   f1_weighted: %.4f  f1_macro: %.4f' % (t_acc, t_f1w, t_f1m))
        print('val_acc:   %.4f   f1_weighted: %.4f  f1_macro: %.4f' % (v_acc, v_f1w, v_f1m))
        if v_f1m > max_f1:
            print('New best macro f1! Saving model.')
            os.makedirs('models', exist_ok=True)  # torch.save does not create directories
            torch.save(mlp.state_dict(), 'models/best-epoch%d.model' % epoch)
            max_f1 = v_f1m
    return mlp

In [None]:
# Number of training examples to use. A small value is helpful for quick
# debugging; set it to len(x_train) to train on everything.
limit = 300
epochs = 2
mlp = train(x_train[:limit], y_train[:limit], x_val, y_val, len(vocab), epochs)

Start Training!
-------------
Epoch 0
300/300 |████████████████████████████████████████████████████████████| 100.0% 
loss: 206.4463
time: 0.53s
train_acc: 0.4433   f1_weighted: 0.3666  f1_macro: 0.4103
val_acc:   0.3797   f1_weighted: 0.2819  f1_macro: 0.3400
New best macro f1! Saving model.
-------------
Epoch 1
300/300 |████████████████████████████████████████████████████████████| 100.0% 
loss: 183.2317
time: 0.53s


In [159]:
# Sanity check: size of the second dimension of the embedding matrix.
embedding_weights.shape[1]

25