In [32]:
import csv
import os
from time import time

import numpy as np
import torch
from nltk.tokenize import TweetTokenizer
from sklearn import metrics as skmetrics

# --- Configuration -----------------------------------------------------------
# When True, hashtag tokens found in the segmentation table are replaced by
# their whitespace-split segmentation during tokenisation.
use_segmentation = False
olid_data = 'data/OLIDv1.0/olid-training-v1.0.tsv'
olid_hashtags = 'data/olid_segmentations.tsv'

# `device` is handed to `Tensor.to(...)` later in the notebook:
# CUDA device index 0 when a GPU is present, otherwise None.
if torch.cuda.is_available():
    device = 0
    print('CUDA available! Using device %d (%s)' % (device, torch.cuda.get_device_name(device)))
else:
    device = None
    print('CUDA unavailable! Using CPU.')

# Seed both RNGs used by the notebook: NumPy drives the data shuffle, while
# PyTorch drives parameter initialisation. Seeding only NumPy leaves model
# initialisation different on every run.
seed = 1234
np.random.seed(seed) # helps reproducibility
torch.manual_seed(seed)

CUDA unavailable! Using CPU.


In [33]:
def to_categorical(y, num_classes):
    """One-hot encode integer class labels.

    Builds a ``num_classes`` x ``num_classes`` identity matrix of dtype
    ``uint8`` and indexes it with ``y``, so each label selects the matching
    identity row.
    """
    # https://discuss.pytorch.org/t/is-there-something-like-keras-utils-to-categorical-in-pytorch/5960
    identity_rows = np.identity(num_classes, dtype='uint8')
    return identity_rows[y]

def report(y, y_hat, metrics=['accuracy', 'precision', 'recall', 'f1', 'auc']):
    """Compute the requested classification scores for predictions ``y_hat``.

    Scores are returned as a list in a fixed order (accuracy, precision,
    recall, f1, auc), keeping only those named in ``metrics``; ``'acc'`` is
    accepted as an alias for ``'accuracy'``. Unknown names are ignored.

    Note that F1 is computed with ``average='weighted'`` whereas precision
    and recall are called without an ``average`` argument.
    """
    # (accepted names, deferred computation) in output order; a scorer only
    # runs when one of its names was requested.
    scorers = (
        (('accuracy', 'acc'), lambda: skmetrics.accuracy_score(y, y_hat)),
        (('precision',), lambda: skmetrics.precision_score(y, y_hat)),
        (('recall',), lambda: skmetrics.recall_score(y, y_hat)),
        (('f1',), lambda: skmetrics.f1_score(y, y_hat, average='weighted')),
        (('auc',), lambda: skmetrics.roc_auc_score(y, y_hat)),
    )
    return [
        compute()
        for names, compute in scorers
        if any(name in metrics for name in names)
    ]

# https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
def printProgressBar (iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    '''
    Draw a single-line text progress bar; intended to be called from a loop.
    Each call rewrites the current console line (carriage return at both
    ends); a newline is emitted only once iteration equals total.
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : text shown before the bar (Str)
        suffix      - Optional  : text shown after the percentage (Str)
        decimals    - Optional  : number of decimals in the percentage (Int)
        length      - Optional  : bar width in characters (Int)
        fill        - Optional  : character used for the completed part (Str)
    '''
    pct_text = '{:.{places}f}'.format(100 * (iteration / float(total)), places=decimals)
    done_cells = int(length * iteration // total)
    todo_cells = length - done_cells
    rendered = ''.join((fill * done_cells, '-' * todo_cells))
    print('\r%s |%s| %s%% %s' % (prefix, rendered, pct_text, suffix), end = '\r')
    # Finish the line once the loop is complete
    if iteration == total:
        print()

In [34]:
# Label convention:
#   y == 0 if not offensive
#   y == 1 if offensive
start = time()

# --- Read tweets (column 1) and labels (column 2) from the OLID TSV ----------
# NOTE(review): csv.reader's default quoting treats '"' as a quote character;
# if tweets can start with a double quote, rows may be merged. Consider
# quoting=csv.QUOTE_NONE after confirming the expected row count.
x_raw = []
labels = []
with open(olid_data, encoding='utf-8') as f:
    raw = csv.reader(f, delimiter='\t')
    next(raw, None)  # skip the header row
    for row in raw:
        x_raw.append(row[1])
        labels.append(0 if row[2] == 'NOT' else 1)
y = np.array(labels)

# The original cell built `bad_words` from `f.readlines()` after the csv
# reader had already consumed the file, so it was always an empty list.
# The name is kept, with that same value, in case later cells reference it.
bad_words = []

# --- Hashtag -> segmentation lookup ------------------------------------------
# Opened in a `with` block so the handle is closed, and with the same explicit
# encoding as the tweet file instead of the platform default.
segmentations = {}
with open(olid_hashtags, encoding='utf-8') as seg_file:
    for line in seg_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        terms = [term.strip() for term in line.split('\t')]
        hashtag, segmentation = terms[0], terms[1]
        segmentations[hashtag] = segmentation

# --- Tokenise and map every token to an integer id ---------------------------
tokenizer = TweetTokenizer(preserve_case=False)
x = []
vocab = {}
for tweet in x_raw:
    example = []
    for token in tokenizer.tokenize(tweet):
        # if it's a hashtag, look up segmentation
        if use_segmentation and token.startswith('#') and token[1:] in segmentations:
            sequence = segmentations[token[1:]].split()
        else:
            sequence = [token]

        for word in sequence:
            if word not in vocab:
                # ids are assigned in order of first appearance
                vocab[word] = len(vocab)
            example.append(vocab[word])
    x.append(example)

# --- Randomly shuffle (same permutation for inputs and labels) ---------------
order = np.arange(len(x))
np.random.shuffle(order)
x = [torch.LongTensor(x[k]).to(device) for k in order]
y = torch.FloatTensor(to_categorical(y[order], 2)).to(device)

# --- Train / validation split ------------------------------------------------
split = 0.7
split_index = int(len(x) * split)
x_train = x[:split_index]
y_train = y[:split_index]
x_val = x[split_index:]
y_val = y[split_index:]
print('Loaded data in %.2fs' % (time() - start))

Loaded data in 2.35s


In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CNN(nn.Module):
    """Text classifier: embedding -> width-1 Conv1d -> global max pool ->
    ReLU -> linear -> softmax.

    ``forward`` processes one token-id sequence per call (it adds a leading
    batch dimension of size 1 itself) and returns class probabilities.
    """

    def __init__(self, X, Y, VOCAB_SIZE, DIM_EMB=10, NUM_CLASSES=2, device=0):
        """Create the layers.

        ``X`` and ``Y`` are accepted but not used here. ``device`` is only
        stored on the instance; no parameters are moved by the constructor.
        """
        super().__init__()
        self.VOCAB_SIZE = VOCAB_SIZE
        self.DIM_EMB = DIM_EMB
        self.NUM_CLASSES = NUM_CLASSES
        self.device = device

        n_filters = 200
        self.embedding = nn.Embedding(VOCAB_SIZE, DIM_EMB)
        self.conv1_1 = nn.Conv1d(DIM_EMB, n_filters, kernel_size=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.act = nn.ReLU()
        self.fc = nn.Linear(n_filters, NUM_CLASSES)
        # Normalises over dim 0, i.e. over the entries of the 1-D logits vector.
        self.softmax = nn.Softmax(dim=0)

    def forward(self, X, train=False):
        """Return softmax class probabilities for one sequence.

        ``train`` is accepted but not used. Shape comments below assume ``X``
        is a 1-D tensor of token ids -- TODO confirm against callers.
        """
        embedded = self.embedding(X)                # (seq_len, DIM_EMB)
        # Conv1d wants (batch, channels, length): transpose, then add batch=1.
        channels_first = embedded.t().unsqueeze(0)  # (1, DIM_EMB, seq_len)
        features = self.conv1_1(channels_first)     # (1, n_filters, seq_len)
        pooled = self.pool(features)                # (1, n_filters, 1)
        hidden = self.act(pooled.reshape(-1))       # (n_filters,)
        logits = self.fc(hidden)                    # (NUM_CLASSES,)
        return self.softmax(logits)
    
def Eval(X, Y, mlp):
    """Return the fraction of examples the model classifies correctly.

    For each ``X[i]`` the model's highest-scoring class is turned into a
    one-hot vector shaped like ``Y[i]`` and counted as correct only when it
    equals ``Y[i]`` exactly.

    Args:
        X: indexable collection of model inputs.
        Y: indexable collection of one-hot targets (tensor, array or list rows).
        mlp: object exposing ``forward(x, train=False)`` that returns a
            1-D score tensor.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when ``X`` is empty.
    """
    n_examples = len(X)
    if n_examples == 0:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return 0.0

    num_correct = 0
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for i in range(n_examples):
            probs = mlp.forward(X[i], train=False)
            pred = int(torch.argmax(probs))
            # Compare on CPU in torch; converting a CUDA tensor with
            # np.array(...) raises, which broke evaluation on GPU.
            target = torch.as_tensor(Y[i]).detach().cpu()
            onehot = torch.zeros_like(target)
            onehot[pred] = 1
            if torch.equal(onehot, target):
                num_correct += 1
    return float(num_correct) / float(n_examples)

def Train(X, Y, val_X, val_Y, vocab_size, n_iter):
    """Train a ``CNN`` with Adam (lr=0.001), one example per step.

    After every epoch the summed loss, elapsed time, and train/validation
    accuracy are printed. Whenever validation accuracy beats the best value
    so far (the bar starts at 0.5), the model's ``state_dict`` is written to
    ``models/best-epoch<epoch>.model``.

    Args:
        X: indexable training inputs; ``X[i]`` is fed to the model.
        Y: indexable targets; ``Y[i]`` is dotted with the negative log
            probabilities, so it must be a 1-D tensor matching the model
            output.
        val_X, val_Y: validation inputs/targets, passed to ``Eval``.
        vocab_size: embedding table size handed to ``CNN``.
        n_iter: number of epochs.

    Returns:
        The model as it stands after the final epoch (not necessarily the
        best checkpoint).
    """
    print("Start Training!")
    mlp = CNN(X, Y, vocab_size)
    if torch.cuda.is_available():
        mlp.cuda()
    optimizer = optim.Adam(mlp.parameters(), lr=0.001)
    batch_size = 1
    max_acc = 0.5
    # Floor for probabilities before the log: log(0) = -inf, and -inf * 0 in
    # the dot product below would turn the loss (and gradients) into NaN.
    min_prob = 1e-12
    n_examples = len(X)
    for epoch in range(n_iter):
        print('-------------')
        print('Epoch %d' % epoch)
        start = time()
        total_loss = 0.0
        for i in range(0, n_examples, batch_size):
            mlp.zero_grad()
            probs = mlp(X[i])
            y_onehot = Y[i]
            loss = torch.neg(torch.log(probs.clamp(min=min_prob))).dot(y_onehot)
            # .item() detaches the value; accumulating the tensor itself kept
            # every step's autograd graph alive for the whole epoch.
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            # Also draw on the last step so the bar reaches 100% and emits
            # its trailing newline before the epoch summary is printed.
            if i % 10 == 0 or i + 1 == n_examples:
                p = ' %d/%d' % (i, len(X))
                printProgressBar(i+1, len(X), prefix=p, length=60)
        print('loss: %.4f' % total_loss)
        print('time: %.2fs' % (time() - start))
        train_acc = Eval(X, Y, mlp)
        val_acc = Eval(val_X, val_Y, mlp)
        print('train_acc: %.4f' % train_acc)
        print('val_acc: %.4f' % val_acc)
        if val_acc > max_acc:
            print('New best! Saving model.')
            # torch.save does not create missing directories.
            os.makedirs('models', exist_ok=True)
            torch.save(mlp.state_dict(), 'models/best-epoch%d.model' % epoch)
            max_acc = val_acc
    return mlp

In [None]:
# Debug-sized run: train on the first `limit` training examples for `epochs`
# epochs, while validating on the whole validation split.
limit = 100 # helpful for quick debugging
epochs = 2
mlp = Train(
    X=x_train[:limit],
    Y=y_train[:limit],
    val_X=x_val,
    val_Y=y_val,
    vocab_size=len(vocab),
    n_iter=epochs,
)