# Preprocessing

In [None]:
import os
import glob
import csv
import re

In [None]:
# %% bash
# set -e
# CLASSPATH="lib:lib/stanford-parser/stanford-parser.jar:lib/stanford-parser/stanford-parser-3.5.1-models.jar"
# javac -cp $CLASSPATH lib/*.java

In [None]:
def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def make_dirs(dirs):
    for d in dirs:
        if not os.path.exists(d):
            os.makedirs(d)
            
def dependency_parse(filepath, cp='', tokenize=True):
    print('\nDependency parsing ' + filepath)
    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    tokpath = os.path.join(dirpath, filepre + '.toks')
    parentpath = os.path.join(dirpath, filepre + '.parents')
    relpath = os.path.join(dirpath, filepre + '.rels')
    tokenize_flag = '-tokenize - ' if tokenize else ''
    cmd = ('java -cp %s DependencyParse -tokpath %s -parentpath %s -relpath %s %s < %s'
           % (cp, tokpath, parentpath, relpath, tokenize_flag, filepath))
    os.system(cmd)


def constituency_parse(filepath, cp='', tokenize=True):
    print('\nConst parsing ' + filepath)
    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    tokpath = os.path.join(dirpath, filepre + '.toks')
    parentpath = os.path.join(dirpath, filepre + '.cparents')
    tokenize_flag = '-tokenize - ' if tokenize else ''
    cmd = ('java -cp %s ConstituencyParse -tokpath %s -parentpath %s %s < %s'
           % (cp, tokpath, parentpath, tokenize_flag, filepath))
    os.system(cmd)


def build_vocab(filepaths, dst_path, lowercase=True):
    vocab = set()
    for filepath in filepaths:
        with open(filepath) as f:
            for line in f:
                if lowercase:
                    line = line.lower()
                vocab |= set(line.split())
    with open(dst_path, 'w') as f:
        for w in sorted(vocab):
            f.write(w + '\n')


def split(filepath, dst_dir):
    with open(filepath) as datafile, \
            open(os.path.join(dst_dir, 'tweets.txt'), 'w') as tfile, \
            open(os.path.join(dst_dir, 'labels.txt'), 'w') as lfile:
        
        for line in csv.reader(datafile, delimiter=','):
            label = line[-2]
            tweet = line[-1].strip()
            tweet = removePattern(tweet, "@[\w]*")
            tweet = tweet.replace("#", "") # Removing '#' from hashtags
            tweet = re.sub("[^a-zA-Z]+", " ", tweet) # Removing punctuation and special characters
            tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"URL", tweet)
            tweet = re.sub(" +", " ", tweet)
            tweet = tweet.strip().lower()
            if len(tweet) > 1:
                tfile.write(tweet + "\n")
                lfile.write(label + "\n")


def parse(dirpath, cp=''):
    dependency_parse(os.path.join(dirpath, 'tweets.txt'), cp=cp, tokenize=True)
    constituency_parse(os.path.join(dirpath, 'tweets.txt'), cp=cp, tokenize=True)

print('=' * 80)
print('Preprocessing dataset')
print('=' * 80)

base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
data_dir = os.path.join(base_dir, 'data')
#sick_dir = os.path.join(data_dir, 'sick')
hate_dir = os.path.join(data_dir, 'hate-speech')
lib_dir = os.path.join(base_dir, 'lib')
train_dir = os.path.join(hate_dir, 'train')
dev_dir = os.path.join(hate_dir, 'dev')
test_dir = os.path.join(hate_dir, 'test')
make_dirs([train_dir, dev_dir, test_dir])

# java classpath for calling Stanford parser
classpath = ':'.join([
    lib_dir,
    os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
    os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar')])

# split into separate files
split(os.path.join(hate_dir, 'hate_train.txt'), train_dir)
split(os.path.join(hate_dir, 'hate_dev.txt'), dev_dir)
split(os.path.join(hate_dir, 'hate_test.txt'), test_dir)

# parse sentences
parse(train_dir, cp=classpath)
parse(dev_dir, cp=classpath)
parse(test_dir, cp=classpath)

# get vocabulary
build_vocab(
    glob.glob(os.path.join(hate_dir, '*/*.toks')),
    os.path.join(hate_dir, 'vocab.txt'))
build_vocab(
    glob.glob(os.path.join(hate_dir, '*/*.toks')),
    os.path.join(hate_dir, 'vocab-cased.txt'),
    lowercase=False)

# Tree LSTM

In [1]:
import os
import random
import logging

import torch
import torch.nn as nn
import torch.optim as optim

# IMPORT CONSTANTS
from treelstm import Constants
# NEURAL NETWORK MODULES/LAYERS
from treelstm import SimilarityTreeLSTM
# DATA HANDLING CLASSES
from treelstm import Vocab
# DATASET CLASS FOR hate DATASET
from treelstm import HATEDataset
# METRICS CLASS FOR EVALUATION
from treelstm import Metrics
# UTILITY FUNCTIONS
from treelstm import utils
# TRAIN AND TEST HELPER FUNCTIONS
from treelstm import Trainer
# CONFIG PARSER
# from config import parse_args

In [14]:
class Args:
    data = "../../treelstm.pytorch/data/hate-speech"
    glove = "../../treelstm.pytorch/data/glove" 
    save = "checkpoints/"
    expname = "test"
    input_dim = 300
    mem_dim = 150
    hidden_dim = 250
    num_classes = 3
    freeze_embed = True
    epochs = 3
    batchsize = 64
    lr = 0.01
    wd = 1e-4
    sparse = False
    optim = "adagrad"
    seed = 69
    sparse = False
    cuda = True
args = Args()

In [15]:
# MAIN BLOCK
# args = parse_args()
# global logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
# file logger
fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)
# console logger
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)
# argument validation
args.cuda = args.cuda and torch.cuda.is_available()
device = torch.device("cuda:0" if args.cuda else "cpu")
if args.sparse and args.wd != 0:
    logger.error('Sparsity and weight decay are incompatible, pick one!')
    exit()
logger.debug(args)
torch.manual_seed(args.seed)
random.seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = True
if not os.path.exists(args.save):
    os.makedirs(args.save)

train_dir = os.path.join(args.data, 'train/')
dev_dir = os.path.join(args.data, 'dev/')
test_dir = os.path.join(args.data, 'test/')

# write unique words from all token files
hate_vocab_file = os.path.join(args.data, 'hate.vocab')
if not os.path.isfile(hate_vocab_file):
    #token_files_b = [os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir]]
    #token_files_a = [os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir]]
    token_files = [os.path.join(split, 'tweets.toks') for split in [train_dir, dev_dir, test_dir]]
    #token_files = token_files_a + token_files_b
    hate_vocab_file = os.path.join(args.data, 'hate.vocab')
    utils.build_vocab(token_files, hate_vocab_file)

# get vocab object from vocab file previously written
vocab = Vocab(filename=hate_vocab_file,
              data=[Constants.PAD_WORD, Constants.UNK_WORD,
                    Constants.BOS_WORD, Constants.EOS_WORD])
logger.debug('==> HATE vocabulary size : %d ' % vocab.size())

# load HATE dataset splits
train_file = os.path.join(args.data, 'hate_train.pth')
if os.path.isfile(train_file):
    train_dataset = torch.load(train_file)
else:
    train_dataset = HATEDataset(train_dir, vocab, args.num_classes)
    torch.save(train_dataset, train_file)
logger.debug('==> Size of train data   : %d ' % len(train_dataset))
dev_file = os.path.join(args.data, 'hate_dev.pth')
if os.path.isfile(dev_file):
    dev_dataset = torch.load(dev_file)
else:
    dev_dataset = HATEDataset(dev_dir, vocab, args.num_classes)
    torch.save(dev_dataset, dev_file)
logger.debug('==> Size of dev data     : %d ' % len(dev_dataset))
test_file = os.path.join(args.data, 'hate_test.pth')
if os.path.isfile(test_file):
    test_dataset = torch.load(test_file)
else:
    test_dataset = HATEDataset(test_dir, vocab, args.num_classes)
    torch.save(test_dataset, test_file)
logger.debug('==> Size of test data    : %d ' % len(test_dataset))

# initialize model, criterion/loss_function, optimizer
model = SimilarityTreeLSTM(
    vocab.size(),
    args.input_dim,
    args.mem_dim,
    args.hidden_dim,
    args.num_classes,
    args.sparse,
    args.freeze_embed)
#criterion = nn.KLDivLoss()
criterion = nn.BCEWithLogitsLoss()

# for words common to dataset vocab and GLOVE, use GLOVE vectors
# for other words in dataset vocab, use random normal vectors
emb_file = os.path.join(args.data, 'hate_embed.pth')
if os.path.isfile(emb_file):
    emb = torch.load(emb_file)
else:
    # load glove embeddings and vocab
    glove_vocab, glove_emb = utils.load_word_vectors(
        os.path.join(args.glove, 'glove.840B.300d'))
    logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
    emb = torch.zeros(vocab.size(), glove_emb.size(1), dtype=torch.float, device=device)
    emb.normal_(0, 0.05)
    # zero out the embeddings for padding and other special words if they are absent in vocab
    for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD,
                                Constants.BOS_WORD, Constants.EOS_WORD]):
        emb[idx].zero_()
    for word in vocab.labelToIdx.keys():
        if glove_vocab.getIndex(word):
            emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
    torch.save(emb, emb_file)
# plug these into embedding matrix inside model
model.emb.weight.data.copy_(emb)

model.to(device), criterion.to(device)
if args.optim == 'adam':
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()), lr=args.lr, weight_decay=args.wd)
elif args.optim == 'adagrad':
    optimizer = optim.Adagrad(filter(lambda p: p.requires_grad,
                                     model.parameters()), lr=args.lr, weight_decay=args.wd)
elif args.optim == 'sgd':
    optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                 model.parameters()), lr=args.lr, weight_decay=args.wd)
metrics = Metrics(args.num_classes)

# create trainer object for training and testing
trainer = Trainer(args, model, criterion, optimizer, device)

best = -float('inf')
for epoch in range(args.epochs):
    train_loss = trainer.train(train_dataset)
    train_loss, train_pred = trainer.test(train_dataset)
    dev_loss, dev_pred = trainer.test(dev_dataset)
    test_loss, test_pred = trainer.test(test_dataset)

    train_acc = metrics.accuracy(train_pred, train_dataset.labels)
    train_pearson = metrics.pearson(train_pred, train_dataset.labels)
    train_mse = metrics.mse(train_pred, train_dataset.labels)
    train_f1 = metrics.mse(train_pred, train_dataset.labels)
    logger.info('==> Epoch {}, Train \tLoss: {}\tAccuracy: {}\tPearson: {}\tMSE: {}\tF1: {}'.format(
        epoch, train_loss, train_acc, train_pearson, train_mse,train_f1))
    dev_acc = metrics.accuracy(dev_pred, dev_dataset.labels)
    dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
    dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
    logger.info('==> Epoch {}, Dev \tLoss: {}\tAccuracy: {}\tPearson: {}\tMSE: {}'.format(
        epoch, dev_loss, dev_acc, dev_pearson, dev_mse))
    test_acc = metrics.accuracy(test_pred, test_dataset.labels)
    test_pearson = metrics.pearson(test_pred, test_dataset.labels)
    test_mse = metrics.mse(test_pred, test_dataset.labels)
    test_rec = metrics.recall(test_pred, test_dataset.labels)
    test_f1 = metrics.f1(test_pred, test_dataset.labels)
    logger.info('==> Epoch {}, Test \tLoss: {}\tAccuracy: {}\tPearson: {}\tMSE: {}\t Recall: {}\t F1: {}'.format(
        epoch, test_loss, test_acc, test_pearson, test_mse, test_rec, test_f1))

#     if best < test_pearson:
#         best = test_pearson
#         checkpoint = {
#             'model': trainer.model.state_dict(),
#             'optim': trainer.optimizer,
#             'pearson': test_pearson, 'mse': test_mse,
#             'args': args, 'epoch': epoch
#         }
#         logger.debug('==> New optimum found, checkpointing everything now...')
#         torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))

[2019-11-10 16:34:50,376] DEBUG:__main__:<__main__.Args object at 0x148681abd320>
[2019-11-10 16:34:50,376] DEBUG:__main__:<__main__.Args object at 0x148681abd320>
[2019-11-10 16:34:50,376] DEBUG:__main__:<__main__.Args object at 0x148681abd320>
[2019-11-10 16:34:50,376] DEBUG:__main__:<__main__.Args object at 0x148681abd320>
[2019-11-10 16:34:50,376] DEBUG:__main__:<__main__.Args object at 0x148681abd320>
[2019-11-10 16:34:50,376] DEBUG:__main__:<__main__.Args object at 0x148681abd320>
[2019-11-10 16:34:50,413] DEBUG:__main__:==> HATE vocabulary size : 23845 
[2019-11-10 16:34:50,413] DEBUG:__main__:==> HATE vocabulary size : 23845 
[2019-11-10 16:34:50,413] DEBUG:__main__:==> HATE vocabulary size : 23845 
[2019-11-10 16:34:50,413] DEBUG:__main__:==> HATE vocabulary size : 23845 
[2019-11-10 16:34:50,413] DEBUG:__main__:==> HATE vocabulary size : 23845 
[2019-11-10 16:34:50,413] DEBUG:__main__:==> HATE vocabulary size : 23845 
[2019-11-10 16:34:54,595] DEBUG:__main__:==> Size of train

[2019-11-10 16:55:12,060] INFO:__main__:==> Epoch 1, Dev 	Loss: 0.1426872403385311	Accuracy: 0.9262948207171314	Pearson: 0.7287521362304688	MSE: 0.10358566045761108
[2019-11-10 16:55:12,088] INFO:__main__:==> Epoch 1, Test 	Loss: 0.15159676437082098	Accuracy: 0.9106788408098452	Pearson: 0.6482273936271667	MSE: 0.11909487843513489	 Recall: 0.6413503605948032	 F1: 0.8968644035605734
[2019-11-10 16:55:12,088] INFO:__main__:==> Epoch 1, Test 	Loss: 0.15159676437082098	Accuracy: 0.9106788408098452	Pearson: 0.6482273936271667	MSE: 0.11909487843513489	 Recall: 0.6413503605948032	 F1: 0.8968644035605734
[2019-11-10 16:55:12,088] INFO:__main__:==> Epoch 1, Test 	Loss: 0.15159676437082098	Accuracy: 0.9106788408098452	Pearson: 0.6482273936271667	MSE: 0.11909487843513489	 Recall: 0.6413503605948032	 F1: 0.8968644035605734
[2019-11-10 16:55:12,088] INFO:__main__:==> Epoch 1, Test 	Loss: 0.15159676437082098	Accuracy: 0.9106788408098452	Pearson: 0.6482273936271667	MSE: 0.11909487843513489	 Recall: 0.