## Utilities

In [None]:
from transformers import *
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import codecs
from nltk.corpus import stopwords
import string
from scipy import sparse
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from math import log
import operator

import torch 
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.autograd as autograd
import math
import spacy
# Fix the seeds of the PyTorch, Python and NumPy random number generators.
torch.manual_seed(123)

import random
random.seed(123)

np.random.seed(123)
# spaCy English pipeline, loaded once at start-up.
nlp = spacy.load("en_core_web_sm")

# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU
# instead of failing later on the first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import sys
import os

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import codecs
import string
from scipy import sparse
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.autograd as autograd
import math
import torch.utils.data as data_utils
import transformers

## Load Model

In [None]:
# Classes used to build the encoder, its tokenizer and its configuration,
# plus the name of the pretrained checkpoint they are loaded from.
model_class = BertModel
tokenizer_class = BertTokenizer
config_class = BertConfig
pretrained_weights = 'bert-base-uncased'

In [None]:
# Checkpoint name used to load the tokenizer.
pretrained_weights = 'bert-base-uncased'
# Module-level tokenizer shared by the tokenisation and vectorisation cells.
tokenizer = tokenizer_class.from_pretrained(
    pretrained_weights,
    do_lower_case=True,
)

## Data

In [None]:
def load_data(path, avg):
    """Read a tab-separated file, binarise its scores and split it 80/20.

    A ``label`` column is derived from the ``average`` column: 1 where the
    value is strictly greater than ``avg``, 0 otherwise. The ``text`` column
    and the labels are then split with a fixed ``random_state`` of 42.

    Args:
        path: location of the tab-separated file; it must provide the
            ``text`` and ``average`` columns.
        avg: threshold applied to the ``average`` column.

    Returns:
        ``(train_texts, train_labels, dev_texts, dev_labels)``, each obtained
        from the ``.values`` of the corresponding split.
    """
    from sklearn.model_selection import train_test_split

    frame = pd.read_table(path, sep = '\t')
    frame['label'] = np.where(frame['average'] > avg, 1, 0)
    train_text, dev_text, train_labels, dev_labels = train_test_split(
        frame['text'], frame['label'], test_size = 0.2, random_state = 42
    )
    return train_text.values, train_labels.values, dev_text.values, dev_labels.values

In [None]:
# Tab-separated training file read by load_data().
path = 'task_a_distant.tsv'
# Rows whose 'average' value is strictly above this threshold get label 1.
avg = 0.5
# load_data() reads the file itself, so it is given the path (no DataFrame
# named `df` exists at this point).
corpus, y, dev_corpus, dev_y = load_data(path, avg)

In [None]:
def _tokenize_corpus(texts):
    """Tokenise every entry of ``texts`` with the module-level ``tokenizer``.

    A new list is returned and ``texts`` is left untouched, so the raw
    strings stay available. An entry that cannot be tokenised is reported
    (index and error) and replaced by an empty token list: every row is then
    a list, which the later ``["[CLS]"] + tokens + ["[SEP]"]`` concatenation
    requires.
    """
    total = len(texts)
    tokenized = []
    for i, text in enumerate(texts):
        try:
            tokens = tokenizer.tokenize(text)
        except Exception as err:  # best-effort: report the row and keep going
            print (i, err)
            tokens = []
        tokenized.append(tokens)
        # Progress report every 10000 rows.
        if (i%10000 == 0):
            print (i, total)
    return tokenized


corpus_tokenized = _tokenize_corpus(corpus)
dev_corpus_tokenized = _tokenize_corpus(dev_corpus)

In [None]:
# Average token count over (at most) the first 10000 tokenised texts.
# The sample is capped at the corpus size so a smaller corpus does not raise
# IndexError, and an empty corpus reports 0.0 instead of dividing by zero.
sample_size = min(10000, len(corpus_tokenized))
length_ = sum(len(tokens) for tokens in corpus_tokenized[:sample_size])

print ("Average Length: ", length_/sample_size if sample_size else 0.0)

In [None]:
def vectorize(corpus, max_seq_length=64):
    """Turn tokenised texts into fixed-length BERT input features.

    Each token list is truncated to ``max_seq_length - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, converted to ids with the module-level
    ``tokenizer`` and right-padded with zeros to exactly ``max_seq_length``
    entries.

    Args:
        corpus: sequence of token lists, one list of token strings per text.
        max_seq_length: length of every output row, including the two special
            tokens. Defaults to 64, the value that used to be hard-coded.

    Returns:
        ``(input_ids_list, segment_ids_list, input_mask_list)``: three lists
        with one row of ``max_seq_length`` ints per text. Segment ids are all
        0; the mask is 1 on real tokens and 0 on padding.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to fit [CLS] and [SEP]")

    token_budget = max_seq_length - 2
    input_ids_list = []
    segment_ids_list = []
    input_mask_list = []
    for tokens in corpus:
        wrapped = ["[CLS]"] + tokens[:token_budget] + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(wrapped)
        # Truncation above guarantees len(input_ids) <= max_seq_length, so the
        # padding length is never negative.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids_list.append(input_ids + padding)
        # Single-sentence input: every position belongs to segment 0.
        segment_ids_list.append([0] * max_seq_length)
        input_mask_list.append([1] * len(input_ids) + padding)

    return input_ids_list, segment_ids_list, input_mask_list

In [None]:
def vectorize_dev(corpus):
    """Build BERT input features for the development set.

    The development set is encoded exactly like the training set, so this
    delegates to ``vectorize`` rather than duplicating its body.

    Args:
        corpus: sequence of token lists, one list of token strings per text.

    Returns:
        ``(input_ids_list, segment_ids_list, input_mask_list)`` as produced
        by ``vectorize(corpus)``.
    """
    return vectorize(corpus)

In [None]:
# Training-set features.
(input_ids_list,
 segment_ids_list,
 input_mask_list) = vectorize(corpus_tokenized)

# Development-set features (names carry a trailing "2").
(input_ids_list2,
 segment_ids_list2,
 input_mask_list2) = vectorize_dev(dev_corpus_tokenized)

In [None]:
# Copy the label vectors into fresh arrays and append a trailing axis of
# length 1 to each (np.newaxis is None).
y = np.array(y)[..., None]
dev_y = np.array(dev_y)[..., None]

In [None]:
# Convert the per-example feature lists into 2-D integer arrays.
input_ids_list, segment_ids_list, input_mask_list = np.array(input_ids_list), np.array(segment_ids_list), np.array(input_mask_list)
input_ids_list2, segment_ids_list2, input_mask_list2 = np.array(input_ids_list2), np.array(segment_ids_list2), np.array(input_mask_list2)

# Each dataset row is (input ids, segment ids, attention mask, label).
# The labels are the `y` / `dev_y` arrays; no `y_new` / `dev_y_new` is ever
# created, so referring to those names raised NameError.
train_dset = data_utils.TensorDataset(
    torch.from_numpy(input_ids_list).to(device),
    torch.from_numpy(segment_ids_list).to(device),
    torch.from_numpy(input_mask_list).to(device),
    torch.from_numpy(y).to(device),
)
train_loader = data_utils.DataLoader(
    train_dset,
    batch_size=32, shuffle = True
)

val_dset = data_utils.TensorDataset(
    torch.from_numpy(input_ids_list2).to(device),
    torch.from_numpy(segment_ids_list2).to(device),
    torch.from_numpy(input_mask_list2).to(device),
    torch.from_numpy(dev_y).to(device),
)
# Validation batches keep their original order (no shuffling).
val_loader = data_utils.DataLoader(
    val_dset,
    batch_size=32
)

## Model

In [None]:
class Classifier(nn.Module):
    """Encoder followed by dropout and a two-way linear classification head.

    Args:
        bert_model: encoder module. Its output is indexed with ``[0]`` and
            position 0 of the second dimension is used as the sentence
            feature (for a Hugging Face ``BertModel`` that is the hidden
            state of the first token).
        dropout_p: dropout probability applied before the linear head.
    """

    def __init__(self, bert_model, dropout_p = 0.1):
        super(Classifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_p)
        # Size the head from the encoder's config when it exposes one; fall
        # back to 768, the width that used to be hard-coded.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.classifier = nn.Linear(hidden_size, 2)
        self.classifier.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, input_ids, segment_ids, input_mask, embeds_only = False, embeds = None):
        """Return ``(log_probs, features)`` for a batch.

        Args:
            input_ids, segment_ids, input_mask: encoder inputs; ignored when
                ``embeds_only`` is true.
            embeds_only: when true, skip the encoder and classify ``embeds``.
            embeds: precomputed features, required when ``embeds_only`` is
                true and ignored otherwise.

        Returns:
            ``log_probs``: log-softmax over the two classes (dim 1).
            ``features``: the tensor fed to dropout, i.e. the encoder feature
            or ``embeds`` itself.

        Raises:
            ValueError: if ``embeds_only`` is true but ``embeds`` is None.
        """
        if embeds_only:
            if embeds is None:
                raise ValueError("embeds must be provided when embeds_only=True")
            x1 = embeds
        else:
            x1 = self.bert(input_ids, token_type_ids = segment_ids, attention_mask = input_mask)[0]
            x1 = x1[:,0]
        logits = self.classifier(self.dropout(x1))
        y = torch.log_softmax(logits, dim = 1)
        return y, x1

## Training

In [None]:
def train_epoch(model, dataloader, loss_function, optimizer, epoch_num):
    """Run one training pass over ``dataloader`` and print loss and macro-F1.

    Args:
        model: module called as ``model(input_ids, segment_ids, input_mask)``.
            It may return the log-probabilities directly or a tuple whose
            first element is the log-probabilities.
        dataloader: yields ``(input_ids, segment_ids, input_mask, label)``
            batches.
        loss_function: callable ``loss_function(pred, target)`` returning a
            scalar tensor.
        optimizer: optimiser stepped once per batch.
        epoch_num: epoch index, used only in the printed messages.

    Returns:
        None. The average per-batch loss and the macro-F1 over the epoch are
        printed.
    """
    from sklearn.metrics import f1_score

    # Moving the model is loop-invariant, so do it once up front.
    model.to(device)
    model.train()
    total_loss = 0.0
    count = 0
    truth_res = []
    pred_res = []
    for input_ids, segment_ids, input_mask, label in dataloader:
        input_ids = input_ids.to(device)
        segment_ids = segment_ids.to(device)
        input_mask = input_mask.to(device)
        label = label.to(device)

        model.zero_grad()
        output = model(input_ids, segment_ids, input_mask)
        # The classifier returns (log_probs, features); keep only the
        # log-probabilities, while still accepting a bare tensor.
        pred = output[0] if isinstance(output, (tuple, list)) else output
        loss = loss_function(pred, label.view(-1))
        loss.backward()
        optimizer.step()

        pred_res.append(pred.detach().max(1)[1].cpu())
        truth_res.append(label.detach().view(-1).cpu())
        batch_loss = loss.item()
        total_loss += batch_loss
        count += 1
        if count % 5000 == 0:
            print('[TRAIN] epoch: %d iterations: %d loss :%g' % (epoch_num, count, batch_loss))

    if count == 0:
        # Nothing to average or score on an empty loader.
        print('[TRAIN] epoch: %d done! no batches were provided' % epoch_num)
        return

    # One loss value is accumulated per batch, so average over the number of
    # batches rather than over the size of some global dataset.
    avg_loss = total_loss / count
    f1 = f1_score(torch.cat(truth_res).numpy(), torch.cat(pred_res).numpy(), average = 'macro')
    print('[TRAIN] epoch: %d done! \n train avg_loss:%g , f1:%g'%(epoch_num, avg_loss, f1))

In [None]:
def eval_epoch(model, dataloader, loss_function, optimizer, epoch_num):
    """Run the model over ``dataloader`` without training and collect predictions.

    Args:
        model: module called as ``model(input_ids, segment_ids, input_mask)``.
            It may return the log-probabilities directly or a tuple whose
            first element is the log-probabilities.
        dataloader: yields batches whose first three tensors are
            ``(input_ids, segment_ids, input_mask)``; any further tensors
            (e.g. labels) are ignored.
        loss_function: unused; kept so the signature matches ``train_epoch``.
        optimizer: unused; kept so the signature matches ``train_epoch``.
        epoch_num: epoch index, used only in the printed message.

    Returns:
        ``(pred_probs_list, pred_res)``: per batch, a NumPy array with the
        probability of class 1 and a CPU tensor with the predicted class
        indices.
    """
    model.to(device)
    model.eval()
    pred_res = []
    pred_probs_list = []
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            # Take only the three model inputs so loaders that also carry
            # labels can be used without an unpacking error.
            input_ids, segment_ids, input_mask = (t.to(device) for t in batch[:3])
            output = model(input_ids, segment_ids, input_mask)
            pred = output[0] if isinstance(output, (tuple, list)) else output
            # The model emits log-probabilities; exponentiate column 1.
            pred_probs_list.append(np.exp(pred[:, 1].cpu().numpy()))
            pred_res.append(pred.max(1)[1].cpu())
    print('[EVAL] epoch: %d done!'%(epoch_num))
    return pred_probs_list, pred_res

In [None]:
# Number of passes over the training loader.
EPOCHS = 10
epoch = 0
for epoch in range(EPOCHS):
    # NOTE(review): `model`, `loss_function` and `optimizer` are not assigned
    # in any visible cell — confirm they are created before this cell runs.
    train_epoch(model, train_loader, loss_function, optimizer, epoch)
    # Rebound on every iteration, so only the last epoch's validation
    # predictions remain available after the loop.
    pred_probs_list, pred_res = eval_epoch(model, val_loader, loss_function, optimizer, epoch)

## Testing 

In [None]:
# Tab-separated test file; the texts are taken from its 'tweet' column.
test_data = pd.read_table(
    '/rapids/notebooks/atcdata/SEV T12/test_data/offenseval-tr-testset-v1.tsv',
    sep = '\t',
)
test_corpus = test_data['tweet'].values

In [None]:
# Tokenise into a new list so the raw test texts are left untouched.
# A row that cannot be tokenised is reported and stored as an empty token
# list: every entry is then a list, as the vectorisation step requires.
test_corpus_tokenized = []
for i, text in enumerate(test_corpus):
    try:
        tokens = tokenizer.tokenize(text)
    except Exception as err:  # best-effort: report the row and keep going
        print (i, err)
        tokens = []
    test_corpus_tokenized.append(tokens)

In [None]:
def vectorize_test(corpus, max_seq_length=512):
    """Turn tokenised test texts into fixed-length BERT input features.

    Each token list is truncated to ``max_seq_length - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, converted to ids with the module-level
    ``tokenizer`` and right-padded with zeros to exactly ``max_seq_length``
    entries. Without the truncation, a text longer than the limit produced an
    over-long row and tripped the length check.

    Args:
        corpus: sequence of token lists, one list of token strings per text.
        max_seq_length: length of every output row, including the two special
            tokens. Defaults to 512, the value that used to be hard-coded.

    Returns:
        ``(input_ids_list, segment_ids_list, input_mask_list)``: three lists
        with one row of ``max_seq_length`` ints per text. Segment ids are all
        0; the mask is 1 on real tokens and 0 on padding.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to fit [CLS] and [SEP]")

    token_budget = max_seq_length - 2
    input_ids_list = []
    segment_ids_list = []
    input_mask_list = []
    for tokens in corpus:
        wrapped = ["[CLS]"] + tokens[:token_budget] + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(wrapped)
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids_list.append(input_ids + padding)
        # Single-sentence input: every position belongs to segment 0.
        segment_ids_list.append([0] * max_seq_length)
        input_mask_list.append([1] * len(input_ids) + padding)

    return input_ids_list, segment_ids_list, input_mask_list

In [None]:
# Vectorise the tokenised test texts and convert each of the three feature
# lists to a NumPy array in one step.
input_ids_list_test, segment_ids_list_test, input_mask_list_test = (
    np.array(features) for features in vectorize_test(test_corpus_tokenized)
)

In [None]:
# Each test row is (input ids, segment ids, attention mask); there are no labels.
test_dset = data_utils.TensorDataset(
    *(
        torch.from_numpy(features).to(device)
        for features in (input_ids_list_test, segment_ids_list_test, input_mask_list_test)
    )
)
# No batch_size is passed, so DataLoader's default applies.
test_loader = data_utils.DataLoader(test_dset)

In [None]:
# NOTE(review): neither `ClassifierDBert` nor `bert_model` is defined in any
# visible cell (the class defined under "## Model" is `Classifier`) — confirm
# where they come from. The second argument is presumably a dropout
# probability, by analogy with `Classifier` — TODO confirm.
test_model = ClassifierDBert(bert_model, 0.6)
# Load the state dict stored in 'bert_simple_1.pth' into the model.
test_model.load_state_dict(torch.load('bert_simple_1.pth'))

In [None]:
def test_epoch(model, dataloader, optimizer):
    """Predict a class index for every example served by ``dataloader``.

    Args:
        model: module called as ``model(input_ids, segment_ids, input_mask)``.
            It may return the log-probabilities directly or a tuple whose
            first element is the log-probabilities.
        dataloader: yields batches whose first three tensors are
            ``(input_ids, segment_ids, input_mask)``.
        optimizer: unused; kept so existing callers keep working.

    Returns:
        List with one CPU tensor of predicted class indices per batch.
    """
    model.to(device)
    model.eval()
    pred_res = []
    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in dataloader:
            input_ids, segment_ids, input_mask = (t.to(device) for t in batch[:3])
            output = model(input_ids, segment_ids, input_mask)
            # Accept either a bare tensor or a (log_probs, ...) tuple.
            pred = output[0] if isinstance(output, (tuple, list)) else output
            pred_res.append(pred.max(1)[1].cpu())
    return pred_res

In [None]:
# Predict on the test loader; the result is a list with one tensor of
# predicted class indices per batch.
# NOTE(review): `optimizer` is not assigned in any visible cell, and
# test_epoch never uses it — confirm it exists before this cell runs.
pred = test_epoch(test_model, test_loader, optimizer)