In [1]:
# !pip3 install transformers
# !pip install datasets
# !pip3 install emoji

In [2]:
import random
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.optim import AdamW
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import transformers
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
from transformers import get_linear_schedule_with_warmup, AdamW, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
sys.path.insert(0, '..')
from src.data_collection import get_data

In [3]:
# Single seed shared by every random number generator used in this notebook.
manual_seed = 2022
# Seed Python's and NumPy's global generators as well as torch's: scikit-learn
# falls back to NumPy's global RNG when no random_state is given, so seeding
# torch alone leaves the data split (and any shuffling) non-reproducible.
random.seed(manual_seed)
np.random.seed(manual_seed)
torch.manual_seed(manual_seed)
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## BERTweet

### 1. bertweet-base model

BERTweet: A pre-trained language model for English Tweets
BERTweet is the first public large-scale language model pre-trained for English Tweets. It is trained following the RoBERTa pre-training procedure. The pre-training corpus consists of 850M English Tweets (16B word tokens ~ 80GB): 845M Tweets streamed from 01/2012 to 08/2019 and 5M Tweets related to the COVID-19 pandemic. The general architecture and experimental results are described in the BERTweet paper; the model card is at https://huggingface.co/vinai/bertweet-base. Note that this notebook fine-tunes the larger `vinai/bertweet-large` checkpoint (see `MODEL` below).

Cited from https://github.com/VinAIResearch/BERTweet 

"Normalize raw input tweets

Before applying BPE to the pre-training corpus of English Tweets, we tokenized these Tweets using TweetTokenizer from the NLTK toolkit and used **the emoji package to translate emotion icons into text strings** (here, each icon is referred to as a word token). 

We also normalized the Tweets by **converting user mentions and web/url links into special tokens @USER and HTTPURL**, respectively. Thus it is recommended to also apply the same pre-processing step for BERTweet-based downstream applications w.r.t. the raw input Tweets.

BERTweet provides this pre-processing step by enabling the normalization argument of its tokenizer. `tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)`"


## Data Preprocessing

We are using [ucberkeley-dlab_measuring-hate-speech](https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech) as our dataset. To learn more about it, go to notebooks/data_description.ipynb


_In our dataset, the labels are organized like this:_
- 0 : no
- 1 : yes

In [4]:
# Load the dataset through the project helper imported from src.data_collection.
hate_speech_ucb = get_data()
# Preview the first rows; the frame exposes a `text` and a `hatespeech` column.
hate_speech_ucb.head()

Fetching data...


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-f91f636a830ad73c
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/ucberkeley-dlab--measuring-hate-speech-f91f636a830ad73c/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Processing...
Done!


Unnamed: 0,text,hatespeech
0,! thank u! im transmasc and generally present ...,0
1,!Go fuck yourself faggot!,1
2,!flair [I love women and minorities],0
3,!flair [death to all niggers and gays],1
4,""" 'convoluted' genealogy of Jesus""; was that c...",0


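The labels are binarized in two steps:

1. Drop the rows labelled 1 (unclear), keeping only 0 (no) and 2 (yes).
2. Relabel 2 as 1, leaving two classes: 0 (no) and 1 (yes).

The counts below contain only the labels 0 and 1, i.e. the loaded data already has this binarization applied.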

In [5]:
# Count how many rows carry each label value to inspect the class balance.
hate_speech_ucb["hatespeech"].value_counts()

0    26608
1    12957
Name: hatespeech, dtype: int64

In [6]:
# 80% train, then split the remaining 20% evenly into dev and test (10% each).
# random_state is passed explicitly so the partition is identical on every run;
# without it scikit-learn draws from NumPy's global RNG and the split changes
# whenever the kernel is restarted.
train_data, temp_data = train_test_split(hate_speech_ucb, test_size=0.2, random_state=manual_seed)
dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=manual_seed)

In [7]:
# Report the size of each split as a sanity check on the 80/10/10 partition.
print(f"Train data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")
print(f"Dev data size: {len(dev_data)}")

Train data size: 31652
Test data size: 3957
Dev data size: 3956


## Performance Metrics

We are using precision, recall, and F1-score to measure the performance of the model.

In [8]:
def calculate_metrics(preds, labels):
    """Compute binary precision, recall and F1 from per-class scores.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the index of its largest value (argmax over axis 1).
        labels: array of gold labels; it is flattened before scoring.

    Returns:
        dict with the keys 'precision_score', 'recall_score' and 'f1_score',
        each computed with average='binary'.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    gold_classes = labels.flatten()

    scorers = (
        ('precision_score', precision_score),
        ('recall_score', recall_score),
        ('f1_score', f1_score),
    )
    return {
        metric_name: scorer(gold_classes, predicted_classes, average='binary')
        for metric_name, scorer in scorers
    }

## Loading Tokenizer and encoding data

In [9]:
def encode_data(df, tokenizer, max_length=128):
    """Tokenize the `text` column of a DataFrame into fixed-length tensors.

    Args:
        df: DataFrame with a `text` column holding one tweet per row.
        tokenizer: object exposing `encode_plus(text, **kwargs)` that returns a
            mapping with 'input_ids' and 'attention_mask' tensors of shape
            (1, max_length) when called with return_tensors='pt'.
        max_length: length every sequence is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        dict with two tensors of shape (len(df), max_length):
        'input_ids' and 'input_mask' (the attention mask).
    """
    input_ids = []
    attention_masks = []
    for tweet in df["text"].tolist():
        encoded_data = tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_data['input_ids'])
        attention_masks.append(encoded_data['attention_mask'])

    if input_ids:
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
    else:
        # torch.cat raises on an empty list; return correctly shaped empty
        # tensors so an empty split does not crash the pipeline.
        input_ids = torch.empty((0, max_length), dtype=torch.long)
        attention_masks = torch.empty((0, max_length), dtype=torch.long)

    # NOTE: the mask is exposed under 'input_mask' (not 'attention_mask');
    # callers unpack this dict positionally, so key order matters.
    return {
        'input_ids': input_ids,
        'input_mask': attention_masks,
    }

## Prepare DataLoaders

In [10]:
# Checkpoint name handed to both the tokenizer and the classifier loaders.
MODEL = "vinai/bertweet-large"
# Mini-batch size used for every DataLoader built below.
batch_size = 16

def prepare_dataloaders(train_df, dev_df, test_df, model_name, batch_size):
    """Tokenize the three splits and wrap each of them in a DataLoader.

    Args:
        train_df, dev_df, test_df: DataFrames with a `text` column; the train
            and dev frames must also carry an integer-castable `hatespeech`
            column.
        model_name: checkpoint name used to load the (slow, normalizing)
            tokenizer.
        batch_size: batch size shared by all three loaders.

    Returns:
        (train_dataloader, validation_dataloader, test_dataloader). Train
        batches are drawn in random order, dev and test batches sequentially.
        Train/dev batches are (input_ids, attention_mask, labels); test
        batches carry no labels.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, normalization=True)

    def _to_dataset(frame, with_labels):
        # encode_data returns {'input_ids': ..., 'input_mask': ...} in that order.
        token_ids, masks = encode_data(frame, tokenizer).values()
        if not with_labels:
            return TensorDataset(token_ids, masks)
        targets = torch.tensor(frame.hatespeech.astype(int).values)
        return TensorDataset(token_ids, masks, targets)

    train_dataset = _to_dataset(train_df, with_labels=True)
    val_dataset = _to_dataset(dev_df, with_labels=True)
    test_dataset = _to_dataset(test_df, with_labels=False)

    train_dataloader = DataLoader(
        train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size
    )
    validation_dataloader = DataLoader(
        val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size
    )
    test_dataloader = DataLoader(
        test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size
    )
    return train_dataloader, validation_dataloader, test_dataloader

In [11]:
# Tokenize every split with the MODEL tokenizer and build the three loaders.
train_dataloader, validation_dataloader, test_dataloader = prepare_dataloaders(train_data, dev_data, test_data, model_name=MODEL, batch_size = batch_size)

## Setting up Pretrained BERTweet Model

In [12]:
def prepare_model(total_labels, model_name, model_to_load=None):
    """Build a sequence-classification model, optionally restoring saved weights.

    Args:
        total_labels: number of output classes of the classification head.
        model_name: checkpoint name passed to
            AutoModelForSequenceClassification.from_pretrained.
        model_to_load: optional path to a state dict written with torch.save.
            Either a full-model state dict (as saved by this notebook's
            training loop) or an encoder-only state dict for `model.roberta`
            is accepted.

    Returns:
        The model. Loading `model_to_load` is best-effort: on failure a
        warning is emitted and the model is returned with whatever weights it
        holds at that point (the `model_name` checkpoint weights unless a
        partially matching state dict was applied).
    """
    import warnings  # local import so the block does not rely on a new file-level import

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=total_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    if model_to_load is None:
        return model

    try:
        # map_location="cpu" lets a checkpoint written on a GPU be restored on
        # a CPU-only machine; load_state_dict copies into the model's tensors.
        state_dict = torch.load(model_to_load, map_location="cpu")
        try:
            # Full-model checkpoint (keys prefixed with "roberta." / "classifier.").
            model.load_state_dict(state_dict)
        except RuntimeError:
            # Encoder-only checkpoint: keys match the `roberta` sub-module.
            model.roberta.load_state_dict(state_dict)
        print("Loaded pre-trained model")
    except Exception as err:
        # Previously a bare `except: pass` hid every failure, so a bad path or
        # mismatched checkpoint silently produced a model without the
        # requested weights. Stay best-effort, but say so loudly.
        warnings.warn(
            f"Could not load weights from {model_to_load!r}: {err}. "
            f"Continuing without them."
        )
    return model

## Setting up Optimizer and Scheduler

In [13]:
def prepare_optimizer_scheduler(total_steps, learning_rate=1e-5, model=None):
    """Create the AdamW optimizer and a linear-decay LR scheduler.

    Args:
        total_steps: total number of optimizer steps the scheduler decays over.
        learning_rate: initial learning rate.
        model: module whose parameters are optimized. When omitted, the
            module-level `model` is used, which is what this function always
            did before the parameter existed.

    Returns:
        (optimizer, scheduler)

    Raises:
        NameError: if `model` is not passed and no module-level `model` exists.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-global model.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError(
                "name 'model' is not defined; pass model=... or define a module-level `model` first"
            ) from None

    # Use torch's AdamW explicitly: the bare name `AdamW` resolves to the
    # deprecated transformers implementation because that import comes last.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        eps=1e-8,
        weight_decay=1e-2,
    )
    # No warm-up: the learning rate decays linearly from `learning_rate` to 0.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )
    return optimizer, scheduler

## Creating our Training Loop

In [14]:
# Number of passes over the training set.
EPOCHS = 1
# Number of distinct label values found in the dataset.
NUM_LABELS = len(hate_speech_ucb.hatespeech.unique())
# The scheduler is stepped once per training batch, so this is its total step count.
TOTAL_STEPS = len(train_dataloader) * EPOCHS
# model_to_load=None: start from the MODEL checkpoint without restoring saved weights.
model = prepare_model(total_labels=NUM_LABELS, model_name=MODEL, model_to_load=None)
model.to(device)
# NOTE: prepare_optimizer_scheduler picks up the module-level `model` defined above.
optimizer, scheduler = prepare_optimizer_scheduler(total_steps=TOTAL_STEPS, learning_rate=1e-5)

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 

In [15]:
def evaluate(model, validation_dataloader):
    """Run the model over a labelled dataloader without computing gradients.

    Args:
        model: classifier called as
            model(input_ids, token_type_ids=None, attention_mask=..., labels=...)
            and returning an object with `.loss` and `.logits`.
        validation_dataloader: iterable of (input_ids, attention_mask, labels)
            batches.

    Returns:
        (preds, true_labels): two lists with one NumPy array per batch, holding
        the raw logits and the gold labels respectively.

    Side effects:
        Switches the model to eval mode and leaves it there, and writes the
        average validation loss through tqdm.write ("nan" for an empty loader).
    """
    model.eval()
    preds = []
    true_labels = []
    total_eval_loss = 0

    for batch in tqdm(validation_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].type(torch.LongTensor).to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        total_eval_loss += outputs.loss.item()
        preds.append(outputs.logits.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())

    # Guard against an empty loader instead of raising ZeroDivisionError.
    num_batches = len(validation_dataloader)
    avg_val_loss = total_eval_loss / num_batches if num_batches else float("nan")
    tqdm.write(f"Avg validation loss: {avg_val_loss}")

    return preds, true_labels

In [16]:
def train(model, optimizer, scheduler, train_dataloader, validation_dataloader, epochs,
          save_path='./drive/MyDrive/585_project/bertweet_large_weights.pt'):
    """Fine-tune the model, validating and checkpointing after every epoch.

    Args:
        model: classifier called as
            model(input_ids, token_type_ids=None, attention_mask=..., labels=...)
            and returning an object with `.loss`.
        optimizer: optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        train_dataloader: iterable of (input_ids, attention_mask, labels) batches.
        validation_dataloader: labelled loader passed to evaluate().
        epochs: number of passes over `train_dataloader`.
        save_path: file the model state dict is written to after each epoch
            (each epoch overwrites the previous file). Pass None to skip
            saving. Defaults to the path that used to be hard-coded here.

    Returns:
        None. Progress, the per-epoch average training loss and a
        classification report on the validation set are printed.
    """
    for epoch in tqdm(range(1, epochs+1)):
        # evaluate() leaves the model in eval mode, so training mode has to be
        # re-enabled at the start of every epoch; otherwise every epoch after
        # the first would train with dropout disabled.
        model.train()
        # Reset per epoch so the reported average covers this epoch only.
        total_train_loss = 0

        progress_bar = tqdm(train_dataloader,
                            desc=" Epoch {:1d}".format(epoch),
                            leave=False,  # to overwrite each epoch
                            disable=False)

        for batch in progress_bar:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].type(torch.LongTensor).to(device)

            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss

            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # max(..., 1) avoids a ZeroDivisionError on an empty training loader.
        avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
        tqdm.write(f"\nEpoch: {epoch}")
        tqdm.write(f"Training loss: {avg_train_loss}")

        preds, val_labels = evaluate(model, validation_dataloader)
        predictions = np.argmax(np.concatenate(preds, axis=0), axis=1).flatten()
        labels = (np.concatenate(val_labels, axis=0)).flatten()

        print(classification_report(labels, predictions))
        if save_path is not None:
            torch.save(model.state_dict(), save_path)

    print("Training complete!")

In [17]:
# Fine-tune for EPOCHS epoch(s); validation metrics are printed and the weights saved after each epoch.
train(model, optimizer, scheduler, train_dataloader, validation_dataloader, EPOCHS)

  0%|          | 0/1 [00:00<?, ?it/s]
 Epoch 1:   0%|          | 0/1979 [00:00<?, ?it/s][A
 Epoch 1:   0%|          | 1/1979 [00:01<33:20,  1.01s/it][A
 Epoch 1:   0%|          | 2/1979 [00:01<28:00,  1.18it/s][A
 Epoch 1:   0%|          | 3/1979 [00:02<26:16,  1.25it/s][A
 Epoch 1:   0%|          | 4/1979 [00:03<25:26,  1.29it/s][A
 Epoch 1:   0%|          | 5/1979 [00:03<24:59,  1.32it/s][A
 Epoch 1:   0%|          | 6/1979 [00:04<24:42,  1.33it/s][A
 Epoch 1:   0%|          | 7/1979 [00:05<24:31,  1.34it/s][A
 Epoch 1:   0%|          | 8/1979 [00:06<24:23,  1.35it/s][A
 Epoch 1:   0%|          | 9/1979 [00:06<24:19,  1.35it/s][A
 Epoch 1:   1%|          | 10/1979 [00:07<24:15,  1.35it/s][A
 Epoch 1:   1%|          | 11/1979 [00:08<24:12,  1.35it/s][A
 Epoch 1:   1%|          | 12/1979 [00:09<24:10,  1.36it/s][A
 Epoch 1:   1%|          | 13/1979 [00:09<24:09,  1.36it/s][A
 Epoch 1:   1%|          | 14/1979 [00:10<24:07,  1.36it/s][A
 Epoch 1:   1%|          | 15/1979 


Epoch: 1
Training loss: 0.4706744430506693



  0%|          | 0/248 [00:00<?, ?it/s][A
  0%|          | 1/248 [00:00<00:58,  4.25it/s][A
  1%|          | 2/248 [00:00<00:59,  4.16it/s][A
  1%|          | 3/248 [00:00<00:59,  4.14it/s][A
  2%|▏         | 4/248 [00:00<01:00,  4.06it/s][A
  2%|▏         | 5/248 [00:01<00:59,  4.08it/s][A
  2%|▏         | 6/248 [00:01<00:59,  4.07it/s][A
  3%|▎         | 7/248 [00:01<00:59,  4.07it/s][A
  3%|▎         | 8/248 [00:01<00:59,  4.07it/s][A
  4%|▎         | 9/248 [00:02<00:59,  4.04it/s][A
  4%|▍         | 10/248 [00:02<00:58,  4.05it/s][A
  4%|▍         | 11/248 [00:02<00:58,  4.05it/s][A
  5%|▍         | 12/248 [00:02<00:58,  4.06it/s][A
  5%|▌         | 13/248 [00:03<00:58,  4.03it/s][A
  6%|▌         | 14/248 [00:03<00:57,  4.05it/s][A
  6%|▌         | 15/248 [00:03<00:57,  4.04it/s][A
  6%|▋         | 16/248 [00:03<00:57,  4.05it/s][A
  7%|▋         | 17/248 [00:04<00:56,  4.06it/s][A
  7%|▋         | 18/248 [00:04<00:56,  4.07it/s][A
  8%|▊         | 19/248 [00:0

Avg validation loss: 0.42255010182458547
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      2621
           1       0.77      0.62      0.69      1335

    accuracy                           0.81      3956
   macro avg       0.80      0.76      0.77      3956
weighted avg       0.81      0.81      0.80      3956

Training complete!





In [None]:
# Restore the weights written by train(). The file is overwritten after every
# epoch, so it holds the most recent checkpoint, not necessarily the best one.
path = './drive/MyDrive/585_project/bertweet_large_weights.pt'
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(path, map_location=device))