In [19]:
# !pip3 install transformers
# !pip install datasets
# !pip3 install emoji

In [20]:
import random
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import transformers
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from transformers import get_linear_schedule_with_warmup, AdamW, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
# sys.path.insert(0, '..')
# from src.data_collection import get_data
from data_collection import get_data

In [21]:
manual_seed = 2022
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
device

device(type='cuda')

## RoBERTa

### 1. RoBERTa base model

RoBERTa is a pretrained model on English language using a masked language modeling (MLM).
- Paper [here](https://arxiv.org/abs/1907.11692).

Description from [huggingface](https://huggingface.co/roberta-base):

_RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts._

_More precisely, it was pretrained with the Masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input then run the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence._

_This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences for instance, you can train a standard classifier using the features produced by the BERT model as inputs._

### 2. DistilRoBERTa base model

DistilRoBERTa is a distilled version (like DistilBERT) of RoBERTa.

DistilRoBERTa was trained on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/) and takes 4 times less training time than RoBERTa.


### 3. RoBERTa large

Large version of RoBERTa.

### 4. DistilRoBERTa finetuned on hate speech tweets 

[Link on Hugging Face](https://huggingface.co/mrm8488/distilroberta-finetuned-tweets-hate-speech)


## Data Preprocessing

We are using [ucberkeley-dlab_measuring-hate-speech](https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech) as our dataset. To learn more about it, go to notebooks/data_description.ipynb


_In our dataset, the labels are organized like this:_
- 0 : no
- 1 : yes

In [23]:
hate_speech_ucb = get_data()
hate_speech_ucb.head()

Fetching data...


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Processing...
Done!


Unnamed: 0,text,hatespeech
0,! thank u! im transmasc and generally present ...,0
1,!Go fuck yourself faggot!,1
2,!flair [I love women and minorities],0
3,!flair [death to all niggers and gays],1
4,""" 'convoluted' genealogy of Jesus""; was that c...",0


1. Delete label - 1  (unclear) and keep only:
0 - no,
2 - yes
2. Change label 2 to 1, so to have two labels: 
0 - no,
1 - yes

In [24]:
hate_speech_ucb["hatespeech"].value_counts()

0    26608
1    12957
Name: hatespeech, dtype: int64

In [25]:
train_data, temp_data = train_test_split(hate_speech_ucb, test_size=0.2)
dev_data, test_data = train_test_split(temp_data, test_size=0.5)

In [26]:
print(f"Train data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")
print(f"Dev data size: {len(dev_data)}")

Train data size: 31652
Test data size: 3957
Dev data size: 3956


## Performance Metrics

We are using precision, recall, and F1-score to measure the performance of the model.

In [27]:
def calculate_metrics(preds, labels):
    results = dict()
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    results['precision_score'] = precision_score(labels_flat, preds_flat, average='binary')
    results['recall_score'] = recall_score(labels_flat, preds_flat, average='binary')
    results['f1_score'] = f1_score(labels_flat, preds_flat, average='binary')
    return results

## Loading Tokenizer and encoding data

In [28]:
def encode_data(df, tokenizer):
    input_ids = []
    attention_masks = []
    for tweet in df[["text"]].values:
        tweet = tweet.item()
        encoded_data = tokenizer.encode_plus(
                            tweet,                      
                            add_special_tokens = True,  
                            max_length = 128,
                            padding = 'max_length',
                            truncation = True,
                            return_attention_mask = True,   
                            return_tensors = 'pt',    
                    )
        input_ids.append(encoded_data['input_ids'])
        attention_masks.append(encoded_data['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    inputs = {
    'input_ids': input_ids,
    'input_mask': attention_masks}
    return inputs

## Prepare DataLoaders

In [29]:
MODEL = "distilroberta-base"
batch_size = 8

def prepare_dataloaders(train_df, dev_df, test_df, model_name, batch_size):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, normalization=True)

    data_train = encode_data(train_df, tokenizer)
    labels_train = train_df.hatespeech.astype(int)

    data_valid = encode_data(dev_df, tokenizer)
    labels_valid = dev_df.hatespeech.astype(int)

    data_test = encode_data(test_df, tokenizer)

    input_ids, attention_masks = data_train.values()
    train_labels = torch.tensor(labels_train.values)
    train_dataset = TensorDataset(input_ids, attention_masks, train_labels)

    input_ids, attention_masks = data_valid.values()
    valid_labels = torch.tensor(labels_valid.values)
    val_dataset = TensorDataset(input_ids, attention_masks, valid_labels)

    input_ids, attention_masks = data_test.values()
    test_dataset = TensorDataset(input_ids, attention_masks)

    train_dataloader = DataLoader(
                train_dataset,
                sampler = RandomSampler(train_dataset), 
                batch_size = batch_size 
            )

    validation_dataloader = DataLoader(
                val_dataset, 
                sampler = SequentialSampler(val_dataset),
                batch_size = batch_size 
            )

    test_dataloader = DataLoader(
                test_dataset, 
                sampler = SequentialSampler(test_dataset), 
                batch_size = batch_size
            )

    return train_dataloader, validation_dataloader, test_dataloader

## Setting up Pretrained BERTweet Model

In [31]:
def prepare_model(total_labels, model_name, model_to_load=None):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels = total_labels,  
        output_attentions = False, 
        output_hidden_states = False,
    )
    if model_to_load is not None:
        try:
            model.roberta.load_state_dict(torch.load(model_to_load))
            print("Loaded pre-trained model")
        except:
            pass
    return model

## Setting up Optimizer and Scheduler

In [32]:
def prepare_optimizer_scheduler(total_steps, learning_rate=5e-5):
    optimizer = AdamW(model.parameters(),
                    lr = learning_rate,
                    eps = 1e-8,
                    weight_decay = 1e-2
                    )
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, 
                                                num_training_steps = total_steps
                                                )
    return optimizer, scheduler

## Creating our Training Loop

In [34]:
def evaluate(model, validation_dataloader):
    
    model.eval()
    preds = []
    true_labels = []
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in tqdm(validation_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].type(torch.LongTensor).to(device)
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
            
        preds.append(logits)
        true_labels.append(label_ids)
        
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    tqdm.write(f"Avg validation loss: {avg_val_loss}")

    return preds, true_labels, avg_val_loss

In [35]:
def train(model, optimizer, scheduler, train_dataloader, validation_dataloader, epochs):
    training_stats = []
    model.train()
    total_train_loss = 0

    for epoch in tqdm(range(1, epochs+1)):
        progress_bar = tqdm(train_dataloader, 
                        desc=" Epoch {:1d}".format(epoch),
                        leave=False, # to overwrite each epoch
                        disable=False)

        for batch in progress_bar:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].type(torch.LongTensor).to(device)

            model.zero_grad()
            outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask, 
                                labels=b_labels)

            loss = outputs.loss
            logits = outputs.logits

            total_train_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        tqdm.write(f"\nEpoch: {epoch}")
        tqdm.write(f"Training loss: {avg_train_loss}")

        preds, val_labels, avg_val_loss = evaluate(model, validation_dataloader)
        predictions = np.concatenate(preds, axis=0)
        labels = np.concatenate(val_labels, axis=0)

        scores = calculate_metrics(predictions, labels)
        precision = scores['precision_score']*100
        recall = scores['recall_score']*100
        f1 = scores['f1_score']*100
        tqdm.write(f"Precision Score: {precision}")
        tqdm.write(f"Recall Score: {recall}")
        tqdm.write(f"F1 Score: {f1}")
    print("Training complete!")

### 1. RoBERTa

In [37]:
MODEL = "roberta-base"
BATCH_SIZE = 8

train_dataloader, validation_dataloader, test_dataloader = prepare_dataloaders(
    train_data, dev_data, test_data, model_name=MODEL, batch_size=BATCH_SIZE
)

EPOCHS = 1
NUM_LABELS = len(hate_speech_ucb.hatespeech.unique())
TOTAL_STEPS = len(train_dataloader) * EPOCHS
model = prepare_model(total_labels=NUM_LABELS, model_name=MODEL, model_to_load=None)
model.to(device)
optimizer, scheduler = prepare_optimizer_scheduler(
    total_steps=TOTAL_STEPS, learning_rate=5e-5
)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [38]:
train(model, optimizer, scheduler, train_dataloader, validation_dataloader, EPOCHS)

  0%|          | 0/1 [12:08<?, ?it/s]


Epoch: 1
Training loss: 0.508427280959496


100%|██████████| 495/495 [00:24<00:00, 20.46it/s]
100%|██████████| 1/1 [12:32<00:00, 752.60s/it]

Avg validation loss: 0.45553952923627816
Precision Score: 71.80616740088107
Recall Score: 63.57254290171607
F1 Score: 67.43897393462971
Training complete!





### 2. DistilRoBERTa

In [69]:
MODEL = "distilroberta-base"
BATCH_SIZE = 64

train_dataloader, validation_dataloader, test_dataloader = prepare_dataloaders(
    train_data, dev_data, test_data, model_name=MODEL, batch_size=BATCH_SIZE
)

In [70]:
EPOCHS = 1
LEARNING_RATE = 5e-5
NUM_LABELS = len(hate_speech_ucb.hatespeech.unique())
TOTAL_STEPS = len(train_dataloader) * EPOCHS
model = prepare_model(total_labels=NUM_LABELS, model_name=MODEL, model_to_load=None)
model.to(device)
optimizer, scheduler = prepare_optimizer_scheduler(
    total_steps=TOTAL_STEPS, learning_rate=LEARNING_RATE
)


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

In [71]:
train(model, optimizer, scheduler, train_dataloader, validation_dataloader, EPOCHS)

  0%|          | 0/1 [03:49<?, ?it/s]


Epoch: 1
Training loss: 0.4642020076212257


100%|██████████| 62/62 [00:09<00:00,  6.27it/s]
100%|██████████| 1/1 [03:59<00:00, 239.34s/it]

Avg validation loss: 0.42029443046739023
Precision Score: 74.81060606060606
Recall Score: 61.62246489859594
F1 Score: 67.57912745936697
Training complete!





In [72]:
from sklearn.metrics import classification_report

preds, val_labels, avg_val_loss = evaluate(model, validation_dataloader)
predictions = np.concatenate(preds, axis=0)
labels = np.concatenate(val_labels, axis=0)

print(classification_report(labels, predictions.argmax(axis=-1)))

100%|██████████| 62/62 [00:09<00:00,  6.31it/s]

Avg validation loss: 0.42029443046739023
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      2674
           1       0.75      0.62      0.68      1282

    accuracy                           0.81      3956
   macro avg       0.79      0.76      0.77      3956
weighted avg       0.80      0.81      0.80      3956






### 3. RoBERTa large

In [39]:
MODEL = "roberta-large"
BATCH_SIZE = 8

train_dataloader, validation_dataloader, test_dataloader = prepare_dataloaders(
    train_data, dev_data, test_data, model_name=MODEL, batch_size=BATCH_SIZE
)

EPOCHS = 1
NUM_LABELS = len(hate_speech_ucb.hatespeech.unique())
TOTAL_STEPS = len(train_dataloader) * EPOCHS
model = prepare_model(total_labels=NUM_LABELS, model_name=MODEL, model_to_load=None)
model.to(device)
optimizer, scheduler = prepare_optimizer_scheduler(
    total_steps=TOTAL_STEPS, learning_rate=5e-5
)


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

In [40]:
train(model, optimizer, scheduler, train_dataloader, validation_dataloader, EPOCHS)

  0%|          | 0/1 [35:17<?, ?it/s]


Epoch: 1
Training loss: 0.6447691438827425


100%|██████████| 495/495 [01:06<00:00,  7.41it/s]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 1/1 [36:23<00:00, 2183.94s/it]

Avg validation loss: 0.641249378763064
Precision Score: 0.0
Recall Score: 0.0
F1 Score: 0.0
Training complete!





#### 4 - DistilRoBERTa finetuned on hate speech tweets

In [45]:
MODEL = "mrm8488/distilroberta-finetuned-tweets-hate-speech"
BATCH_SIZE = 64

train_dataloader, validation_dataloader, test_dataloader = prepare_dataloaders(
    train_data, dev_data, test_data, model_name=MODEL, batch_size=BATCH_SIZE
)

In [46]:
EPOCHS = 1
NUM_LABELS = len(hate_speech_ucb.hatespeech.unique())
TOTAL_STEPS = len(train_dataloader) * EPOCHS
model = prepare_model(total_labels=NUM_LABELS, model_name=MODEL, model_to_load=None)
model.to(device)
optimizer, scheduler = prepare_optimizer_scheduler(
    total_steps=TOTAL_STEPS, learning_rate=5e-5
)

train(model, optimizer, scheduler, train_dataloader, validation_dataloader, EPOCHS)

  0%|          | 0/1 [03:49<?, ?it/s]


Epoch: 1
Training loss: 0.46911412477493286


100%|██████████| 62/62 [00:09<00:00,  6.29it/s]
100%|██████████| 1/1 [03:59<00:00, 239.64s/it]

Avg validation loss: 0.42139427556145576
Precision Score: 72.97297297297297
Recall Score: 63.18252730109204
F1 Score: 67.7257525083612
Training complete!



