In [8]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    set_seed,
)

from scipy.stats import pearsonr, spearmanr

In [1]:
from datasets import load_dataset

# load data
ds = load_dataset("ucberkeley-dlab/measuring-hate-speech")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/135556 [00:00<?, ? examples/s]

In [2]:
# get as pandas dataframe
ds.set_format(type='pandas')

df = ds['train'][:]

In [7]:
df['hate_speech_score'].describe()

Unnamed: 0,hate_speech_score
count,135556.0
mean,-0.567428
std,2.380003
min,-8.34
25%,-2.33
50%,-0.34
75%,1.41
max,6.3


In [23]:
# create test and training data
df = df[["text", "hate_speech_score"]].dropna().reset_index(drop=True)
df.rename(columns={"hate_speech_score":"labels"}, inplace=True)

df["quantile"] = pd.qcut(
    df["labels"],
    q=10,
    labels=False,
    duplicates="drop"
)

from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["quantile"]
)


train_df = train_df.drop(columns=["quantile"])
val_df = val_df.drop(columns=["quantile"])

train_ds = Dataset.from_pandas(train_df[["text", "labels"]])
val_ds = Dataset.from_pandas(val_df[["text", "labels"]])

In [24]:
# tokenise
MODEL_NAME = "roberta-base"
MAX_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(["text"])
val_ds = val_ds.remove_columns(["text"])

train_ds.set_format("torch")
val_ds.set_format("torch")


Map:   0%|          | 0/122000 [00:00<?, ? examples/s]

Map:   0%|          | 0/13556 [00:00<?, ? examples/s]

In [25]:
# model config - attempt regression on hate speech score
class TransformerRegressor(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size
        self.regressor = nn.Linear(hidden, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        cls = outputs.last_hidden_state[:, 0, :]
        preds = self.regressor(cls).squeeze(-1)

        loss = None
        if labels is not None:
            loss_fn = nn.HuberLoss(delta=1.0)
            loss = loss_fn(preds, labels)

        return {"loss": loss, "logits": preds}


def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = preds.squeeze()

    return {
        "mae": np.mean(np.abs(preds - labels)),
        "rmse": np.sqrt(np.mean((preds - labels) ** 2)),
        "pearson": pearsonr(preds, labels)[0],
        "spearman": spearmanr(preds, labels)[0],
    }


training_args = TrainingArguments(
    output_dir="./mhs_regression",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,
    logging_steps=100,
    report_to="none",
)


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


In [None]:
# train
model = TransformerRegressor(MODEL_NAME)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss


In [None]:
def predict(texts):
    model.eval()
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
    )

    with torch.no_grad():
        preds = model(**inputs)["logits"]

    # de-standardize
    preds = preds * label_std + label_mean
    return preds.cpu().numpy()