In [8]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    set_seed,
)

from scipy.stats import pearsonr, spearmanr

In [32]:
from datasets import load_dataset

# load data
ds = load_dataset("ucberkeley-dlab/measuring-hate-speech")

In [33]:
# get as pandas dataframe
ds.set_format(type='pandas')

df = ds['train'][:]

In [34]:
df['hate_speech_score'].describe()

Unnamed: 0,hate_speech_score
count,135556.0
mean,-0.567428
std,2.380003
min,-8.34
25%,-2.33
50%,-0.34
75%,1.41
max,6.3


In [35]:
# create test and training data
df = df[["text", "hate_speech_score"]].dropna().reset_index(drop=True)
df.rename(columns={"hate_speech_score":"labels"}, inplace=True)

df["quantile"] = pd.qcut(
    df["labels"],
    q=10,
    labels=False,
    duplicates="drop"
)

from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["quantile"]
)


train_df = train_df.drop(columns=["quantile"])
val_df = val_df.drop(columns=["quantile"])

train_ds = Dataset.from_pandas(train_df[["text", "labels"]])
val_ds = Dataset.from_pandas(val_df[["text", "labels"]])

In [36]:
# tokenise
MODEL_NAME = "microsoft/deberta-v3-small"
MAX_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(["text"])
val_ds = val_ds.remove_columns(["text"])

train_ds.set_format("torch")
val_ds.set_format("torch")


config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Map:   0%|          | 0/122000 [00:00<?, ? examples/s]

Map:   0%|          | 0/13556 [00:00<?, ? examples/s]

In [42]:
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, set_seed
from peft import LoraConfig, get_peft_model, TaskType

encoder = AutoModel.from_pretrained(MODEL_NAME)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules = [
    "query_proj",
    "value_proj"
  ] # DeBERTa-v3-small attention names
)

encoder = get_peft_model(encoder, lora_config)

Loading weights:   0%|          | 0/102 [00:00<?, ?it/s]

DebertaV2Model LOAD REPORT from: microsoft/deberta-v3-small
Key                                     | Status     |  | 
----------------------------------------+------------+--+-
mask_predictions.classifier.weight      | UNEXPECTED |  | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED |  | 
mask_predictions.LayerNorm.bias         | UNEXPECTED |  | 
mask_predictions.dense.weight           | UNEXPECTED |  | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED |  | 
mask_predictions.dense.bias             | UNEXPECTED |  | 
mask_predictions.classifier.bias        | UNEXPECTED |  | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED |  | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED |  | 
lm_predictions.lm_head.bias             | UNEXPECTED |  | 
mask_predictions.LayerNorm.weight       | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [43]:
class LoRARegressor(nn.Module):
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        hidden = self.encoder.config.hidden_size
        self.regressor = nn.Linear(hidden, 1)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls = outputs.last_hidden_state[:, 0, :]
        preds = self.regressor(cls).squeeze(-1)

        loss = None
        if labels is not None:
            loss_fn = nn.HuberLoss(delta=1.0)
            loss = loss_fn(preds, labels)

        return {"loss": loss, "logits": preds}

model = LoRARegressor(encoder)

In [48]:
#config
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = preds.squeeze()
    return {
        "mae": np.mean(np.abs(preds - labels)),
        "rmse": np.sqrt(np.mean((preds - labels) ** 2)),
        "pearson": pearsonr(preds, labels)[0],
        "spearman": spearmanr(preds, labels)[0],
    }

training_args = TrainingArguments(
  output_dir="./mhs_lora_deberta",
  learning_rate=2e-4,         # higher learning rate works with LoRA
  per_device_train_batch_size=32,
  per_device_eval_batch_size=32,
  num_train_epochs=4,
  logging_steps=100,
  save_strategy="epoch",
  eval_strategy="epoch",
  fp16=True,                   # use GPU mixed precision
  report_to="none",
)



In [None]:
# training

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Mae,Rmse,Pearson,Spearman
1,0.392113,0.419478,0.782634,1.056151,0.911649,0.907459


In [None]:
trainer.save_model("./mhs_lora_deberta/model")
tokenizer.save_pretrained("./mhs_lora_deberta/tokenizer")

In [None]:
def predict(texts):
    model.eval()
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LEN)
    with torch.no_grad():
        preds = model(**inputs)["logits"]
    return preds.cpu().numpy()