In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    set_seed,
)

from scipy.stats import pearsonr, spearmanr

In [3]:
from datasets import load_dataset

# Download (or reuse the local cache of) the dataset from the Hugging Face Hub.
DATASET_ID = "ucberkeley-dlab/measuring-hate-speech"
ds = load_dataset(DATASET_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/135556 [00:00<?, ? examples/s]

In [4]:
# Materialise the train split as a pandas DataFrame.
# `to_pandas()` is used instead of `ds.set_format(type='pandas')` so that the
# output format of `ds` itself is left untouched (set_format mutates the
# DatasetDict in place, which is hidden state for every later cell).
df = ds["train"].to_pandas()

In [5]:
# Summary statistics of the continuous `hate_speech_score` column, which a
# later cell renames to "labels" and uses as the regression target.
df['hate_speech_score'].describe()

Unnamed: 0,hate_speech_score
count,135556.0
mean,-0.567428
std,2.380003
min,-8.34
25%,-2.33
50%,-0.34
75%,1.41
max,6.3


In [6]:
# Create the training and validation data.
from sklearn.model_selection import train_test_split

# Keep only the text and the regression target.
# The rename happens *before* the column selection so the cell can be
# re-executed: on a second run `df` already has a "labels" column and
# rename() ignores the (now missing) old name instead of raising KeyError.
df = (
    df.rename(columns={"hate_speech_score": "labels"})[["text", "labels"]]
    .dropna()
    .reset_index(drop=True)
)

# Quantile bins of the continuous target; used only to stratify the split so
# both partitions cover the full score range.
df["quantile"] = pd.qcut(
    df["labels"],
    q=10,
    labels=False,
    duplicates="drop",
)

train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["quantile"],
)

# The helper column is not a model input.
train_df = train_df.drop(columns=["quantile"])
val_df = val_df.drop(columns=["quantile"])

# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in the datasets.
train_ds = Dataset.from_pandas(train_df[["text", "labels"]], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[["text", "labels"]], preserve_index=False)

In [7]:
# Tokeniser configuration
MODEL_NAME = "microsoft/deberta-v3-small"  # checkpoint id passed to from_pretrained()
MAX_LEN = 256  # max_length used for truncation/padding when tokenising

# Tokeniser matching the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenise the ``"text"`` entries of ``batch`` to fixed-length sequences.

    Every sequence is truncated and padded to exactly ``MAX_LEN`` tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    encode_options = dict(
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    return tokenizer(batch["text"], **encode_options)

# Tokenise both splits in batches and drop the raw "text" column in the same pass.
train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])
val_ds = val_ds.map(tokenize, batched=True, remove_columns=["text"])

# Have both datasets return torch tensors when indexed.
for split in (train_ds, val_ds):
    split.set_format("torch")


config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/122000 [00:00<?, ? examples/s]

Map:   0%|          | 0/13556 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, set_seed
from peft import LoraConfig, get_peft_model, TaskType

# Seed the Python / NumPy / PyTorch RNGs before any randomly initialised
# weights (LoRA matrices, regression head) are created, so runs are repeatable.
set_seed(42)

# Bare encoder: no task head and no `labels` argument in forward().
encoder = AutoModel.from_pretrained(MODEL_NAME)

lora_config = LoraConfig(
    # FEATURE_EXTRACTION is the PEFT task type for a headless encoder whose
    # hidden states are consumed by a custom head. SEQ_CLS would wrap the model
    # in the sequence-classification adapter, which forwards `labels=` to the
    # base model -- an argument AutoModel's forward() does not take.
    task_type=TaskType.FEATURE_EXTRACTION,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "query_proj",
        "value_proj",
    ],  # DeBERTa-v3-small attention names
)

# Inject the adapters; the result replaces `encoder` for all later cells.
encoder = get_peft_model(encoder, lora_config)

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/102 [00:00<?, ?it/s]

DebertaV2Model LOAD REPORT from: microsoft/deberta-v3-small
Key                                     | Status     |  | 
----------------------------------------+------------+--+-
lm_predictions.lm_head.dense.bias       | UNEXPECTED |  | 
mask_predictions.classifier.bias        | UNEXPECTED |  | 
mask_predictions.dense.bias             | UNEXPECTED |  | 
mask_predictions.classifier.weight      | UNEXPECTED |  | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED |  | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED |  | 
lm_predictions.lm_head.bias             | UNEXPECTED |  | 
mask_predictions.LayerNorm.weight       | UNEXPECTED |  | 
mask_predictions.LayerNorm.bias         | UNEXPECTED |  | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED |  | 
mask_predictions.dense.weight           | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

In [9]:
class LoRARegressor(nn.Module):
    """Scalar regression head on top of a transformer encoder.

    The hidden state of the first token of ``last_hidden_state`` is passed
    through one linear layer to produce a single score per sequence.

    Args:
        encoder: Module exposing ``config.hidden_size`` and returning an object
            with a ``last_hidden_state`` attribute when called with
            ``input_ids`` / ``attention_mask``.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        hidden = self.encoder.config.hidden_size
        self.regressor = nn.Linear(hidden, 1)
        # Parameter-free, so it is built once instead of on every forward pass.
        self.loss_fn = nn.HuberLoss(delta=1.0)

    def forward(self, input_ids, attention_mask=None, labels=None, token_type_ids=None):
        """Run the encoder and the regression head.

        Args:
            input_ids: Token ids for a batch of sequences.
            attention_mask: Optional mask forwarded to the encoder.
            labels: Optional targets; when given, a Huber loss is returned.
            token_type_ids: Optional segment ids. Accepted so a tokenizer's
                full output can be passed with ``**inputs``; forwarded to the
                encoder only when provided.

        Returns:
            dict with ``"loss"`` (``None`` without labels) and ``"logits"``
            (one prediction per sequence, last dimension squeezed).
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids

        outputs = self.encoder(**encoder_inputs)
        cls = outputs.last_hidden_state[:, 0, :]
        preds = self.regressor(cls).squeeze(-1)

        loss = None
        if labels is not None:
            # Always compute the loss in float32: predictions may be half
            # precision and labels may arrive as float16/float64.
            loss = self.loss_fn(preds.float(), labels.float())

        return {"loss": loss, "logits": preds}

# Attach the regression head to the PEFT-wrapped encoder built in the previous cell.
model = LoRARegressor(encoder)

In [10]:
#config
def compute_metrics(eval_pred):
    """Compute regression metrics for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. Predictions are
    squeezed before comparison. Returns a dict with the mean absolute error,
    root mean squared error, Pearson correlation and Spearman correlation.
    """
    predictions, targets = eval_pred
    predictions = predictions.squeeze()
    residuals = predictions - targets

    pearson_r = pearsonr(predictions, targets)[0]
    spearman_rho = spearmanr(predictions, targets)[0]

    metrics = {}
    metrics["mae"] = np.mean(np.abs(residuals))
    metrics["rmse"] = np.sqrt(np.mean(residuals ** 2))
    metrics["pearson"] = pearson_r
    metrics["spearman"] = spearman_rho
    return metrics

# Hyper-parameters for a Hugging Face Trainer run.
training_args = TrainingArguments(
    output_dir="./mhs_lora_deberta",
    learning_rate=2e-4,         # higher learning rate works with LoRA
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    logging_steps=100,
    save_strategy="epoch",
    eval_strategy="epoch",
    # fp16 mixed precision is a GPU feature; enable it only when a CUDA device
    # is actually present so the cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none",
)



In [11]:
def predict(texts):
    """Predict scores for ``texts`` with the module-level ``model``.

    Args:
        texts: Passed unchanged to the module-level ``tokenizer``.

    Returns:
        float32 NumPy array holding the model's ``"logits"`` output.
    """
    model.eval()
    # Inputs must live on the same device as the model's weights.
    device = next(model.parameters()).device
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LEN)
    with torch.no_grad():
        # Pass only the two tensors the model needs: the tokenizer may return
        # extra keys (e.g. token_type_ids) that forward() might not accept.
        preds = model(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
        )["logits"]
    # Upcast so the result is float32 even if the model ran in half precision.
    return preds.float().cpu().numpy()

In [None]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from torch.utils.data import DataLoader
from mapie.regression import SplitConformalRegressor

# =========================
# Sklearn-style LoRA wrapper
# =========================
class SklearnLoRARegressor(BaseEstimator, RegressorMixin):
    """scikit-learn style ``fit`` / ``predict`` wrapper around a torch regressor.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids=..., attention_mask=..., labels=...)`` and
        expected to return a dict with ``"loss"`` and ``"logits"`` entries.
    device : str
        Device the model and every batch are moved to.
    batch_size : int
        Batch size of the internal ``DataLoader``.
    lr : float
        AdamW learning rate.
    epochs : int
        Number of full passes over the training dataset in ``fit``.

    Notes
    -----
    Weights are kept in float32. On CUDA the forward pass runs under float16
    autocast and gradients go through a ``GradScaler``; stepping AdamW directly
    on float16 parameters under/overflows easily. On other devices everything
    runs in float32.
    """

    def __init__(self, model, device="cuda", batch_size=32, lr=1e-4, epochs=3):
        self.model = model
        self.device = device
        self.batch_size = batch_size
        self.lr = lr
        self.epochs = epochs

        self.model.to(self.device)

    @staticmethod
    def _autocast(device):
        """Return a float16 autocast context on CUDA, a no-op context otherwise."""
        if device.type == "cuda":
            return torch.autocast(device_type="cuda", dtype=torch.float16)
        from contextlib import nullcontext
        return nullcontext()

    def fit(self, dataset, labels=None):
        """Train the trainable parameters of ``model`` for ``epochs`` passes.

        Parameters
        ----------
        dataset : datasets.Dataset
            Must provide ``input_ids``, ``attention_mask`` and ``labels`` columns.
        labels : ignored
            Present for scikit-learn ``fit(X, y)`` compatibility; targets are
            read from the dataset's ``labels`` column.

        Returns
        -------
        self
        """
        device = torch.device(self.device)
        use_amp = device.type == "cuda"

        trainable = [p for p in self.model.parameters() if p.requires_grad]
        optimizer = torch.optim.AdamW(trainable, lr=self.lr)
        # Disabled scaler is a transparent pass-through on non-CUDA devices.
        scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

        # with_format returns a new formatted dataset object; the caller's
        # dataset keeps its own format and visible columns.
        torch_ds = dataset.with_format(
            type="torch", columns=["input_ids", "attention_mask", "labels"]
        )
        loader = DataLoader(torch_ds, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for _ in range(self.epochs):
            for batch in loader:
                input_ids = batch["input_ids"].to(device).long()  # embeddings must be long
                attention_mask = batch["attention_mask"].to(device)
                targets = batch["labels"].to(device).float()

                optimizer.zero_grad()
                with self._autocast(device):
                    out = self.model(
                        input_ids=input_ids, attention_mask=attention_mask, labels=targets
                    )
                scaler.scale(out["loss"]).backward()
                scaler.step(optimizer)
                scaler.update()

        # Trailing-underscore attribute: the scikit-learn convention that
        # check_is_fitted() looks for on a trained estimator.
        self.is_fitted_ = True
        return self

    def predict(self, dataset):
        """Return one prediction per row of ``dataset`` as a float32 array.

        Parameters
        ----------
        dataset : datasets.Dataset
            Must provide ``input_ids`` and ``attention_mask`` columns.
        """
        device = torch.device(self.device)

        torch_ds = dataset.with_format(type="torch", columns=["input_ids", "attention_mask"])
        loader = DataLoader(torch_ds, batch_size=self.batch_size, shuffle=False)

        self.model.eval()
        preds = []
        with torch.no_grad():
            for batch in loader:
                input_ids = batch["input_ids"].to(device).long()
                attention_mask = batch["attention_mask"].to(device)

                with self._autocast(device):
                    out = self.model(input_ids=input_ids, attention_mask=attention_mask)
                preds.append(out["logits"].float().cpu().numpy())

        if not preds:
            # np.concatenate raises on an empty list.
            return np.empty((0,), dtype=np.float32)
        return np.concatenate(preds, axis=0)


# =========================
# Freeze the pretrained weights, keep the LoRA adapters trainable
# =========================
# Only the injected low-rank matrices (parameter names containing "lora_")
# should receive gradients. Setting requires_grad=False on *every* encoder
# parameter would freeze the adapters as well and leave only the linear head
# to train.
for name, p in model.encoder.named_parameters():
    p.requires_grad = "lora_" in name

# =========================
# Train LoRA on train_ds
# =========================
sk_model = SklearnLoRARegressor(model=model, epochs=3)
sk_model.fit(train_ds)

# =========================
# SplitConformalRegressor (MAPIE >= 1.0 API)
# =========================
# The estimator is already trained, so it is handed over with prefit=True and
# only conformalized on held-out data. The confidence level is fixed at
# construction time (0.9 -> 90% intervals).
mapie = SplitConformalRegressor(estimator=sk_model, confidence_level=0.9, prefit=True)

# Targets are taken from the pandas frame the dataset was built from (same row
# order), so they are a plain NumPy array regardless of the dataset's format.
y_calib = val_df["labels"].to_numpy()
mapie.conformalize(val_ds, y_calib)  # val_ds used for calibration

# =========================
# Predict on val_ds with intervals
# =========================
# NOTE(review): these intervals are evaluated on the same rows used for
# calibration, so their empirical coverage is optimistic; hold out a separate
# test split before reporting coverage.
y_pred, y_pis = mapie.predict_interval(val_ds)  # 90% intervals

# y_pis has shape (n_samples, 2, n_confidence_levels): [:, 0] lower, [:, 1] upper.
lower = y_pis[:, 0, 0]
upper = y_pis[:, 1, 0]

print("Predictions shape:", y_pred.shape)
print("Intervals shape:", y_pis.shape)
