Imports

In [46]:
import random

import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [47]:
# Choose the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"using device {device}")

using device cpu


In [59]:
# Load only the first 5000 rows of the civil_comments train split
dataset = load_dataset(
    "civil_comments",
    split="train[:5000]",
)

In [60]:
# Examine the dataset: report its shape and columns, and draw a few random rows
# A dedicated, seeded generator picks the same rows on every run, so the preview
# is reproducible under Restart & Run All and the global `random` state is untouched.
preview_rng = random.Random(32)
# Cap the sample size so a dataset with fewer than 5 rows does not raise ValueError.
random_idxs = preview_rng.sample(range(len(dataset)), min(5, len(dataset)))
print(f"dataset shape: {dataset.shape}")
print(f"column names: {dataset.column_names}")
random_samples = dataset.select(random_idxs)
# To inspect the sampled rows, iterate over `random_samples` and print each
# row's 'text' and 'toxicity' fields.

dataset shape: (5000, 8)
column names: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']


`batched=True` makes `map` pass a whole batch of examples to the function in each call instead of one example at a time, which is usually much faster.

In [61]:
# preprocess
def preprocess(batch, threshold=0.5):
    """Attach a binary toxicity label to a batch of examples.

    Args:
        batch: Mapping from column name to a list of values. Must contain
            the "text" and "toxicity" columns.
        threshold: Lowest toxicity score that is labelled toxic (1).
            Defaults to 0.5.

    Returns:
        A dict with the "text" column passed through unchanged and a
        "label" column holding 1 where the score is at or above
        ``threshold`` and 0 otherwise.
    """
    # The comparison is inclusive: the Civil Comments / Jigsaw convention
    # counts a score of exactly 0.5 as toxic, so `>=` is used rather than `>`.
    return {
        "text": batch["text"],
        "label": [int(score >= threshold) for score in batch["toxicity"]],
    }

# Add the binary label column to every row (processed in batches).
dataset = dataset.map(
    preprocess,
    desc="Processing dataset",
    load_from_cache_file=True,
    batched=True,
)
# Drop every column other than the raw text and the derived label.
dataset = dataset.remove_columns(
    [name for name in dataset.column_names if name != "text" and name != "label"]
)
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split = dataset.train_test_split(test_size=0.2, seed=32)
train_dataset, test_dataset = split["train"], split["test"]

In [63]:
# Tokenizer and model
# Name of the pretrained checkpoint; only the tokenizer is loaded from it in this cell.
model_name = "distilbert-base-uncased"
# Load the tokenizer associated with that checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

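Padding is required when batching data: every sequence in a batch must have the same length so that the batch can be stacked into a single tensor.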

In [64]:
# Tokenize in batches with truncation and padding
def tokenize_function(batch):
    """Tokenize the "text" column of a batch with the module-level tokenizer.

    Sequences are truncated to at most 128 tokens and padded up to that
    same length.

    Args:
        batch: Mapping from column name to a list of values; only the
            "text" column is read.

    Returns:
        Whatever the tokenizer call returns for the batch of texts.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Both splits are tokenized with the same settings; only the progress-bar label differs.
shared_map_kwargs = dict(
    batched=True,
    remove_columns=["text"],  # the raw text is not needed once it is tokenized
    load_from_cache_file=True,
)
train_dataset = train_dataset.map(
    tokenize_function, desc="Tokenizing train data", **shared_map_kwargs
)
test_dataset = test_dataset.map(
    tokenize_function, desc="Tokenizing test data", **shared_map_kwargs
)

Tokenizing test data: 100%|██████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 19533.10 examples/s]


`attention_mask` tells the model, during both training and inference, which positions of the input are real tokens (1) and which are padding (0).


In [65]:
# Return the listed columns as PyTorch tensors for both splits
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (train_dataset, test_dataset):
    # set_format is called for its effect on the dataset object; nothing is reassigned
    tokenized_split.set_format(type="torch", columns=torch_columns)