Imports

In [29]:
# import os
# os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
# os.environ["PYTORCH_MPS_FALLBACK"] = "1"

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(f"using device {device}")

using device cpu


In [3]:
# load dataset
dataset = load_dataset("civil_comments", split="train[:5000]")

In [4]:
# Examine the dataset and look at examples and columns
import random
random_idxs = random.sample(range(len(dataset)), 3)
print(f"dataset shape: {dataset.shape}")
print(f"column names: {dataset.column_names}")
random_samples = dataset.select(random_idxs)
for sample in random_samples:
    print(sample['text'], "\n", sample['toxicity'], "\n---\n")

dataset shape: (5000, 8)
column names: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']
It's almost amusing that you characterize Law Enforcement's approach to the Bundy militia's armed occupation as "confrontational."  These folks worked extremely patiently for six week to achieve a peaceful surrender.  Negotiations with armed occupiers seldom includes offering them a "King's X" for the crimes they have committed. You speak of the "hard line".  Do you propose that seeking a peaceful surrender and having the armed occupiers face the legal consequences of their actions constitutes a "hard line."? 
 0.0 
---

When the moon hits your eye like a bigga pizza pie, that's amore'. 
 0.20000000298023224 
---

Yep, Obama and Bush W. gave us Trump.  We do indeed need "toughness", NOW! 
 0.30000001192092896 
---



```batched=True``` does the mapping in batches, hence is faster

In [5]:
# preprocess
def preprocess(batch):
    return {
        "text": batch["text"],
        "label": [int(t > 0.5) for t in batch["toxicity"]]
    }

dataset = dataset.map(
    preprocess, 
    batched=True,
    load_from_cache_file=True, 
    desc="Processing dataset"
)
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["text", "label"]])
split = dataset.train_test_split(test_size=0.2, seed = 32)
train_dataset = split["train"]
test_dataset = split["test"]

In [6]:
# Tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

padding is required when batching data

In [7]:
# Tokenize in batches with truncation and padding
def tokenize_function(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    load_from_cache_file=True,
    desc="Tokenizing train data"
)
test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    load_from_cache_file=True,
    desc="Tokenizing test data"
)

attention_mask tells the model itself during training or inference which parts of the input are real tokens (1) and which are padding (0)


In [8]:
# Set format for PyTorch
torch_columns = ["input_ids", "attention_mask", "label"]
train_dataset.set_format(type="torch", columns=torch_columns)
test_dataset.set_format(type="torch", columns=torch_columns)

LoRAConfig Parameters
--task_type: Tells the LoRA setup what kind of task you're working on. TaskType.SEQ_CLS = sequence classification (Other types include CAUSAL_LM, SEQ_2_SEQ_LM, etc.)

--target_modules: Refer to common linear layers in transformer attention mechanisms where LoRA is usually applied. 

--r: Rank of the low-rank adapters.

--lora_alpha: A scaling factor applied to the adapter output, like a learning rate multiplier for the adapter layers.

--lora_dropout: Applies dropout only to the LoRA adapter layers to prevent overfitting when fine-tuning on small datasets.

--inference_mode: If True, disables adapter training (useful for inference-only scenarios). Set to False during training, so LoRA layers are trained.



In [9]:
# load base model and fine tune with LoRA
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "v_lin"],  # common in BERT-like models
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False
)
model = get_peft_model(base_model, peft_config)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

--output_dir: Where to save model checkpoints and logs.

--save_strategy="epoch": Save model checkpoint at the end of each epoch.

--logging_dir: Directory to store TensorBoard logs.

--per_device_train_batch_size=16: Batch size for training per device (CPU/GPU).

--per_device_eval_batch_size=16: Batch size for evaluation per device.

--num_train_epochs=3: Number of full passes through the training data.

--learning_rate=2e-5: Initial learning rate for optimizer.

--report_to="none": Disable automatic logging to platforms like W&B or TensorBoard.

--logging_steps=100: Log training progress every 100 steps.


In [10]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./outputs",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    report_to="none",
    logging_steps=100,
    no_cuda=True
)



In [11]:
# Metric for evaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

In [12]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,0.2986
200,0.1575
300,0.1472
400,0.1521
500,0.1309
600,0.1521
700,0.1535


TrainOutput(global_step=750, training_loss=0.16571710332234701, metrics={'train_runtime': 509.509, 'train_samples_per_second': 23.552, 'train_steps_per_second': 1.472, 'total_flos': 404218220544000.0, 'train_loss': 0.16571710332234701, 'epoch': 3.0})

In [13]:
# Create DataLoader
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"]  # pass as `labels`, not `label`
        )
        preds = torch.argmax(outputs.logits, dim=1)
        correct += (preds == batch["label"]).sum().item()
        total += batch["label"].size(0)

accuracy = correct / total
print(f"\nCustom Evaluation Accuracy: {accuracy:.4f}")


Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████| 63/63 [00:13<00:00,  4.69it/s]


Custom Evaluation Accuracy: 0.9710





In [15]:
model.save_pretrained("./outputs")
tokenizer.save_pretrained("./outputs")

('./outputs/tokenizer_config.json',
 './outputs/special_tokens_map.json',
 './outputs/vocab.txt',
 './outputs/added_tokens.json',
 './outputs/tokenizer.json')