Dataset: `jackhhao/jailbreak-classification`

Model: `distilbert/distilbert-base-uncased`


Preparing Requirements

In [1]:
%%writefile requirements.txt
torch
pytorch-ignite
transformers
datasets
evaluate
accelerate

Writing requirements.txt


In [2]:
! pip install -r requirements.txt

Collecting pytorch-ignite (from -r requirements.txt (line 2))
  Downloading pytorch_ignite-0.5.1-py3-none-any.whl.metadata (27 kB)
Collecting datasets (from -r requirements.txt (line 4))
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate (from -r requirements.txt (line 5))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r requirements.txt (line 4))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r requirements.txt (line 4))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets->-r requirements.txt (line 4))
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py

Datasets

In [3]:
from datasets import load_dataset

dataset = load_dataset("jackhhao/jailbreak-classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/988 [00:00<?, ?B/s]

(…)ced/jailbreak_dataset_train_balanced.csv:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

(…)nced/jailbreak_dataset_test_balanced.csv:   0%|          | 0.00/370k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1044 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/262 [00:00<?, ? examples/s]

In [4]:
new_split = dataset['train'].train_test_split(test_size=0.1)
dataset['train'] = new_split['train']
dataset['validation'] = new_split['test']

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'type'],
        num_rows: 939
    })
    test: Dataset({
        features: ['prompt', 'type'],
        num_rows: 262
    })
    validation: Dataset({
        features: ['prompt', 'type'],
        num_rows: 105
    })
})

In [6]:
labels = set(dataset['train']['type'])
labels

{'benign', 'jailbreak'}

In [7]:
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}
print(label2id)
print(id2label)

{'benign': 0, 'jailbreak': 1}
{0: 'benign', 1: 'jailbreak'}


Preprocessing

In [8]:
# tokenizer
from transformers import AutoTokenizer

model_name = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [9]:
def tokenize_function(examples):
    return tokenizer(examples['prompt'], truncation=True, padding=True)

tokenized_data = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/939 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

In [10]:
def lable_function(examples):
    examples['labels'] = [label2id[label] for label in examples['type']]
    return examples

tokenized_data = tokenized_data.map(lable_function, batched=True)


Map:   0%|          | 0/939 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

In [11]:
tokenized_data['train'][0]['labels'], tokenized_data['train'][0]['type']

(1, 'jailbreak')

In [12]:
trainable_data = tokenized_data.remove_columns(['prompt', 'type'])
trainable_data.set_format('torch')

In [13]:
train_dataset = trainable_data['train']
eval_dataset = trainable_data['validation']
# Dataloader

from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(eval_dataset, batch_size=8)

In [14]:
next(iter(train_dataloader))

{'input_ids': tensor([[  101,  9699,  1037,  ...,     0,     0,     0],
         [  101,  7592,  1010,  ...,     0,     0,     0],
         [  101, 20996, 28114,  ...,     0,     0,     0],
         ...,
         [  101,  3531,  3437,  ...,     0,     0,     0],
         [  101,  9811,  2000,  ...,     0,     0,     0],
         [  101,  2198,  2001,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 1, 0, 1, 1, 0, 0, 0])}

In [15]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(label2id))

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from torch.optim import AdamW
import torch
optimizer = AdamW(model.parameters(), lr=5e-5)

from transformers import get_linear_schedule_with_warmup

epochs = 5

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=len(train_dataloader) * epochs
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [19]:
for epoch in range(epochs):
    for batch_idx, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        if batch_idx % 100 == 0:
          print(f"Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item()}")
    print(f"Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item()}")


Epoch: 0, Batch: 0, Loss: 0.01661505550146103
Epoch: 0, Batch: 100, Loss: 0.010079759173095226
Epoch: 0, Batch: 117, Loss: 0.014704465866088867
Epoch: 1, Batch: 0, Loss: 0.007388838566839695
Epoch: 1, Batch: 100, Loss: 0.006234216503798962
Epoch: 1, Batch: 117, Loss: 0.00402008555829525
Epoch: 2, Batch: 0, Loss: 0.0007825899519957602
Epoch: 2, Batch: 100, Loss: 0.008465833030641079
Epoch: 2, Batch: 117, Loss: 0.0019398871809244156
Epoch: 3, Batch: 0, Loss: 0.002364306477829814
Epoch: 3, Batch: 100, Loss: 0.0005951715284027159
Epoch: 3, Batch: 117, Loss: 0.0008909486350603402
Epoch: 4, Batch: 0, Loss: 0.0005019617965444922
Epoch: 4, Batch: 100, Loss: 0.000442863063653931
Epoch: 4, Batch: 117, Loss: 0.0003750270407181233


In [20]:
model.save_pretrained('distilbert-base-uncased_jailbreaker_classification')

In [21]:
! pip install huggingface_hub



In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
model.push_to_hub("distilbert-base-uncased_jailbreaker_classification")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/PradhyumnaPoralla/distilbert-base-uncased_jailbreaker_classification/commit/fa368cff46ac26800278111a4c302ef9247e2443', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='fa368cff46ac26800278111a4c302ef9247e2443', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
from transformers import DistilBertForSequenceClassification

model_name = 'PradhyumnaPoralla/distilbert-base-uncased_jailbreaker_classification'

model = DistilBertForSequenceClassification.from_pretrained(model_name)


config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [24]:
for batch in eval_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}

  outputs = model(**batch)


  accuracy = sum(batch['labels']==outputs.logits.argmax(-1))/len(batch['labels'])
  print(accuracy)



tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')


In [27]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 105
})

In [32]:
test_data = tokenized_data['test'].remove_columns(['prompt', 'type'])
test_data.set_format('torch')
test_data = DataLoader(test_data, batch_size=8)

for batch in test_data:
  batch = {k: v.to(device) for k, v in batch.items()}

  outputs = model(**batch)


  accuracy = sum(batch['labels']==outputs.logits.argmax(-1))/len(batch['labels'])
  print(accuracy)

tensor(0.8750, device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(0.8750, device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(0.8750, device='cuda:0')
tensor(1., device='cuda:0')
tensor(0.8750, device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
