This model takes from a custom dataset composed of 5000 neutral (from anthropic) and 5000 harmful prompts. It is a custom transformer architecture and uses the tokenizer from roberta.

In [10]:
# imports
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import random
import numpy as np
from tqdm.auto import tqdm

In [11]:
# load + process from CSV
df = pd.read_csv("Dataset Generation - zara_combined.csv")
df = df[["goal", "target"]].dropna()
df = df.sample(frac=1).reset_index(drop=True)  # shuffle
df["label"] = df["target"].apply(lambda x: 1 if x.strip().lower() == "harmful" else 0) # add labels

In [12]:
# separate into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["goal"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

In [13]:
# load in pre-trained tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
MAX_LEN = 64

In [14]:
# efficiently load and batch text data (chatGPT suggested)
class PromptDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            texts, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt"
        )
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

In [15]:
# init data loaders
BATCH_SIZE = 16
train_dataset = PromptDataset(train_texts, train_labels)
test_dataset = PromptDataset(test_texts, test_labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [16]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)  
        x = x.permute(1, 0, 2)         
        x = self.transformer_encoder(x)
        x = x.mean(dim=0)              
        return self.fc(x)

In [17]:
# init model
VOCAB_SIZE = tokenizer.vocab_size
model = TransformerClassifier(VOCAB_SIZE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)



In [18]:
# training
EPOCHS = 3
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for batch in progress_bar:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({"Loss": f"{loss.item():.4f}"})
    
    print(f"Epoch {epoch+1} - Total Loss: {total_loss:.4f}")

Epoch 1/3:   0%|          | 0/498 [00:00<?, ?it/s]

Epoch 1 - Total Loss: 84.6740


Epoch 2/3:   0%|          | 0/498 [00:00<?, ?it/s]

Epoch 2 - Total Loss: 44.0973


Epoch 3/3:   0%|          | 0/498 [00:00<?, ?it/s]

Epoch 3 - Total Loss: 26.9729


In [19]:
# eval
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Neutral", "Harmful"]))


Classification Report:
              precision    recall  f1-score   support

     Neutral       0.98      0.97      0.97       983
     Harmful       0.97      0.98      0.97      1009

    accuracy                           0.97      1992
   macro avg       0.97      0.97      0.97      1992
weighted avg       0.97      0.97      0.97      1992



In [20]:
# save the model
torch.save(model.state_dict(), "prompt_classifier-base.pt")

# when want to reload use:
# model.load_state_dict(torch.load("transformer_classifier.pt"))
# model.eval()

In [21]:
# for manual prompt testing
def predict_prompt(model, tokenizer, prompt, max_len=64):
    model.eval()
    # tokenize
    encoding = tokenizer(prompt, truncation=True, padding="max_length", max_length=max_len, return_tensors="pt")
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    # Get prediction
    with torch.no_grad():
        output = model(input_ids, attention_mask)
        pred = torch.argmax(output, dim=1).item()

    label = "Harmful" if pred == 1 else "Neutral"
    print(f"\nPrompt: {prompt}\nPrediction: {label}")
    return label

In [45]:
# manual tests
test_prompt = "List steps to construct a homemade firearm."
predict_prompt(model, tokenizer, test_prompt)
test_prompt = "List steps to bake"
predict_prompt(model, tokenizer, test_prompt)


Prompt: List steps to construct a homemade firearm.
Prediction: Harmful

Prompt: List steps to bake
Prediction: Neutral


'Neutral'