In [None]:
import jailbreakbench as jbb
import os
import pandas as pd

import numpy as np
from sklearn.svm import SVC
from scipy.special import softmax

from SPD.main_spd import calculate_logits

# Limit the CUDA devices visible to this process to device index 0.
visible_gpus = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpus

### 1. Save the logits
For each dataset, calculate the logit values and save them in separate folders. Hyperparameters such as temperature, top-p, and top-k can be changed by editing the SPD/config_spd.py file.

In [None]:
# Output directory for the computed logits
write_path = "logits/gcg/test"

# Read the dataset and select the column named after the target model
model_name = "llama-2-7b-chat-hf"
data_path = "data/Llama2/GCG.json"
data = pd.read_json(path_or_buf=data_path)[model_name]

# Gather the "prompt" field of the selected entries, re-keyed 0..n-1
ind_to_read = list(range(10))
prompts = {pos: data[entry]["prompt"] for pos, entry in enumerate(ind_to_read)}

# calculate_logits receives the prompts grouped under the model name
all_prompts = {model_name: prompts}

# Saves the logit values in write_path/model_name
calculate_logits(write_path, all_prompts, llm_provider="vllm")

### 2. Load training data

Load the saved logit values and prepare the feature vector. Here $r$ determines the number of token locations considered, while $k$ determines how many candidate logit values are considered at each location.

In [None]:
model_name = "llama-2-7b-chat-hf"

train_size = 100
r = 5   # Number of token places to consider
k = 25  # Number of candidate logit values to consider

# Locations of the train data.
# Order matters: benign_indexes / attack_indexes below are positions in this list.
# Every entry needs its own trailing comma -- adjacent string literals are
# silently concatenated into a single (non-existent) path otherwise.
read_paths_train = ["logits/qnli/train",
                    "logits/alpaca/train",
                    "logits/gcg/train",
                    "logits/autodan/train"]

logit_values_train = []

for path in read_paths_train:
    # Each dataset folder holds one "<model_name>.npy" file with the saved logits
    logits = np.load(os.path.join(path, f"{model_name}.npy"))
    # Turn the logits into negative log-probabilities along the last axis
    probabilities = softmax(logits, axis=2)
    values = - np.log(probabilities)
    # Keep the first train_size entries, the first r token places and the first
    # k candidates, then flatten each entry into one feature vector of length r * k
    logit_values_train.append(values[:train_size,:r,:k].reshape(train_size, r * k))

# Stack to (number of datasets, train_size, r * k) so datasets can be picked by index
logit_values_train = np.array(logit_values_train)
benign_indexes = [0,1]   # qnli, alpaca
attack_indexes = [2,3]   # gcg, autodan

# Merge the selected datasets along the sample axis
train_benign = np.concatenate((logit_values_train[benign_indexes]))
train_attack = np.concatenate((logit_values_train[attack_indexes]))

# Feature matrix and labels: 0 = benign, 1 = attack
train_x = np.concatenate((train_benign,train_attack))
train_y = np.concatenate((np.zeros(train_benign.shape[0]),np.ones(train_attack.shape[0])))

### 3. Train the classifier

In [None]:
# Fit an RBF-kernel SVM on the training features, with balanced class weights
svm_options = {"kernel": "rbf", "class_weight": "balanced"}
clf = SVC(**svm_options)
clf.fit(train_x, train_y)

### 4. Load the test data

Load the saved logit values and prepare the feature vector. The $r$ and $k$ values must be the same as those used for the training data.

In [None]:
# Locations of the test data.
# Order matters: benign_indexes / attack_indexes below are positions in this list.
# Every entry needs its own trailing comma -- adjacent string literals are
# silently concatenated into a single (non-existent) path otherwise.
read_paths_test = ["logits/qnli/test",
                   "logits/alpaca/test",
                   "logits/gcg/test",
                   "logits/autodan/test"]
test_size = 100

logit_values_test = []

# Iterate over the paths themselves (not their indices): os.path.join needs a string.
# model_name, r and k are reused from the training-data cell.
for path in read_paths_test:
    # Each dataset folder holds one "<model_name>.npy" file with the saved logits
    logits = np.load(os.path.join(path, f"{model_name}.npy"))
    # Turn the logits into negative log-probabilities along the last axis
    probabilities = softmax(logits, axis=2)
    values = - np.log(probabilities)
    # Keep the first test_size entries, the first r token places and the first
    # k candidates, then flatten each entry into one feature vector of length r * k
    logit_values_test.append(values[:test_size,:r,:k].reshape(test_size, r * k))

# Stack to (number of datasets, test_size, r * k) so datasets can be picked by index
logit_values_test = np.array(logit_values_test)
benign_indexes = [0,1]   # qnli, alpaca
attack_indexes = [2,3]   # gcg, autodan

# Merge the selected datasets along the sample axis
test_benign = np.concatenate((logit_values_test[benign_indexes]))
test_attack = np.concatenate((logit_values_test[attack_indexes]))

### 5. Test the classifier

Compute the true-positive (TP) and false-positive (FP) rates on the test data.

In [None]:
# Predicted labels for each test split (training used 0 = benign, 1 = attack)
pred_benign = clf.predict(test_benign)
pred_attack = clf.predict(test_attack)

# Percentage of samples predicted as attack: on the attack split this is the
# TP rate, on the benign split it is the FP rate.
for label, preds in (("TP rate:", pred_attack), ("FP rate:", pred_benign)):
    print(label, sum(preds) / len(preds) * 100)