In [4]:
# Install TextAttack into the environment of the running kernel
# (%pip, unlike !pip, targets the kernel's own environment).
# NOTE(review): the version is unpinned — consider pinning it for reproducibility.
%pip install textattack


Note: you may need to restart the kernel to use updated packages.


In [1]:
import nltk

# Fetch the NLTK "stopwords" corpus into the local nltk_data directory.
# NOTE(review): presumably required by the attack tooling used below — confirm.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /Users/main/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import promptbench as pb
from promptbench.models import LLMModel
from promptbench.prompt_attack import Attack

In [3]:
import torch

# Report the installed PyTorch version, then whether CUDA is usable here.
for probe in (torch.__version__, torch.cuda.is_available()):
    print(probe)

2.2.0
False


In [4]:
# Seed PyTorch's RNG before any generation happens. The model below is created
# with do_sample=True and a non-zero temperature, so without a fixed seed the
# reported accuracies are not comparable between reruns.
# NOTE(review): this only seeds torch; if the attack itself draws from other
# RNGs (random / numpy), seed those as well — confirm against the library.
SEED = 0
torch.manual_seed(SEED)

# Number of examples taken from the front of the dataset for this demo
N_EVAL_SAMPLES = 10

# Model under attack, loaded on CPU
model_t5 = LLMModel(model='google/flan-t5-large', device='cpu', temperature=0.7, do_sample=True)


# create dataset
dataset = pb.DatasetLoader.load_dataset("sst2")

# try part of the dataset
dataset = dataset[:N_EVAL_SAMPLES]

# create prompt; {content} is the placeholder filled in for each example
prompt = "As a sentiment classifier, determine whether the following text is 'positive' or 'negative'. Please classify: \nQuestion: {content}\nAnswer:"

# define the projection function required by the output process
def proj_func(pred):
    """Project a predicted label string onto its integer class id.

    Args:
        pred: The predicted label text.

    Returns:
        1 for "positive", 0 for "negative", and -1 for any other value.
    """
    label_ids = dict(positive=1, negative=0)
    try:
        return label_ids[pred]
    except KeyError:
        # Anything that is not an exact label match is reported as -1
        return -1

# define the evaluation function required by the attack
# if the prompt does not require any dataset, for example, "write a poem", you still need to include the dataset parameter
def eval_func(prompt, dataset, model):
    """Score `prompt` on `dataset` using `model`.

    For every example, the prompt is formatted with the example, sent to the
    model, and the raw output is converted to a class id via
    `pb.OutputProcess.cls` with `proj_func`.

    Args:
        prompt: Prompt template passed to `pb.InputProcess.basic_format`.
        dataset: Iterable of examples; each must provide a "label" entry.
        model: Callable that takes the formatted input text and returns the raw output.

    Returns:
        The result of `pb.Eval.compute_cls_accuracy` on the predictions and labels.
    """
    scored = []
    for sample in dataset:
        query = pb.InputProcess.basic_format(prompt, sample)
        answer = pb.OutputProcess.cls(model(query), proj_func)
        # keep each prediction next to its gold label
        scored.append((answer, sample["label"]))

    preds = [guess for guess, _ in scored]
    labels = [gold for _, gold in scored]
    return pb.Eval.compute_cls_accuracy(preds, labels)
    
# Words in the prompt that the attack is not allowed to change:
#   - the label words "positive" and "negative"
#   - "content", the name of the {content} placeholder
# The labels are wrapped in single quotes in the prompt, so each label entry
# carries a trailing ' (needed because of how textattack treats such words).
unmodifiable_words = ["positive'", "negative'", "content"]

# Show every attack name the library supports
supported_attacks = Attack.attack_list()
print(supported_attacks)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['textbugger', 'deepwordbug', 'textfooler', 'bertattack', 'checklist', 'stresstest', 'semantic']


In [5]:
# Set up the "stresstest" attack against the prompt, with progress logging on
attack = Attack(
    model_t5,
    "stresstest",
    dataset,
    prompt,
    eval_func,
    unmodifiable_words,
    verbose=True,
)

# Run the attack and print what it returns
attack_result = attack.attack()
print(attack_result)

These words (if they appear in the prompt) are not allowed to be attacked:
["positive'", "negative'", 'content']
--------------------------------------------------
Current prompt is:  As a sentiment classifier, determine whether the following text is 'positive' or 'negative'. Please classify: 
Question: {content}
Answer:
Current accuracy is:  1.0
--------------------------------------------------

--------------------------------------------------
Modifiable words:  ['As', 'a', 'sentiment', 'classifier', 'determine', 'whether', 'the', 'following', 'text', 'is', 'or', 'Please', 'classify', 'Question', 'Answer']
--------------------------------------------------

--------------------------------------------------
Current prompt is:  As a sentiment classifier, determine whether the following text is 'positive' or 'negative'. Please classify: 
Question: {content}
Answer  and false is not true :
Current accuracy is:  1.0
--------------------------------------------------

------------------