In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
from textattack.attack_recipes import TextFoolerJin2019, PWWSRen2019
from textattack.models.wrappers import HuggingFaceModelWrapper
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from textattack import Attacker, AttackArgs
from textattack.datasets import HuggingFaceDataset
import pandas as pd

2025-06-14 23:46:20.853959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749933980.869296 1397280 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749933980.874108 1397280 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749933980.887365 1397280 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749933980.887380 1397280 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749933980.887381 1397280 computation_placer.cc:177] computation placer alr

In [2]:
# os.system("textattack train "
#          "--model bert-base-uncased "
#          "--dataset imdb "
#          "--model-max-length 128 "
#          "--per-device-train-batch-size 32 "
#          "--num-epochs 3 "
#          "--output-dir ./exp")

In [3]:
def get_model_attack(dataset_name, model_type, attack_type):
    """Build a TextAttack recipe targeting a fine-tuned sentiment classifier.

    Args:
        dataset_name: Dataset the victim model was fine-tuned on,
            ``"imdb"`` or ``"sst2"``.
        model_type: Victim architecture, ``"bert"`` or ``"roberta"``.
        attack_type: Attack recipe, ``"textfooler"`` or ``"pwws"``.

    Returns:
        The attack object produced by the selected recipe's ``build``.

    Raises:
        ValueError: If ``attack_type`` is not one of the supported recipes.
        KeyError: If there is no checkpoint registered for the
            ``(model_type, dataset_name)`` pair.
    """
    # Reject unknown recipes up front: previously any unrecognised value
    # silently fell through to PWWS, and failing here also avoids loading a
    # model that would never be used.
    if attack_type not in ("textfooler", "pwws"):
        raise ValueError(
            f"Unsupported attack_type {attack_type!r}; "
            "expected 'textfooler' or 'pwws'"
        )

    # Checkpoint identifiers keyed by (model_type, dataset_name).
    model_name = {
        ("bert", "imdb"): "textattack/bert-base-uncased-imdb",
        ("bert", "sst2"): "textattack/bert-base-uncased-SST-2",
        ("roberta", "imdb"): "textattack/roberta-base-imdb",
        ("roberta", "sst2"): "textattack/roberta-base-SST-2"
    }[(model_type, dataset_name)]

    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()  # inference mode for the victim model
    model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

    recipe = TextFoolerJin2019 if attack_type == "textfooler" else PWWSRen2019
    return recipe.build(model_wrapper)
    

def get_dataset(dataset_name):
    """Return the evaluation split wrapped as a TextAttack dataset.

    Args:
        dataset_name: ``"imdb"`` (uses the ``test`` split) or ``"sst2"``
            (uses the ``validation`` split of the ``glue``/``sst2`` subset).

    Returns:
        A ``HuggingFaceDataset`` for the requested corpus.

    Raises:
        ValueError: If ``dataset_name`` is not a supported name. Previously
            any unrecognised value silently loaded SST-2.
    """
    if dataset_name == "imdb":
        return HuggingFaceDataset("imdb", split="test")
    if dataset_name == "sst2":
        return HuggingFaceDataset("glue", "sst2", split="validation")
    raise ValueError(
        f"Unsupported dataset_name {dataset_name!r}; expected 'imdb' or 'sst2'"
    )
    
def evaluate(dataset_name, model_type, attack_type):
    """Print and return the success fraction recorded in an attack log.

    Reads ``{model_type}_{dataset_name}_{attack_type}_attack.csv`` from the
    current working directory and computes the share of rows whose
    ``result_type`` column equals ``"Successful"``.

    The denominator is every row in the log, so skipped samples count
    against the rate. This is why the value can be lower than the
    "Attack success rate" line in TextAttack's own summary table.

    Args:
        dataset_name: Dataset component of the log file name.
        model_type: Model component of the log file name.
        attack_type: Attack component of the log file name; also used to
            label the printed line.

    Returns:
        The fraction of successful rows, or ``0.0`` if there are none.
    """
    df_db = pd.read_csv(f"{model_type}_{dataset_name}_{attack_type}_attack.csv")
    success_rate = df_db["result_type"].value_counts(normalize=True).get("Successful", 0.0)

    # Label the line with the attack that actually ran; the label used to be
    # hard-coded to "TextFooler", which mislabelled every PWWS run.
    attack_label = {"textfooler": "TextFooler", "pwws": "PWWS"}.get(attack_type, attack_type)
    print(f"{dataset_name} {attack_label} Success Rate: {success_rate:.2%}")
    return success_rate
    
def evasion_eval(dataset_name, model_type, attack_type, num_examples=20):
    """Run one attack configuration end to end and report its success rate.

    Builds the attack and dataset, attacks ``num_examples`` samples while
    logging results to ``{model_type}_{dataset_name}_{attack_type}_attack.csv``,
    then calls ``evaluate`` on that log.

    Args:
        dataset_name: Dataset to attack (passed to ``get_model_attack`` and
            ``get_dataset``).
        model_type: Victim model family (passed to ``get_model_attack``).
        attack_type: Attack recipe name (passed to ``get_model_attack``).
        num_examples: Number of samples to attack. Defaults to 20, the value
            that was previously hard-coded.
    """
    # evaluate() rebuilds this exact file name from the same three arguments,
    # so the pattern must stay in sync with it.
    csv_path = f"{model_type}_{dataset_name}_{attack_type}_attack.csv"

    attack = get_model_attack(dataset_name, model_type, attack_type)
    dataset = get_dataset(dataset_name)

    attack_args = AttackArgs(
        num_examples=num_examples,  # Number of samples to attack
        disable_stdout=True,
        log_to_csv=csv_path,
    )

    attacker = Attacker(attack, dataset, attack_args)
    attacker.attack_dataset()

    evaluate(dataset_name, model_type, attack_type)

In [4]:
def run_all_evasion_attack():
    """Run every model / dataset / attack combination in sequence.

    Iteration order is model first, then dataset, then attack, with the
    attack varying fastest. A banner is printed before each run and the
    work is delegated to ``evasion_eval``.
    """
    # Flatten the three nested loops into one ordered list of combinations.
    combos = [
        (victim, corpus, recipe)
        for victim in ("bert", "roberta")
        for corpus in ("imdb", "sst2")
        for recipe in ("pwws", "textfooler")
    ]

    for victim, corpus, recipe in combos:
        print(f"\n=== Running: {recipe.upper()} for {victim.upper()} on {corpus.upper()} ===")
        evasion_eval(corpus, victim, recipe)

In [5]:
run_all_evasion_attack()


=== Running: PWWS for BERT on IMDB ===


[nltk_data] Downloading package omw-1.4 to /home/sitare/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Loading [94mdatasets[0m dataset [94mimdb[0m, split [94mtest[0m.
textattack: Logging to CSV at path bert_imdb_pwws_attack.csv


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  weighted-saliency
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapWordNet
  (constraints): 
    (0): RepeatModification
    (1): StopwordModification
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 18 / 0 / 2 / 20: 100%|█| 20/20 [10:21<00:00, 31.08s/


+-------------------------------+---------+
| Attack Results                |         |
+-------------------------------+---------+
| Number of successful attacks: | 18      |
| Number of failed attacks:     | 0       |
| Number of skipped attacks:    | 2       |
| Original accuracy:            | 90.0%   |
| Accuracy under attack:        | 0.0%    |
| Attack success rate:          | 100.0%  |
| Average perturbed word %:     | 6.86%   |
| Average num. words per input: | 203.75  |
| Avg num queries:              | 1457.22 |
+-------------------------------+---------+





imdb TextFooler Success Rate: 90.00%

=== Running: TEXTFOOLER for BERT on IMDB ===


textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Loading [94mdatasets[0m dataset [94mimdb[0m, split [94mtest[0m.
textattack: Logging to CSV at path bert_imdb_textfooler_attack.csv


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

  0%|                                                               | 0/20 [00:00<?, ?it/s]W0000 00:00:1749934631.083080 1397280 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
[Succeeded / Failed / Skipped / Total] 18 / 0 / 2 / 20: 100%|█| 20/20 [05:09<00:00, 15.47s/


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 18     |
| Number of failed attacks:     | 0      |
| Number of skipped attacks:    | 2      |
| Original accuracy:            | 90.0%  |
| Accuracy under attack:        | 0.0%   |
| Attack success rate:          | 100.0% |
| Average perturbed word %:     | 11.04% |
| Average num. words per input: | 203.75 |
| Avg num queries:              | 624.67 |
+-------------------------------+--------+





imdb TextFooler Success Rate: 90.00%

=== Running: PWWS for BERT on SST2 ===


[nltk_data] Downloading package omw-1.4 to /home/sitare/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Loading [94mdatasets[0m dataset [94mglue[0m, subset [94msst2[0m, split [94mvalidation[0m.
textattack: Logging to CSV at path bert_sst2_pwws_attack.csv


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  weighted-saliency
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapWordNet
  (constraints): 
    (0): RepeatModification
    (1): StopwordModification
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 18 / 1 / 1 / 20: 100%|█| 20/20 [00:41<00:00,  2.06s/


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 18     |
| Number of failed attacks:     | 1      |
| Number of skipped attacks:    | 1      |
| Original accuracy:            | 95.0%  |
| Accuracy under attack:        | 5.0%   |
| Attack success rate:          | 94.74% |
| Average perturbed word %:     | 17.81% |
| Average num. words per input: | 15.8   |
| Avg num queries:              | 113.63 |
+-------------------------------+--------+





sst2 TextFooler Success Rate: 90.00%

=== Running: TEXTFOOLER for BERT on SST2 ===


textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Loading [94mdatasets[0m dataset [94mglue[0m, subset [94msst2[0m, split [94mvalidation[0m.
textattack: Logging to CSV at path bert_sst2_textfooler_attack.csv


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 18 / 1 / 1 / 20: 100%|█| 20/20 [00:53<00:00,  2.69s/


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 18     |
| Number of failed attacks:     | 1      |
| Number of skipped attacks:    | 1      |
| Original accuracy:            | 95.0%  |
| Accuracy under attack:        | 5.0%   |
| Attack success rate:          | 94.74% |
| Average perturbed word %:     | 22.69% |
| Average num. words per input: | 15.8   |
| Avg num queries:              | 111.47 |
+-------------------------------+--------+





sst2 TextFooler Success Rate: 90.00%

=== Running: PWWS for ROBERTA on IMDB ===


Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package omw-1.4 to /home/sitare/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
textattack: Unknown if model of class <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'> compatible with goal function <class 'textattack.goal_fun

Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  weighted-saliency
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapWordNet
  (constraints): 
    (0): RepeatModification
    (1): StopwordModification
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 18 / 0 / 2 / 20: 100%|█| 20/20 [09:52<00:00, 29.63s/


+-------------------------------+---------+
| Attack Results                |         |
+-------------------------------+---------+
| Number of successful attacks: | 18      |
| Number of failed attacks:     | 0       |
| Number of skipped attacks:    | 2       |
| Original accuracy:            | 90.0%   |
| Accuracy under attack:        | 0.0%    |
| Attack success rate:          | 100.0%  |
| Average perturbed word %:     | 5.51%   |
| Average num. words per input: | 203.75  |
| Avg num queries:              | 1410.56 |
+-------------------------------+---------+





imdb TextFooler Success Rate: 90.00%

=== Running: TEXTFOOLER for ROBERTA on IMDB ===


Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
textattack: Unknown if model of class <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Loading [94mdatasets[0m dataset 

Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 18 / 0 / 2 / 20: 100%|█| 20/20 [03:48<00:00, 11.41s/


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 18     |
| Number of failed attacks:     | 0      |
| Number of skipped attacks:    | 2      |
| Original accuracy:            | 90.0%  |
| Accuracy under attack:        | 0.0%   |
| Attack success rate:          | 100.0% |
| Average perturbed word %:     | 7.78%  |
| Average num. words per input: | 203.75 |
| Avg num queries:              | 459.5  |
+-------------------------------+--------+





imdb TextFooler Success Rate: 90.00%

=== Running: PWWS for ROBERTA on SST2 ===


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package omw-1.4 to /home/sitare/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
textattack: Unknown if model of class <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'> compatible with goal function <class 'textattack.goal_fu

Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  weighted-saliency
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapWordNet
  (constraints): 
    (0): RepeatModification
    (1): StopwordModification
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 19 / 1 / 0 / 20: 100%|█| 20/20 [00:40<00:00,  2.02s/


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 19     |
| Number of failed attacks:     | 1      |
| Number of skipped attacks:    | 0      |
| Original accuracy:            | 100.0% |
| Accuracy under attack:        | 5.0%   |
| Attack success rate:          | 95.0%  |
| Average perturbed word %:     | 16.37% |
| Average num. words per input: | 15.8   |
| Avg num queries:              | 107.7  |
+-------------------------------+--------+





sst2 TextFooler Success Rate: 95.00%

=== Running: TEXTFOOLER for ROBERTA on SST2 ===


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
textattack: Unknown if model of class <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Loading [94mdatasets[0m dataset

Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 20 / 0 / 0 / 20: 100%|█| 20/20 [00:49<00:00,  2.49s/


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 20     |
| Number of failed attacks:     | 0      |
| Number of skipped attacks:    | 0      |
| Original accuracy:            | 100.0% |
| Accuracy under attack:        | 0.0%   |
| Attack success rate:          | 100.0% |
| Average perturbed word %:     | 22.83% |
| Average num. words per input: | 15.8   |
| Avg num queries:              | 99.45  |
+-------------------------------+--------+
sst2 TextFooler Success Rate: 100.00%



