In [1]:
# !pip install transformers textattack textblob
# !python -m textblob.download_corpora

In [8]:
from textblob import TextBlob
from transformers import pipeline

# Spelling correction
def correct_spelling(text):
    return str(TextBlob(text).correct())

# Language model purification (e.g., FLAN-T5)
generator = pipeline("text2text-generation", model="google/flan-t5-small")

def purify_text(text):
    prompt = f"Correct and rewrite the sentence: {text}"
    result = generator(prompt, max_length=100, do_sample=False)
    return result[0]['generated_text']

# Combine both
def purify_pipeline(text):
    corrected = correct_spelling(text)
    purified = purify_text(corrected)
    return purified




In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tabulate import tabulate

def evaluate(dataset_name, model_type, attack_type):
    df = pd.read_csv(f"{model_type}_{dataset_name}_{attack_type}_attack.csv")
    purified_csv = f"{model_type}_{dataset_name}_{attack_type}_defense.csv"
    
    adv_texts = []
    purified_texts = []
    pred_labels = []
    true_labels = []
    recovery_flags = []

    model_name = {
        ("bert", "imdb"): "textattack/bert-base-uncased-imdb",
        ("bert", "sst2"): "textattack/bert-base-uncased-SST-2",
        ("roberta", "imdb"): "textattack/roberta-base-imdb",
        ("roberta", "sst2"): "textattack/roberta-base-SST-2"
    }[(model_type, dataset_name)]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.eval()

    total = len(df)
    successful_attacks = df["result_type"].value_counts().get("Successful", 0)

    recovered = 0
    for _, row in df.iterrows():
        if row["result_type"] != "Successful":
            continue

        adv_text = row["perturbed_text"]
        true_label = row["ground_truth_output"]

        purified_text = purify_pipeline(adv_text)

        inputs = tokenizer(purified_text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            pred_label = logits.argmax(dim=-1).item()

        if str(pred_label) == str(true_label):
            recovered += 1
        
        adv_texts.append(adv_text)
        purified_texts.append(purified_text)
        pred_labels.append(pred_label)
        true_labels.append(true_label)
        recovery_flags.append(str(pred_label) == str(true_label))
        
    purified_df = pd.DataFrame({
                    "adv_text": adv_texts,
                    "purified_text": purified_texts,
                    "predicted_label": pred_labels,
                    "true_label": true_labels,
                    "recovered": recovery_flags
                })
    
    purified_df.to_csv(purified_csv, index=False)

    recovery_rate = recovered / successful_attacks if successful_attacks > 0 else 0.0
    attack_success_rate = successful_attacks / total if total > 0 else 0.0
    post_attack_accuracy = 1 - attack_success_rate
    post_purification_accuracy = (total - successful_attacks + recovered) / total

    table = [
        ["Total Attacked Examples", total],
        ["Successful Adversarial Attacks", successful_attacks],
        ["Recovered After Purification", recovered],
        ["Model Accuracy After Attack", f"{post_attack_accuracy:.2%}"],
        ["Model Accuracy After Purification", f"{post_purification_accuracy:.2%}"],
        ["Purification Recovery Rate", f"{recovery_rate:.2%}"]
    ]

    print(tabulate(table, headers=["Metric", "Value"], tablefmt="grid"))


In [16]:
def run_all_evasion_and_purification():
    model_types = ["bert", "roberta"]
    dataset_names = ["imdb", "sst2"]
    attack_types = ["pwws", "textfooler"]

    for model_type in model_types:
        for dataset_name in dataset_names:
            for attack_type in attack_types:
                print(f"\n=== Running: {attack_type.upper()} for {model_type.upper()} on {dataset_name.upper()} ===")
                evaluate(dataset_name, model_type, attack_type)
            


In [17]:
run_all_evasion_and_purification()


=== Running: PWWS for BERT on IMDB ===




+-----------------------------------+---------+
| Metric                            | Value   |
| Total Attacked Examples           | 20      |
+-----------------------------------+---------+
| Successful Adversarial Attacks    | 18      |
+-----------------------------------+---------+
| Recovered After Purification      | 13      |
+-----------------------------------+---------+
| Model Accuracy After Attack       | 10.00%  |
+-----------------------------------+---------+
| Model Accuracy After Purification | 75.00%  |
+-----------------------------------+---------+
| Purification Recovery Rate        | 72.22%  |
+-----------------------------------+---------+

=== Running: TEXTFOOLER for BERT on IMDB ===




+-----------------------------------+---------+
| Metric                            | Value   |
| Total Attacked Examples           | 20      |
+-----------------------------------+---------+
| Successful Adversarial Attacks    | 18      |
+-----------------------------------+---------+
| Recovered After Purification      | 12      |
+-----------------------------------+---------+
| Model Accuracy After Attack       | 10.00%  |
+-----------------------------------+---------+
| Model Accuracy After Purification | 70.00%  |
+-----------------------------------+---------+
| Purification Recovery Rate        | 66.67%  |
+-----------------------------------+---------+

=== Running: PWWS for BERT on SST2 ===




+-----------------------------------+---------+
| Metric                            | Value   |
| Total Attacked Examples           | 20      |
+-----------------------------------+---------+
| Successful Adversarial Attacks    | 18      |
+-----------------------------------+---------+
| Recovered After Purification      | 11      |
+-----------------------------------+---------+
| Model Accuracy After Attack       | 10.00%  |
+-----------------------------------+---------+
| Model Accuracy After Purification | 65.00%  |
+-----------------------------------+---------+
| Purification Recovery Rate        | 61.11%  |
+-----------------------------------+---------+

=== Running: TEXTFOOLER for BERT on SST2 ===




+-----------------------------------+---------+
| Metric                            | Value   |
| Total Attacked Examples           | 20      |
+-----------------------------------+---------+
| Successful Adversarial Attacks    | 18      |
+-----------------------------------+---------+
| Recovered After Purification      | 12      |
+-----------------------------------+---------+
| Model Accuracy After Attack       | 10.00%  |
+-----------------------------------+---------+
| Model Accuracy After Purification | 70.00%  |
+-----------------------------------+---------+
| Purification Recovery Rate        | 66.67%  |
+-----------------------------------+---------+

=== Running: PWWS for ROBERTA on IMDB ===


Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


+-----------------------------------+---------+
| Metric                            | Value   |
| Total Attacked Examples           | 20      |
+-----------------------------------+---------+
| Successful Adversarial Attacks    | 18      |
+-----------------------------------+---------+
| Recovered After Purification      | 8       |
+-----------------------------------+---------+
| Model Accuracy After Attack       | 10.00%  |
+-----------------------------------+---------+
| Model Accuracy After Purification | 50.00%  |
+-----------------------------------+---------+
| Purification Recovery Rate        | 44.44%  |
+-----------------------------------+---------+

=== Running: TEXTFOOLER for ROBERTA on IMDB ===


Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


+-----------------------------------+---------+
| Metric                            | Value   |
| Total Attacked Examples           | 20      |
+-----------------------------------+---------+
| Successful Adversarial Attacks    | 18      |
+-----------------------------------+---------+
| Recovered After Purification      | 7       |
+-----------------------------------+---------+
| Model Accuracy After Attack       | 10.00%  |
+-----------------------------------+---------+
| Model Accuracy After Purification | 45.00%  |
+-----------------------------------+---------+
| Purification Recovery Rate        | 38.89%  |
+-----------------------------------+---------+

=== Running: PWWS for ROBERTA on SST2 ===


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


+-----------------------------------+---------+
| Metric                            | Value   |
| Total Attacked Examples           | 20      |
+-----------------------------------+---------+
| Successful Adversarial Attacks    | 19      |
+-----------------------------------+---------+
| Recovered After Purification      | 14      |
+-----------------------------------+---------+
| Model Accuracy After Attack       | 5.00%   |
+-----------------------------------+---------+
| Model Accuracy After Purification | 75.00%  |
+-----------------------------------+---------+
| Purification Recovery Rate        | 73.68%  |
+-----------------------------------+---------+

=== Running: TEXTFOOLER for ROBERTA on SST2 ===


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


+-----------------------------------+---------+
| Metric                            | Value   |
| Total Attacked Examples           | 20      |
+-----------------------------------+---------+
| Successful Adversarial Attacks    | 20      |
+-----------------------------------+---------+
| Recovered After Purification      | 11      |
+-----------------------------------+---------+
| Model Accuracy After Attack       | 0.00%   |
+-----------------------------------+---------+
| Model Accuracy After Purification | 55.00%  |
+-----------------------------------+---------+
| Purification Recovery Rate        | 55.00%  |
+-----------------------------------+---------+
