In [None]:
# !pip install textattack transformers torch

# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.attack_recipes import TextFoolerJin2019, DeepWordBugGao2018
from textattack.attack_results import SuccessfulAttackResult

# Download required NLTK data for TextAttack
import nltk
print("Downloading required NLTK data...")
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

print("✓ All libraries imported and NLTK data downloaded successfully")


  from .autonotebook import tqdm as notebook_tqdm


Downloading required NLTK data...
✓ All libraries imported and NLTK data downloaded successfully


In [None]:
# Load sentiment analysis model
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Label mapping for this model (NEGATIVE=0, POSITIVE=1)
labels = {0: "NEGATIVE", 1: "POSITIVE"}

print("✓ Model loaded successfully")
print("Note: Using DistilBERT model which is typically more vulnerable to adversarial attacks")


Loading model: distilbert-base-uncased-finetuned-sst-2-english
✓ Model loaded successfully
Note: Using DistilBERT model which is typically more vulnerable to adversarial attacks


In [3]:
# Define test samples (larger set with clearer sentiment examples)
test_texts = [
    # Positive examples
    "This movie is really good and enjoyable!",
    "I absolutely love this amazing product!",
    "The food here is excellent and delicious.",
    "This book is fantastic and well written.",
    "The service was wonderful and friendly.",
    "I'm so happy with this purchase!",
    "This restaurant serves great food.",
    "The movie was entertaining and fun.",
    "This product works perfectly fine.",
    "I really enjoyed the show.",
    
    # Negative examples  
    "This book is terrible and boring.",
    "The movie was awful and disappointing.",
    "The food tastes bad and cold.",
    "This product is broken and useless.",
    "The service was horrible and slow.",
    "I hate this stupid movie.",
    "This restaurant has terrible food.",
    "The book was boring and dull.",
    "This product is completely worthless.",
    "I'm disappointed with this purchase."
]

print(f"Prepared {len(test_texts)} test samples:")
for i, text in enumerate(test_texts, 1):
    print(f"{i}. {text}")


Prepared 20 test samples:
1. This movie is really good and enjoyable!
2. I absolutely love this amazing product!
3. The food here is excellent and delicious.
4. This book is fantastic and well written.
5. The service was wonderful and friendly.
6. I'm so happy with this purchase!
7. This restaurant serves great food.
8. The movie was entertaining and fun.
9. This product works perfectly fine.
10. I really enjoyed the show.
11. This book is terrible and boring.
12. The movie was awful and disappointing.
13. The food tastes bad and cold.
14. This product is broken and useless.
15. The service was horrible and slow.
16. I hate this stupid movie.
17. This restaurant has terrible food.
18. The book was boring and dull.
19. This product is completely worthless.
20. I'm disappointed with this purchase.


In [4]:
# Setup attacks with modified constraints for better success rate
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics.sentence_encoders import UniversalSentenceEncoder

print("Setting up attacks with relaxed constraints...")

# Create TextFooler attack with relaxed constraints
textfooler_attack = TextFoolerJin2019.build(model_wrapper)

# Relax some constraints to increase success rate
# Remove the sentence encoder constraint which is often too strict
constraints_to_remove = []
for constraint in textfooler_attack.constraints:
    if isinstance(constraint, UniversalSentenceEncoder):
        constraints_to_remove.append(constraint)

for constraint in constraints_to_remove:
    textfooler_attack.constraints.remove(constraint)

print("✓ TextFooler configured with relaxed constraints for better success rate")
print(f"Remaining constraints: {len(textfooler_attack.constraints)}")


Setting up attacks with relaxed constraints...


textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


✓ TextFooler configured with relaxed constraints for better success rate
Remaining constraints: 2


In [7]:
# Setup attacks with modified constraints for better success rate
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics.sentence_encoders import UniversalSentenceEncoder
from textattack.constraints.semantics import WordEmbeddingDistance

print("Setting up attacks with relaxed constraints...")

# Create TextFooler attack with relaxed constraints
textfooler_attack = TextFoolerJin2019.build(model_wrapper)

print(f"Original constraints: {len(textfooler_attack.constraints)}")

# Relax constraints to increase success rate
# Remove semantic similarity constraints that are often too strict
constraints_to_remove = []
for constraint in textfooler_attack.constraints:
    # Remove sentence-level semantic constraints
    if isinstance(constraint, UniversalSentenceEncoder):
        constraints_to_remove.append(constraint)
        print("Found UniversalSentenceEncoder constraint - will remove")
    # Relax word embedding distance constraints
    elif isinstance(constraint, WordEmbeddingDistance):
        # Keep it but reduce the threshold
        constraint.min_cos_sim = 0.5  # Default is often 0.8-0.9, reduce to 0.5
        print(f"Relaxed WordEmbeddingDistance threshold to {constraint.min_cos_sim}")

for constraint in constraints_to_remove:
    textfooler_attack.constraints.remove(constraint)

print(f"✓ TextFooler configured with relaxed constraints")
print(f"Remaining constraints: {len(textfooler_attack.constraints)}")
print("✓ Removed overly strict semantic similarity constraints")


textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Setting up attacks with relaxed constraints...
Original constraints: 3
Relaxed WordEmbeddingDistance threshold to 0.5
Found UniversalSentenceEncoder constraint - will remove
✓ TextFooler configured with relaxed constraints
Remaining constraints: 2
✓ Removed overly strict semantic similarity constraints


In [8]:
# Helper function to get predictions
def get_prediction(text):
    """Get model prediction for a text"""
    predictions = model_wrapper([text])
    predicted_class = predictions[0].argmax().item()
    confidence = torch.softmax(predictions[0], dim=0).max().item()
    
    return {
        'text': text,
        'prediction': labels[predicted_class],
        'confidence': confidence
    }

# Test original predictions
print("Original predictions:")
for text in test_texts:
    result = get_prediction(text)
    print(f"'{result['text'][:50]}...' → {result['prediction']} ({result['confidence']:.3f})")


Original predictions:
'This movie is really good and enjoyable!...' → POSITIVE (1.000)
'I absolutely love this amazing product!...' → POSITIVE (1.000)
'The food here is excellent and delicious....' → POSITIVE (1.000)
'This book is fantastic and well written....' → POSITIVE (1.000)
'The service was wonderful and friendly....' → POSITIVE (1.000)
'I'm so happy with this purchase!...' → POSITIVE (1.000)
'This restaurant serves great food....' → POSITIVE (1.000)
'The movie was entertaining and fun....' → POSITIVE (1.000)
'This product works perfectly fine....' → POSITIVE (1.000)
'I really enjoyed the show....' → POSITIVE (1.000)
'This book is terrible and boring....' → NEGATIVE (1.000)
'The movie was awful and disappointing....' → NEGATIVE (1.000)
'The food tastes bad and cold....' → NEGATIVE (1.000)
'This product is broken and useless....' → NEGATIVE (1.000)
'The service was horrible and slow....' → NEGATIVE (1.000)
'I hate this stupid movie....' → NEGATIVE (1.000)
'This restaurant has ter

In [9]:
# TextFooler Attack
print("Running TextFooler attacks...")
# textfooler_attack already configured in previous cell with relaxed constraints

textfooler_results = []

for i, text in enumerate(test_texts):
    print(f"\n--- Attack {i+1}/{len(test_texts)} ---")
    original = get_prediction(text)
    print(f"Original: '{text}' → {original['prediction']} ({original['confidence']:.3f})")
    
    try:
        attack_result = textfooler_attack.attack(text, 0)
        
        if isinstance(attack_result, SuccessfulAttackResult):
            adversarial_text = attack_result.perturbed_text()
            adversarial = get_prediction(adversarial_text)
            
            # Check if prediction flipped
            prediction_flipped = original['prediction'] != adversarial['prediction']
            
            print(f"Adversarial: '{adversarial_text}' → {adversarial['prediction']} ({adversarial['confidence']:.3f})")
            
            if prediction_flipped:
                print("✓ SUCCESS - Prediction flipped!")
                
            textfooler_results.append({
                'original': text,
                'adversarial': adversarial_text,
                'original_pred': original['prediction'],
                'adversarial_pred': adversarial['prediction'],
                'success': prediction_flipped
            })
        else:
            print("✗ FAILED")
            textfooler_results.append({
                'original': text,
                'success': False
            })
    except Exception as e:
        print(f"✗ ERROR: {e}")
        textfooler_results.append({
            'original': text,
            'success': False
        })

successful_attacks = sum(1 for r in textfooler_results if r.get('success', False))
print(f"\nTextFooler Results: {successful_attacks}/{len(test_texts)} successful attacks")


Running TextFooler attacks...

--- Attack 1/20 ---
Original: 'This movie is really good and enjoyable!' → POSITIVE (1.000)
✗ FAILED

--- Attack 2/20 ---
Original: 'I absolutely love this amazing product!' → POSITIVE (1.000)
✗ FAILED

--- Attack 3/20 ---
Original: 'The food here is excellent and delicious.' → POSITIVE (1.000)
✗ FAILED

--- Attack 4/20 ---
Original: 'This book is fantastic and well written.' → POSITIVE (1.000)
✗ FAILED

--- Attack 5/20 ---
Original: 'The service was wonderful and friendly.' → POSITIVE (1.000)
✗ FAILED

--- Attack 6/20 ---
Original: 'I'm so happy with this purchase!' → POSITIVE (1.000)
✗ FAILED

--- Attack 7/20 ---
Original: 'This restaurant serves great food.' → POSITIVE (1.000)
✗ FAILED

--- Attack 8/20 ---
Original: 'The movie was entertaining and fun.' → POSITIVE (1.000)
✗ FAILED

--- Attack 9/20 ---
Original: 'This product works perfectly fine.' → POSITIVE (1.000)
✗ FAILED

--- Attack 10/20 ---
Original: 'I really enjoyed the show.' → POSITIVE (1.000

In [10]:
# DeepWordBug Attack
print("Running DeepWordBug attacks...")
deepwordbug_attack = DeepWordBugGao2018.build(model_wrapper)

deepwordbug_results = []

for i, text in enumerate(test_texts):
    print(f"\n--- Attack {i+1}/{len(test_texts)} ---")
    original = get_prediction(text)
    print(f"Original: '{text}' → {original['prediction']} ({original['confidence']:.3f})")
    
    try:
        attack_result = deepwordbug_attack.attack(text, 0)
        
        if isinstance(attack_result, SuccessfulAttackResult):
            adversarial_text = attack_result.perturbed_text()
            adversarial = get_prediction(adversarial_text)
            
            # Check if prediction flipped
            prediction_flipped = original['prediction'] != adversarial['prediction']
            
            print(f"Adversarial: '{adversarial_text}' → {adversarial['prediction']} ({adversarial['confidence']:.3f})")
            
            if prediction_flipped:
                print("✓ SUCCESS - Prediction flipped!")
                
            deepwordbug_results.append({
                'original': text,
                'adversarial': adversarial_text,
                'original_pred': original['prediction'],
                'adversarial_pred': adversarial['prediction'],
                'success': prediction_flipped
            })
        else:
            print("✗ FAILED")
            deepwordbug_results.append({
                'original': text,
                'success': False
            })
    except Exception as e:
        print(f"✗ ERROR: {e}")
        deepwordbug_results.append({
            'original': text,
            'success': False
        })

successful_attacks = sum(1 for r in deepwordbug_results if r.get('success', False))
print(f"\nDeepWordBug Results: {successful_attacks}/{len(test_texts)} successful attacks")


textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Running DeepWordBug attacks...

--- Attack 1/20 ---
Original: 'This movie is really good and enjoyable!' → POSITIVE (1.000)
✗ FAILED

--- Attack 2/20 ---
Original: 'I absolutely love this amazing product!' → POSITIVE (1.000)
✗ FAILED

--- Attack 3/20 ---
Original: 'The food here is excellent and delicious.' → POSITIVE (1.000)
✗ FAILED

--- Attack 4/20 ---
Original: 'This book is fantastic and well written.' → POSITIVE (1.000)
✗ FAILED

--- Attack 5/20 ---
Original: 'The service was wonderful and friendly.' → POSITIVE (1.000)
✗ FAILED

--- Attack 6/20 ---
Original: 'I'm so happy with this purchase!' → POSITIVE (1.000)
✗ FAILED

--- Attack 7/20 ---
Original: 'This restaurant serves great food.' → POSITIVE (1.000)
✗ FAILED

--- Attack 8/20 ---
Original: 'The movie was entertaining and fun.' → POSITIVE (1.000)
✗ FAILED

--- Attack 9/20 ---
Original: 'This product works perfectly fine.' → POSITIVE (1.000)
✗ FAILED

--- Attack 10/20 ---
Original: 'I really enjoyed the show.' → POSITIVE (1.00

In [2]:
# # Final Results
# print("=" * 50)
# print("ATTACK RESULTS")
# print("=" * 50)

# # TextFooler successful attacks
# tf_successful = [r for r in textfooler_results if r.get('success', False)]
# print(f"\nTextFooler successful attacks: {len(tf_successful)}/{len(test_texts)}")

# for result in tf_successful:
#     print(f"\nOriginal: '{result['original']}'")
#     print(f"Adversarial: '{result['adversarial']}'")
#     print(f"Prediction: {result['original_pred']} → {result['adversarial_pred']}")

# # DeepWordBug successful attacks  
# dwb_successful = [r for r in deepwordbug_results if r.get('success', False)]
# print(f"\n{'-'*50}")
# print(f"DeepWordBug successful attacks: {len(dwb_successful)}/{len(test_texts)}")

# for result in dwb_successful:
#     print(f"\nOriginal: '{result['original']}'")
#     print(f"Adversarial: '{result['adversarial']}'")
#     print(f"Prediction: {result['original_pred']} → {result['adversarial_pred']}")

# print(f"\n{'='*50}")
# print(f"Total successful attacks: {len(tf_successful) + len(dwb_successful)}")
# print("=" * 50)
