In [1]:
!pip install -q textattack==0.3.10

In [2]:
import nltk
# Fetch the English averaged-perceptron POS tagger data up front.
# NOTE(review): presumably required by the attack's PartOfSpeech constraint,
# whose printed configuration reports `tagger_type: nltk` — confirm.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.nn.functional import softmax

# Checkpoint identifier and the exact revision hash that the loading cell
# passes to `from_pretrained`, so every run resolves the same weights.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
revision = "714eb0fa89d2f80546fda750413ed43d93601a13"

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.nn.functional import softmax
import torch

# Fetch the tokenizer and the classifier weights for the pinned checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(model_name, revision=revision)
model = DistilBertForSequenceClassification.from_pretrained(model_name, revision=revision)

# Place the weights on the selected device, then switch to evaluation mode
# so the network runs in inference configuration
model = model.to(device)
model.eval()

def predict_sentiment(text):
    """Score `text` with the loaded DistilBERT sentiment classifier.

    Args:
        text: Input handed straight to the tokenizer. Only the first row of
            the resulting batch is reported.

    Returns:
        dict with three entries:
            "label": "POSITIVE" when the positive probability is strictly
                greater than the negative one, otherwise "NEGATIVE".
            "confidence": probability of the chosen label.
            "probabilities": {"negative": float, "positive": float}.
    """
    # Encode the text as PyTorch tensors and move them next to the model
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(**on_device).logits
        first_row = softmax(logits, dim=1)[0]  # class probabilities of the first input

    # Index 0 = negative, index 1 = positive
    negative_prob = first_row[0].item()
    positive_prob = first_row[1].item()

    # Ties resolve to NEGATIVE because the comparison is strict
    if positive_prob > negative_prob:
        label, confidence = "POSITIVE", positive_prob
    else:
        label, confidence = "NEGATIVE", negative_prob

    return {
        "label": label,
        "confidence": confidence,
        "probabilities": {
            "negative": negative_prob,
            "positive": positive_prob
        }
    }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
import sys

# Try to detect if running in Google Colab
def is_colab():
    """Return True when the `google.colab` package can be imported, else False."""
    try:
        import google.colab  # imported only to probe for availability
    except ImportError:
        return False
    else:
        return True

if not is_colab():
    print("Interactive UI is only available in Google Colab.")
else:
    import ipywidgets as widgets
    from IPython.display import display

    # Free-text input, pre-filled with a sample sentence
    text_input = widgets.Textarea(
        value="This movie is fantastic!",
        placeholder='Enter a sentence...',
        description='Text:',
        layout=widgets.Layout(width='100%', height='80px')
    )

    def on_submit(b):
        """Button callback: classify the current textarea content and print the scores."""
        text = text_input.value
        result = predict_sentiment(text)
        print(f"\nText: {text}")
        print(f"Sentiment: {result['label']}")
        print(f"Confidence: {result['confidence']:.4f}")
        print(f"Negative probability: {result['probabilities']['negative']:.4f}")
        print(f"Positive probability: {result['probabilities']['positive']:.4f}")

    # Each click appends a fresh prediction below the widgets
    button = widgets.Button(description="Predict Sentiment")
    button.on_click(on_submit)

    display(text_input, button)


Textarea(value='This movie is fantastic!', description='Text:', layout=Layout(height='80px', width='100%'), pl…

Button(description='Predict Sentiment', style=ButtonStyle())


Text: This movie is fantastic!
Sentiment: POSITIVE
Confidence: 0.9999
Negative probability: 0.0001
Positive probability: 0.9999

Text: This movie is fantastic!
Sentiment: POSITIVE
Confidence: 0.9999
Negative probability: 0.0001
Positive probability: 0.9999

Text: This movie is fantastic!
Sentiment: POSITIVE
Confidence: 0.9999
Negative probability: 0.0001
Positive probability: 0.9999


In [6]:
# Import the model wrapper interface and attack tools provided by TextAttack
from textattack.models.wrappers import ModelWrapper  # All custom models must inherit from this class
from textattack.attack_recipes import TextFoolerJin2019  # Import the default attack recipe (TextFooler is a classic adversarial attack algorithm for text)
from textattack import Attacker, AttackArgs  # For configuring and executing the attack process
from textattack.datasets import Dataset  # Used to wrap test data in TextAttack-compatible format
from textattack.attack_results import SuccessfulAttackResult, FailedAttackResult  # To check whether the attack succeeded

# Define a custom model wrapper
# TextAttack interacts with models through a unified interface, so we wrap our sentiment classifier accordingly
class SentimentWrapper(ModelWrapper):
    """TextAttack adapter around the notebook's DistilBERT sentiment classifier.

    The wrapper keeps its own references to the model and tokenizer and runs
    every batch of candidate texts through the model in a single padded
    forward pass, instead of one forward pass per text.
    """

    def __init__(self):
        self.model = model          # Previously loaded DistilBERT classifier
        self.tokenizer = tokenizer  # Matching tokenizer loaded alongside it

    def __call__(self, text_inputs):
        """Return class probabilities for a batch of texts.

        Args:
            text_inputs: iterable of strings to classify.

        Returns:
            CPU tensor with one row per input, each row being
            [neg_prob, pos_prob]. An empty input yields a (0, num_labels)
            tensor so the result is always two-dimensional.
        """
        texts = list(text_inputs)
        if not texts:
            return torch.empty((0, self.model.config.num_labels))

        # Run on whatever device the wrapped model's weights live on
        target_device = next(self.model.parameters()).device

        # Tokenize the whole batch at once; padding aligns the sequence lengths
        encoded = self.tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        # Inference only: skip gradient bookkeeping
        with torch.no_grad():
            logits = self.model(**encoded).logits

        # Return probabilities on the CPU, as the per-text version did
        return torch.softmax(logits, dim=1).cpu()

# Create a test dataset, where each item is (sentence, label)
# Label 1 = positive, 0 = negative
examples = [
    ("This movie is great and amazing!", 1),
    ("This was a terrible waste of time.", 0),
    ("I really enjoyed watching this film.", 1),
    ("The worst movie I've ever seen.", 0)
]
dataset = Dataset(examples)

# Initialize the attack module
model_wrapper = SentimentWrapper()  # Wrap the model in a TextAttack-compatible format
attack = TextFoolerJin2019.build(model_wrapper)  # Use the TextFooler attack strategy

# Configure attack parameters
attack_args = AttackArgs(
    num_examples=len(examples),  # Attack every sample; stays in sync if the list is edited
    disable_stdout=True          # Suppress detailed output (set to False to see full logs)
)

# Create the attacker and start the attack process
attacker = Attacker(attack, dataset, attack_args)
results = attacker.attack_dataset()  # Run the attack and collect results

# Print the results for each example
for i, result in enumerate(results, 1):
    print(f"\nExample {i}:")
    print(f"Original:  {result.original_text()}")  # Show original input text
    if isinstance(result, SuccessfulAttackResult):
        print(f"Attacked:  {result.perturbed_text()}")  # Show the adversarial (perturbed) text
    elif isinstance(result, FailedAttackResult):
        print("Attack failed.")  # The attack ran but the model was not fooled
    else:
        # Any other result type (e.g. a skipped example) is not a failed
        # attack, so report it under its own name rather than as a failure.
        print(f"Attack not performed ({type(result).__name__}).")


textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 3 / 1 / 0 / 4: 100%|██████████| 4/4 [00:17<00:00,  4.31s/it]


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 3      |
| Number of failed attacks:     | 1      |
| Number of skipped attacks:    | 0      |
| Original accuracy:            | 100.0% |
| Accuracy under attack:        | 25.0%  |
| Attack success rate:          | 75.0%  |
| Average perturbed word %:     | 25.4%  |
| Average num. words per input: | 6.25   |
| Avg num queries:              | 90.25  |
+-------------------------------+--------+

Example 1:
Original:  This movie is great and amazing!
Attack failed.

Example 2:
Original:  This was a terrible waste of time.
Attacked:  This was a towering jingles of date.

Example 3:
Original:  I really enjoyed watching this film.
Attacked:  I really rained watching this film.

Example 4:
Original:  The worst movie I've ever seen.
Attacked:  The finest movie I've ever seen.



