In [9]:
!pip install -q textattack==0.3.10



In [5]:
import nltk
# Fetch the NLTK English averaged-perceptron POS tagger data. The attack recipe built
# later in this notebook reports a PartOfSpeech constraint with tagger_type "nltk",
# which presumably needs this resource — TODO confirm against the TextAttack version in use.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.nn.functional import softmax  # converts raw model logits into probabilities

# Hugging Face checkpoint to load: DistilBERT fine-tuned for sentiment classification.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
# Pin an exact revision of the checkpoint for reproducibility.
revision = "714eb0fa89d2f80546fda750413ed43d93601a13"

# Prefer the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Tokenizer matching the checkpoint (turns text into tensors the model accepts).
tokenizer = DistilBertTokenizer.from_pretrained(model_name, revision=revision)

# Load the fine-tuned classifier, then move it onto the selected device.
model = DistilBertForSequenceClassification.from_pretrained(model_name, revision=revision)
model = model.to(device)

# Evaluation mode disables training-only mechanisms such as dropout.
model.eval()

# Sentiment prediction helper: takes text, returns the predicted label and probabilities.
def predict_sentiment(text):
    """Classify ``text`` as POSITIVE or NEGATIVE with the loaded DistilBERT model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Input passed straight to the tokenizer. Only row 0 of the model
            output is read, so the result describes the first (or only) input.

    Returns:
        dict with three keys:
            ``"label"``: ``"POSITIVE"`` or ``"NEGATIVE"``.
            ``"confidence"``: probability of the chosen label.
            ``"probabilities"``: ``{"negative": float, "positive": float}``.
    """
    # Tokenize into PyTorch tensors with truncation and padding enabled.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Move every tensor onto the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = softmax(logits, dim=1)  # logits -> probability distribution per row

    # Column 0 is the negative class, column 1 the positive class.
    first_row = probs[0]
    negative_prob = first_row[0].item()
    positive_prob = first_row[1].item()

    # POSITIVE needs a strictly larger probability; a tie resolves to NEGATIVE.
    if positive_prob > negative_prob:
        label, confidence = "POSITIVE", positive_prob
    else:
        label, confidence = "NEGATIVE", negative_prob

    return {
        "label": label,
        "confidence": confidence,
        "probabilities": {
            "negative": negative_prob,
            "positive": positive_prob,
        },
    }


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [3]:
from IPython.display import display
import ipywidgets as widgets

def on_submit(b):
    """Click handler: classify the textarea's current text and print a short report.

    Args:
        b: Argument supplied by the button's click callback (unused).
    """
    text = text_input.value
    result = predict_sentiment(text)
    probs = result['probabilities']

    # Assemble the report first, then emit it line by line.
    report = (
        f"\nText: {text}",
        f"Sentiment: {result['label']}",
        f"Confidence: {result['confidence']:.4f}",
        f"Negative probability: {probs['negative']:.4f}",
        f"Positive probability: {probs['positive']:.4f}",
    )
    for line in report:
        print(line)

# Multi-line input box, pre-filled with a sample sentence.
text_input = widgets.Textarea(
    description="Text:",
    placeholder="Enter a sentence...",
    value="This movie is fantastic!",
    layout=widgets.Layout(height="80px", width="100%"),
)

# Clicking the button runs on_submit against the current textarea contents.
button = widgets.Button(description="Predict Sentiment")
button.on_click(on_submit)

# Render both widgets in the cell output.
display(text_input, button)


Textarea(value='This movie is fantastic!', description='Text:', layout=Layout(height='80px', width='100%'), pl…

Button(description='Predict Sentiment', style=ButtonStyle())


Text: This movie is fantastic!
Sentiment: POSITIVE
Confidence: 0.9999
Negative probability: 0.0001
Positive probability: 0.9999


In [7]:
# Import TextAttack's model-wrapper interface and attack tooling
from textattack.models.wrappers import ModelWrapper  # base class that SentimentWrapper subclasses
from textattack.attack_recipes import TextFoolerJin2019  # prebuilt attack recipe (TextFooler, a classic text adversarial attack)
from textattack import Attacker, AttackArgs  # used to configure and run the attack
from textattack.datasets import Dataset  # wraps the test samples in TextAttack's dataset format
from textattack.attack_results import SuccessfulAttackResult, FailedAttackResult  # result types used to check whether an attack succeeded

# Custom model wrapper.
# TextAttack calls models through one common interface, so the sentiment classifier
# is wrapped in a class that conforms to it.
class SentimentWrapper(ModelWrapper):
    """Adapter exposing ``predict_sentiment`` through TextAttack's ``ModelWrapper`` interface."""

    def __init__(self):
        # Keep a reference to the DistilBERT model loaded earlier in the notebook.
        self.model = model

    def __call__(self, text_inputs):
        """Score each text individually and return one probability row per text.

        Args:
            text_inputs: Iterable of input texts.

        Returns:
            ``torch.Tensor`` built from rows of ``[negative_prob, positive_prob]``.
        """
        # One predict_sentiment() call per text; only the probabilities are kept.
        per_text = (predict_sentiment(text)['probabilities'] for text in text_inputs)
        rows = [[probs['negative'], probs['positive']] for probs in per_text]
        return torch.tensor(rows)

# Test samples as (sentence, true label) pairs; label 1 = positive, 0 = negative.
dataset = Dataset(
    [
        ("This movie is great and amazing!", 1),
        ("This was a terrible waste of time.", 0),
        ("I really enjoyed watching this film.", 1),
        ("The worst movie I've ever seen.", 0),
    ]
)

# Wrap the classifier in the TextAttack-compatible interface and build the TextFooler attack.
model_wrapper = SentimentWrapper()
attack = TextFoolerJin2019.build(model_wrapper)

# Attack settings: attack four examples and suppress the detailed output
# (set disable_stdout=False to see the details).
attack_args = AttackArgs(disable_stdout=True, num_examples=4)

# Create the attacker and run it over the dataset; results holds one entry per example.
attacker = Attacker(attack, dataset, attack_args)
results = attacker.attack_dataset()

# Report the outcome of every attacked example.
for i, result in enumerate(results, 1):
    print(f"\nExample {i}:")
    print(f"Original:  {result.original_text()}")
    if isinstance(result, SuccessfulAttackResult):
        # The attack succeeded: show the perturbed sentence that misled the model.
        print(f"Attacked:  {result.perturbed_text()}")
    elif isinstance(result, FailedAttackResult):
        # The attack ran but did not succeed.
        print("Attack failed.")
    else:
        # Any other result type (e.g. an example TextAttack skipped) is neither a
        # success nor a failed attack, so report it by name instead of as a failure.
        print(f"Attack not attempted ({type(result).__name__}).")


textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       


  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:07<00:21,  7.30s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 1 / 0 / 1:  25%|██▌       | 1/4 [00:07<00:21,  7.30s/it][A
[Succeeded / Failed / Skipped / Total] 0 / 1 / 0 / 1:  50%|█████     | 2/4 [00:08<00:08,  4.00s/it][A
[Succeeded / Failed / Skipped / Total] 1 / 1 / 0 / 2:  50%|█████     | 2/4 [00:08<00:08,  4.00s/it][A
[Succeeded / Failed / Skipped / Total] 1 / 1 / 0 / 2:  75%|███████▌  | 3/4 [00:08<00:02,  2.73s/it][A
[Succeeded / Failed / Skipped / Total] 2 / 1 / 0 / 3:  75%|███████▌  | 3/4 [00:08<00:02,  2.73s/it][A
[Succeeded / Failed / Skipped / Total] 2 / 1 / 0 / 3: 100%|██████████| 4/4 [00:08<00:00,  2.11s/it][A
[Succeeded / Failed / Skipped / Total] 3 / 1 / 0 / 4: 100%|██████████| 4/4 [00:08<00:00,  2.11s/it]


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 3      |
| Number of failed attacks:     | 1      |
| Number of skipped attacks:    | 0      |
| Original accuracy:            | 100.0% |
| Accuracy under attack:        | 25.0%  |
| Attack success rate:          | 75.0%  |
| Average perturbed word %:     | 25.4%  |
| Average num. words per input: | 6.25   |
| Avg num queries:              | 90.25  |
+-------------------------------+--------+

Example 1:
Original:  This movie is great and amazing!
Attack failed.

Example 2:
Original:  This was a terrible waste of time.
Attacked:  This was a towering jingles of date.

Example 3:
Original:  I really enjoyed watching this film.
Attacked:  I really rained watching this film.

Example 4:
Original:  The worst movie I've ever seen.
Attacked:  The finest movie I've ever seen.



