In [1]:
import torch
from torch import nn
import os
import random
from torch.utils import data
from tqdm import tqdm
import numpy as np
from copy import deepcopy
from transformers import AlbertTokenizer, AlbertModel
import warnings
import csv
import re

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE(review): a `global` statement at module level has no effect; the name is already module-scoped.
global extracted_grads

# Filled by extract_grad_hook during backward passes and read by process_gradient.
extracted_grads = []
position = 1  # concatenation position: column index at which trigger tokens are spliced into input_ids
# Index 1 places the trigger right after the first token (presumably [CLS] — confirm for this tokenizer)
# Random Concatenation Mode
# position = random.randint(1,500)

# Load the tokenizer and the pretrained encoder from a local checkpoint directory.
# NOTE(review): "/root/albert" is an absolute, machine-specific path.
tokenize = AlbertTokenizer.from_pretrained("/root/albert")
Model = AlbertModel.from_pretrained("/root/albert")

# Load model related information

# Print the number of Total Parameters
# total = [param.nelement() for param in Model.parameters()]
# print(f'total parameters:{format(sum(total))}\n each layer parameters{total} ')

In [2]:
'''
SNLI Data
'''


### Load data

def extract_text(s):
    """Remove parentheses from ``s`` and tidy up its whitespace.

    Every ``(`` and ``)`` is deleted, each run of two or more whitespace
    characters is replaced by one space, and leading/trailing whitespace is
    trimmed from the result. A lone whitespace character (e.g. a single tab)
    inside the text is left as it is.
    """
    # Drop the brackets first so the gaps they leave behind are collapsed below
    without_parens = re.sub('\\)', '', re.sub('\\(', '', s))
    # Squash runs of two or more whitespace characters into a single space
    collapsed = re.sub('\\s{2,}', ' ', without_parens)
    return collapsed.strip()


def _read_snli_rows(data_dir, is_train, label_set):
    """Shared reader behind the ``read_snli_binary_*`` functions.

    Opens ``snli_1.0_train.txt`` (``is_train`` true) or ``snli_1.0_test.txt``
    inside ``data_dir``, skips the header line, and keeps only the rows whose
    first tab-separated column is a key of ``label_set``.

    Args:
        data_dir: directory containing the SNLI ``.txt`` files.
        is_train: selects the train file when true, the test file otherwise.
        label_set: mapping from the label string in column 0 to the integer
            class id to emit; rows with any other label are dropped.

    Returns:
        ``(texts, labels)`` — two equally long tuples. Each text is the cleaned
        column 1 and cleaned column 2 joined by a single space.

    Raises:
        ValueError: if no row in the file carries one of the requested labels.
    """
    file_name = os.path.join(data_dir, 'snli_1.0_train.txt' if is_train else 'snli_1.0_test.txt')
    texts, labels = [], []
    # NOTE(review): assumes the SNLI text files are UTF-8 encoded.
    with open(file_name, 'r', encoding='utf-8') as f:
        next(f, None)  # the first line is the column header
        for line in f:
            row = line.split('\t')
            if row[0] in label_set:
                texts.append(extract_text(row[1]) + ' ' + extract_text(row[2]))
                labels.append(label_set[row[0]])
    if not texts:
        # Previously surfaced as an opaque "not enough values to unpack" ValueError
        raise ValueError(f'no rows labelled {sorted(label_set)} found in {file_name}')
    return tuple(texts), tuple(labels)


def read_snli_binary_data(data_dir, is_train):
    """Read the binary SNLI dataset: entailment -> 0, contradiction -> 1.

    Rows with any other label are discarded. Returns ``(texts, labels)`` tuples.
    """
    return _read_snli_rows(data_dir, is_train, {'entailment': 0, 'contradiction': 1})


def read_snli_binary_test_data_ent(data_dir, is_train):
    """Read only the entailment rows (label 0) of the SNLI dataset.

    Returns ``(texts, labels)`` tuples; every label is 0.
    """
    return _read_snli_rows(data_dir, is_train, {'entailment': 0})


def read_snli_binary_test_data_con(data_dir, is_train):
    """Read only the contradiction rows (label 1) of the SNLI dataset.

    Returns ``(texts, labels)`` tuples; every label is 1.
    """
    return _read_snli_rows(data_dir, is_train, {'contradiction': 1})


def load_snli_array(data_arrays, batch_size, is_train=True):
    """Wrap aligned tensors in a PyTorch ``DataLoader``.

    Args:
        data_arrays: iterable of tensors that is unpacked into a
            ``TensorDataset``.
        batch_size: number of samples per batch.
        is_train: when true the loader shuffles the samples.

    Returns:
        The ``DataLoader`` over the resulting dataset.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(tensor_dataset, batch_size=batch_size, shuffle=is_train)
    return loader


def _build_snli_iters(read_test_data, train_batch_size, test_batch_iter, num_steps, data_dir):
    """Tokenise the SNLI train split plus one test subset and wrap both in loaders.

    Args:
        read_test_data: reader called as ``read_test_data(data_dir, is_train=False)``
            that returns ``(texts, labels)`` for the test subset.
        train_batch_size: batch size of the (shuffled) training loader.
        test_batch_iter: batch size of the (unshuffled) test loader.
        num_steps: ``max_length`` handed to the tokenizer; longer inputs are truncated.
        data_dir: directory holding the SNLI ``.txt`` files.

    Returns:
        ``(train_iter, test_iter)``; each batch is
        ``(input_ids, token_type_ids, labels)``.
    """
    train_data = read_snli_binary_data(data_dir, is_train=True)
    test_data = read_test_data(data_dir, is_train=False)

    def encode(texts):
        # Same tokenizer settings for both splits
        return tokenize(texts, return_tensors="pt", padding=True, truncation=True, max_length=num_steps)

    train_encoding = encode(train_data[0])
    test_encoding = encode(test_data[0])
    train_iter = load_snli_array(
        (train_encoding['input_ids'], train_encoding['token_type_ids'], torch.tensor(train_data[1])),
        train_batch_size)
    test_iter = load_snli_array(
        (test_encoding['input_ids'], test_encoding['token_type_ids'], torch.tensor(test_data[1])),
        test_batch_iter,
        is_train=False)
    return train_iter, test_iter


def load_snli_data_ent(train_batch_size, test_batch_iter, num_steps=500, data_dir='snli_1.0'):
    """Build the training loader and a test loader restricted to entailment rows.

    ``test_batch_iter`` is the batch size of the test loader (the name is kept
    for compatibility). Returns ``(train_iter, test_iter)``.
    """
    return _build_snli_iters(read_snli_binary_test_data_ent, train_batch_size,
                             test_batch_iter, num_steps, data_dir)


def load_snli_data_con(train_batch_size, test_batch_iter, num_steps=500, data_dir='snli_1.0'):
    """Build the training loader and a test loader restricted to contradiction rows.

    ``test_batch_iter`` is the batch size of the test loader (the name is kept
    for compatibility). Returns ``(train_iter, test_iter)``.
    """
    return _build_snli_iters(read_snli_binary_test_data_con, train_batch_size,
                             test_batch_iter, num_steps, data_dir)

In [3]:
# train_iter, test_iter = load_imdb_data(10)
# train_iter, test_iter = load_sst_data(10)
# Build the loaders: training batches of 20 samples, test batches of 3 samples.
# NOTE(review): each call reads and tokenises the full training split again, and
# the second assignment to train_iter replaces the loader built by the first.
train_iter, test_iter_ent = load_snli_data_ent(20, 3)
train_iter, test_iter_con = load_snli_data_con(20, 3)
# Data preprocessing and loading
print("reading data finished\n")
# Number of batches in each loader
print(len(train_iter))
print(len(test_iter_ent))
print(len(test_iter_con))

reading data finished

18331
1123
1079


In [4]:
# Define the model architecture
class AlbertSentimentClassifier(nn.Module):
    """Encoder followed by dropout and a two-way linear classification head.

    The class keeps its historical "sentiment" name; in this notebook it is
    trained on SNLI with the labels entailment (0) and contradiction (1).

    Args:
        albert_model: encoder module exposing ``config.hidden_size`` and
            returning the pooled sequence representation as its second output.
    """

    def __init__(self, albert_model):
        super(AlbertSentimentClassifier, self).__init__()
        self.albert = albert_model
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.albert.config.hidden_size, 2)  # two output classes

    def forward(self, input_ids, token_type_ids, attention_mask=None):
        """Return the ``(batch, 2)`` class logits.

        Args:
            input_ids: token ids handed to the encoder.
            token_type_ids: segment ids handed to the encoder.
            attention_mask: optional mask forwarded to the encoder so padded
                positions can be ignored. The default ``None`` reproduces the
                previous behaviour of not passing a mask at all.
        """
        outputs = self.albert(input_ids=input_ids, token_type_ids=token_type_ids,
                              attention_mask=attention_mask)
        pooled_output = outputs[1]  # second encoder output: pooled sequence representation
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Wrap the pretrained encoder with the two-way classification head
model = AlbertSentimentClassifier(Model)

# Objective and optimiser used for fine-tuning
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)

# Fine-tune on the GPU when one is visible, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

num_epochs = 3  # Example, you can adjust this
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, batch in enumerate(train_iter):
        # Each batch is (input_ids, token_type_ids, labels); move all three to the device
        input_ids, token_type_ids, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Report the running mean loss of the current epoch every 100 batches
        batches_done = batch_idx + 1
        if batches_done % 100 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batches_done}/{len(train_iter)}, Loss: {total_loss / batches_done:.4f}")

print("Training finished.")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1/3, Batch 100/18331, Loss: 0.7031
Epoch 1/3, Batch 200/18331, Loss: 0.7025
Epoch 1/3, Batch 300/18331, Loss: 0.7007
Epoch 1/3, Batch 400/18331, Loss: 0.7003
Epoch 1/3, Batch 500/18331, Loss: 0.6998
Epoch 1/3, Batch 600/18331, Loss: 0.6994
Epoch 1/3, Batch 700/18331, Loss: 0.6991
Epoch 1/3, Batch 800/18331, Loss: 0.6985
Epoch 1/3, Batch 900/18331, Loss: 0.6991
Epoch 1/3, Batch 1000/18331, Loss: 0.6985
Epoch 1/3, Batch 1100/18331, Loss: 0.6903
Epoch 1/3, Batch 1200/18331, Loss: 0.6746
Epoch 1/3, Batch 1300/18331, Loss: 0.6550
Epoch 1/3, Batch 1400/18331, Loss: 0.6357
Epoch 1/3, Batch 1500/18331, Loss: 0.6149
Epoch 1/3, Batch 1600/18331, Loss: 0.5975
Epoch 1/3, Batch 1700/18331, Loss: 0.5802
Epoch 1/3, Batch 1800/18331, Loss: 0.5648
Epoch 1/3, Batch 1900/18331, Loss: 0.5497
Epoch 1/3, Batch 2000/18331, Loss: 0.5369
Epoch 1/3, Batch 2100/18331, Loss: 0.5241
Epoch 1/3, Batch 2200/18331, Loss: 0.5135
Epoch 1/3, Batch 2300/18331, Loss: 0.5034
Epoch 1/3, Batch 2400/18331, Loss: 0.4935
E

In [5]:
# Persist the whole fine-tuned module object (not only its state_dict); the
# notebook later reloads this file with torch.load after re-declaring the class.
torch.save(model, 'albert_SNLI.bin')

In [6]:
def evaluate_model(model, test_iter):
    """Print the sample-level accuracy of ``model`` over ``test_iter``.

    The model is switched to eval mode and run without gradient tracking on
    the device its first parameter lives on. Each batch of ``test_iter`` must
    unpack into ``(input_ids, token_type_ids, labels)``. Nothing is returned;
    the accuracy is only printed.
    """
    model.eval()
    device = next(model.parameters()).device

    hits, seen = 0, 0
    with torch.no_grad():
        for batch in test_iter:
            input_ids, token_type_ids, labels = (tensor.to(device) for tensor in batch)

            logits = model(input_ids, token_type_ids)
            # Index of the largest logit per row is the predicted class
            predictions = torch.max(logits, 1).indices

            hits += (predictions == labels).sum().item()
            seen += labels.size(0)

    accuracy = hits / seen
    print(f"Accuracy on test set: {accuracy:.4f}")

# Evaluate the model separately on the entailment-only and contradiction-only test loaders
evaluate_model(model, test_iter_ent)
evaluate_model(model, test_iter_con)

Accuracy on test set: 0.9501
Accuracy on test set: 0.9549


In [6]:
def try_all_gpus():
    """Return a list with one ``torch.device`` per visible CUDA device.

    When no CUDA device is available the list holds the single CPU device.
    """
    gpu_count = torch.cuda.device_count()
    if gpu_count == 0:
        return [torch.device('cpu')]
    return [torch.device(f'cuda:{index}') for index in range(gpu_count)]

# Define the model architecture
class AlbertSentimentClassifier(nn.Module):
    """Encoder followed by dropout and a two-way linear classification head.

    Declared in this cell so that the class is available when the saved
    classifier object is unpickled by ``torch.load`` below. The "sentiment"
    name is historical; the saved model was trained on two SNLI classes.

    Args:
        albert_model: encoder module exposing ``config.hidden_size`` and
            returning the pooled sequence representation as its second output.
    """

    def __init__(self, albert_model):
        super(AlbertSentimentClassifier, self).__init__()
        self.albert = albert_model
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.albert.config.hidden_size, 2)  # two output classes

    def forward(self, input_ids, token_type_ids, attention_mask=None):
        """Return the ``(batch, 2)`` class logits.

        Args:
            input_ids: token ids handed to the encoder.
            token_type_ids: segment ids handed to the encoder.
            attention_mask: optional mask forwarded to the encoder so padded
                positions can be ignored. The default ``None`` reproduces the
                previous behaviour of not passing a mask at all.
        """
        outputs = self.albert(input_ids=input_ids, token_type_ids=token_type_ids,
                              attention_mask=attention_mask)
        pooled_output = outputs[1]  # second encoder output: pooled sequence representation
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# NOTE: `device` is rebound here from a single torch.device to a list of
# devices; the attack helpers below index it as device[0].
device = try_all_gpus()
# Reload the fine-tuned classifier saved earlier; `Model` now refers to the full
# classifier rather than the bare encoder loaded at the top of the notebook.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
Model = torch.load('albert_SNLI.bin')

In [7]:
# Loss that get_gradient back-propagates through the victim model
criterion = nn.CrossEntropyLoss()
### Trigger Token

def init_trigger_tokens(trigger, num_trigger_tokens):
    """Build the initial trigger as ``num_trigger_tokens`` copies of token id 0.

    ``trigger`` is accepted but not consulted: every slot is filled with id 0
    regardless of the string passed in.
    NOTE(review): id 0 is presumably ``<pad>`` in the tokenizer's vocabulary
    (the notebook passes trigger='<pad>') — confirm against the tokenizer.

    Returns:
        A 1-D tensor holding the initial trigger token ids.
    """
    initial_ids = [0 for _ in range(num_trigger_tokens)]
    return torch.tensor(initial_ids)


def evaluate(net, test_iter, trigger_token_tensor):
    """Measure accuracy after splicing the trigger tokens into every input.

    The trigger ids are inserted into ``input_ids`` at column ``position``
    (module-level), together with an equally long run of zeros in
    ``token_type_ids``. The model is moved to ``device[0]`` (module-level) and
    run in eval mode without gradient tracking.

    Returns:
        The sum of the per-batch accuracies divided by ``len(test_iter)``, i.e.
        an unweighted mean over batches (a smaller final batch counts as much
        as a full one).
    """
    target = device[0]
    net = net.to(target)
    net.eval()
    # One row of trigger ids / segment ids, repeated per sample below
    segment_row = torch.tensor([0] * len(trigger_token_tensor)).unsqueeze(0)
    trigger_row = deepcopy(trigger_token_tensor).unsqueeze(0)
    batch_accs = []
    with torch.no_grad():
        for input_ids, token_type_ids, labels in tqdm(test_iter):
            input_ids = torch.cat(
                (input_ids[:, :position],
                 trigger_row.repeat_interleave(input_ids.shape[0], dim=0),
                 input_ids[:, position:]),
                dim=1)
            token_type_ids = torch.cat(
                (token_type_ids[:, :position],
                 segment_row.repeat_interleave(token_type_ids.shape[0], dim=0),
                 token_type_ids[:, position:]),
                dim=1)
            input_ids = input_ids.to(target)
            token_type_ids = token_type_ids.to(target)
            labels = labels.to(target)
            logits = net(input_ids = input_ids, token_type_ids = token_type_ids)
            batch_accs.append((logits.argmax(dim=-1) == labels).float().mean())
    return sum(batch_accs) / len(test_iter)

def extract_grad_hook(net, grad_in, grad_out):
    """Backward hook that records the gradient flowing out of a module.

    Takes the first entry of ``grad_out``, averages it over dim 0 (presumably
    the batch dimension — confirm) and appends the result to the module-level
    ``extracted_grads`` list. ``net`` and ``grad_in`` are not used.
    """
    output_grad = grad_out[0]
    extracted_grads.append(output_grad.mean(dim=0))


def add_hook(net):
    """Register ``extract_grad_hook`` on the first ``nn.Embedding`` inside ``net``.

    Modules are visited in ``net.modules()`` order and only the first embedding
    layer found is hooked.

    Returns:
        The handle returned by ``register_backward_hook``; call ``.remove()``
        on it to detach the hook.

    Raises:
        ValueError: if ``net`` contains no ``nn.Embedding`` (previously this
            surfaced as an UnboundLocalError on ``hook``).
    """
    for module in net.modules():
        if isinstance(module, nn.Embedding):
            return module.register_backward_hook(extract_grad_hook)
    raise ValueError('add_hook: net contains no nn.Embedding module to hook')


def get_gradient(net, test_iter, trigger_token_tensor):  # Calculate the loss to get the gradient
    """Run one forward/backward pass per batch so registered hooks can capture gradients.

    The trigger ids are spliced into ``input_ids`` at column ``position``
    (module-level) with matching zeros in ``token_type_ids``; the
    module-level ``criterion`` is then back-propagated for every batch. No
    parameter is updated — the backward pass exists only for its hooks — so
    gradients are cleared with ``net.zero_grad()`` instead of building an
    optimizer that is never stepped.

    Side effects: moves ``net`` to ``device[0]`` and leaves it in train mode.
    Returns nothing.
    """
    target = device[0]
    net = net.to(target)
    # NOTE(review): train mode keeps any dropout layers active while the
    # gradients are collected — confirm this is intended.
    net.train()
    # One row of trigger ids / segment ids, repeated per sample below
    m = deepcopy(trigger_token_tensor).unsqueeze(0)
    n = torch.tensor([0] * len(trigger_token_tensor)).unsqueeze(0)
    for a, b, y in tqdm(test_iter):
        a = torch.cat((a[:, :position], m.repeat_interleave(a.shape[0], dim=0), a[:, position:]), dim=1)
        b = torch.cat((b[:, :position], n.repeat_interleave(b.shape[0], dim=0), b[:, position:]), dim=1)
        a = a.to(target)
        b = b.to(target)
        y = y.to(target)
        logits = net(input_ids = a, token_type_ids = b)
        loss = criterion(logits, y)
        # Drop parameter gradients left over from the previous batch
        net.zero_grad()
        loss.backward()


def process_gradient(length, num_trigger_tokens):  # Process the gradient to get the average gradient
    """Average the captured gradients and slice out the trigger positions.

    Reads the module-level ``extracted_grads`` list without modifying it
    (earlier versions replaced its entries with unsqueezed copies) and stacks
    the selected entries in a single call instead of growing a tensor with
    repeated ``torch.cat``.

    Args:
        length: number of batches that were back-propagated.
        num_trigger_tokens: how many rows, starting at the module-level
            ``position``, to return.

    Returns:
        The mean over the selected gradients, restricted to rows
        ``position : position + num_trigger_tokens``.

    Raises:
        IndexError: if fewer gradients were captured than are required.
    """
    # Entries 0 .. length-2 are averaged (only entry 0 when length <= 2).
    # NOTE(review): the gradient of the last batch is therefore left out, as in
    # the original loop `range(1, length - 1)` — confirm whether that is
    # deliberate or an off-by-one.
    used_count = max(length - 1, 1)
    if len(extracted_grads) < used_count:
        raise IndexError(
            f'process_gradient needs {used_count} captured gradients, found {len(extracted_grads)}')
    stacked = torch.stack(extracted_grads[:used_count], dim=0)
    average_grad = stacked.mean(dim=0)[position:position + num_trigger_tokens]
    return average_grad


def hotflip_attack(averaged_grad, embedding_matrix,
                   num_candidates=1, increase_loss=False):
    """Rank vocabulary entries by the dot product of gradient and embedding.

    Args:
        averaged_grad: 2-D tensor with one gradient row per trigger position.
        embedding_matrix: 2-D tensor with one embedding row per vocabulary entry.
        num_candidates: number of vocabulary ids to return per position.
        increase_loss: when true, the largest dot products win; when false the
            scores are negated first so the smallest dot products win.

    Returns:
        A numpy array of vocabulary ids: shape ``(positions, num_candidates)``
        when ``num_candidates > 1``, otherwise shape ``(positions,)``.
    """
    grad = averaged_grad.cpu().unsqueeze(0)
    vocab = embedding_matrix.cpu()
    # scores[0, i, k] = <gradient at position i, embedding of vocabulary id k>
    scores = torch.einsum("bij,kj->bik", (grad, vocab))
    if increase_loss:
        pass  # keep the sign: higher score means a larger first-order loss increase
    else:
        scores = -scores
        # lower versus increase the class probability.
    if num_candidates > 1:  # get top k options
        top_ids = torch.topk(scores, num_candidates, dim=2).indices
        return top_ids.detach().cpu().numpy()[0]  # Return candidates
    best_ids = scores.max(2).indices
    return best_ids[0].detach().cpu().numpy()


def collection_attack(net, test_iter, num_candidates, num_epoch, trigger='the',  # Summarize each function
                      num_trigger_tokens=3):
    """Run the full trigger search and report accuracy after every round.

    Each round captures the embedding gradients over ``test_iter``, asks
    ``hotflip_attack`` for ``num_candidates`` replacement ids per trigger
    position and lets ``select_best_candid`` keep the replacements that lower
    the accuracy.

    Args:
        net: model under attack.
        test_iter: loader yielding ``(input_ids, token_type_ids, labels)``.
        num_candidates: candidate ids proposed per trigger position.
        num_epoch: number of attack rounds.
        trigger: forwarded to ``init_trigger_tokens``.
        num_trigger_tokens: length of the trigger.

    Returns:
        ``(trigger_token_tensor, valid_acc)`` — the final trigger ids and the
        accuracy measured with them.
    """
    trigger_token_tensor = init_trigger_tokens(trigger, num_trigger_tokens)
    print(f'Concatenation location:{position}')
    valid_acc = evaluate(net, test_iter, trigger_token_tensor)
    print(f'Initial trigger tokens state：the accuracy {valid_acc:.5f}')
    embedding_weight = get_embedding_weight(net)
    for i in range(num_epoch):
        extracted_grads.clear()
        hook = add_hook(net)
        try:
            get_gradient(net, test_iter, trigger_token_tensor)
        finally:
            # Always detach the hook, even if the gradient pass fails midway
            hook.remove()
        average_grad = process_gradient(len(test_iter), num_trigger_tokens)
        hot_token = hotflip_attack(average_grad, embedding_weight, num_candidates, increase_loss=True)
        hot_token_tensor = torch.from_numpy(hot_token)
        if hot_token_tensor.dim() == 1:
            # num_candidates == 1 yields one id per position; select_best_candid
            # indexes candidates as [position, candidate], so add that axis
            hot_token_tensor = hot_token_tensor.unsqueeze(1)
        trigger_token_tensor, valid_acc = select_best_candid(net, test_iter, hot_token_tensor, trigger_token_tensor,
                                                             valid_acc)
        print(f'after {i + 1} rounds of attacking\ntriggers: {trigger_token_tensor} \nthe accuracy :{valid_acc:.5f} ')
    return trigger_token_tensor, valid_acc  # Return the final trigger tokens (trigger length) and the accuracy after the attack


def get_embedding_weight(net):
    """Return the weight of the first ``nn.Embedding`` found in ``net``.

    Modules are visited in ``net.modules()`` order; the returned object is the
    layer's own ``weight`` parameter, not a copy.

    Raises:
        ValueError: if ``net`` contains no ``nn.Embedding`` (previously this
            surfaced as an UnboundLocalError on ``weight``).
    """
    for module in net.modules():
        if isinstance(module, nn.Embedding):
            return module.weight
    raise ValueError('get_embedding_weight: net contains no nn.Embedding module')


def select_best_candid(net, test_iter, candid_trigger, trigger_token, valid_acc):
    """Greedily pick, position by position, the candidate that lowers accuracy most.

    For every trigger position ``i`` each candidate ``candid_trigger[i, j]`` is
    substituted in turn, the spliced inputs are scored over ``test_iter`` and
    the substitution is kept whenever the accuracy drops below the best value
    seen so far. Positions handled later start from the trigger already
    improved at earlier positions.

    The forward passes run under ``torch.no_grad()`` (nothing here is
    back-propagated) and the search works on a copy, so the tensor passed in as
    ``trigger_token`` is left unmodified.

    Args:
        net: model under attack; expected to already live on ``device[0]``.
        test_iter: loader yielding ``(input_ids, token_type_ids, labels)``.
        candid_trigger: 2-D tensor of candidate ids, indexed ``[position, candidate]``.
        trigger_token: 1-D tensor with the current trigger ids.
        valid_acc: accuracy achieved by ``trigger_token``.

    Returns:
        ``(trigger ids, accuracy)`` after the search. The accuracy is the sum
        of per-batch accuracies divided by ``len(test_iter)``.
    """
    # Concatenate each candidate to each input to determine the final trigger token
    target = device[0]
    n = torch.tensor([0] * len(trigger_token)).unsqueeze(0)
    trigger_token = trigger_token.clone().unsqueeze(0)
    net.eval()
    with torch.no_grad():
        for i in range(candid_trigger.shape[0]):
            trigger_token_temp = deepcopy(trigger_token)
            for j in range(candid_trigger.shape[1]):
                trigger_token_temp[0, i] = candid_trigger[i, j]
                valid_accs = []
                for a, b, y in tqdm(test_iter):
                    a = torch.cat((a[:, :position], trigger_token_temp.repeat_interleave(a.shape[0], dim=0),
                                   a[:, position:]), dim=1)
                    b = torch.cat((b[:, :position], n.repeat_interleave(b.shape[0], dim=0),
                                   b[:, position:]), dim=1)
                    a = a.to(target)
                    b = b.to(target)
                    y = y.to(target)
                    logits = net(input_ids = a, token_type_ids = b)
                    acc = (logits.argmax(dim=-1) == y).float().mean()
                    valid_accs.append(acc)
                temp = sum(valid_accs) / len(test_iter)
                if temp < valid_acc:
                    # Keep this candidate: it is the strongest found so far
                    valid_acc = temp
                    trigger_token[0, i] = candid_trigger[i, j]
    return trigger_token[0], valid_acc  # Return the final trigger token and the accuracy after the attack

# Search for triggers of length 1, 2 and 3 on the entailment-only test loader
# (5 candidates per position, 5 rounds each). The `trigger` string is not used
# by init_trigger_tokens, which always starts from token id 0.
collection_attack(Model, test_iter_ent, 5, 5, trigger='<pad>', num_trigger_tokens=1)
collection_attack(Model, test_iter_ent, 5, 5, trigger='<pad>', num_trigger_tokens=2)
collection_attack(Model, test_iter_ent, 5, 5, trigger='<pad>', num_trigger_tokens=3)

Concatenation location:1


  0%|          | 0/1123 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 1123/1123 [00:07<00:00, 144.50it/s]


Initial trigger tokens state：the accuracy 0.95073


100%|██████████| 1123/1123 [00:22<00:00, 49.03it/s]
100%|██████████| 1123/1123 [00:09<00:00, 117.00it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.48it/s]
100%|██████████| 1123/1123 [00:08<00:00, 126.91it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.41it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.84it/s]


after 1 rounds of attacking
triggers: tensor([6903]) 
the accuracy :0.94835 


100%|██████████| 1123/1123 [00:22<00:00, 49.07it/s]
100%|██████████| 1123/1123 [00:09<00:00, 119.71it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.50it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.88it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.08it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.68it/s]


after 2 rounds of attacking
triggers: tensor([29499]) 
the accuracy :0.94360 


100%|██████████| 1123/1123 [00:20<00:00, 53.62it/s]
100%|██████████| 1123/1123 [00:09<00:00, 117.32it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.97it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.44it/s]
100%|██████████| 1123/1123 [00:09<00:00, 118.24it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.10it/s]


after 3 rounds of attacking
triggers: tensor([29334]) 
the accuracy :0.94301 


100%|██████████| 1123/1123 [00:23<00:00, 47.88it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.39it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.10it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.92it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.20it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.76it/s]


after 4 rounds of attacking
triggers: tensor([29334]) 
the accuracy :0.94301 


100%|██████████| 1123/1123 [00:24<00:00, 45.01it/s]
100%|██████████| 1123/1123 [00:09<00:00, 116.41it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.07it/s]
100%|██████████| 1123/1123 [00:09<00:00, 119.96it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.53it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.29it/s]


after 5 rounds of attacking
triggers: tensor([29334]) 
the accuracy :0.94301 
Concatenation location:1


100%|██████████| 1123/1123 [00:07<00:00, 151.82it/s]


Initial trigger tokens state：the accuracy 0.94954


100%|██████████| 1123/1123 [00:21<00:00, 53.04it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.27it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.05it/s]
100%|██████████| 1123/1123 [00:09<00:00, 118.84it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.12it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.39it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.96it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.52it/s]
100%|██████████| 1123/1123 [00:08<00:00, 124.81it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.57it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.00it/s]


after 1 rounds of attacking
triggers: tensor([28810, 22641]) 
the accuracy :0.94776 


100%|██████████| 1123/1123 [00:23<00:00, 48.28it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.90it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.30it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.33it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.49it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.25it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.63it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.94it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.18it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.58it/s]
100%|██████████| 1123/1123 [00:09<00:00, 112.65it/s]


after 2 rounds of attacking
triggers: tensor([11660,  8233]) 
the accuracy :0.91719 


100%|██████████| 1123/1123 [00:23<00:00, 48.77it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.92it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.99it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.05it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.53it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.99it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.03it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.34it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.64it/s]
100%|██████████| 1123/1123 [00:09<00:00, 119.68it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.28it/s]


after 3 rounds of attacking
triggers: tensor([11660,  8233]) 
the accuracy :0.91719 


100%|██████████| 1123/1123 [00:22<00:00, 49.60it/s]
100%|██████████| 1123/1123 [00:09<00:00, 115.66it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.24it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.61it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.41it/s]
100%|██████████| 1123/1123 [00:09<00:00, 119.85it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.30it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.14it/s]
100%|██████████| 1123/1123 [00:09<00:00, 119.21it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.51it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.05it/s]


after 4 rounds of attacking
triggers: tensor([11660,  8233]) 
the accuracy :0.91719 


100%|██████████| 1123/1123 [00:24<00:00, 46.72it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.97it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.16it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.64it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.52it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.62it/s]
100%|██████████| 1123/1123 [00:08<00:00, 124.99it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.09it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.00it/s]
100%|██████████| 1123/1123 [00:09<00:00, 118.82it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.77it/s]


after 5 rounds of attacking
triggers: tensor([11660,  8233]) 
the accuracy :0.91719 
Concatenation location:1


100%|██████████| 1123/1123 [00:07<00:00, 153.82it/s]


Initial trigger tokens state：the accuracy 0.94895


100%|██████████| 1123/1123 [00:22<00:00, 49.14it/s]
100%|██████████| 1123/1123 [00:09<00:00, 116.63it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.61it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.64it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.86it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.75it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.89it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.69it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.74it/s]
100%|██████████| 1123/1123 [00:09<00:00, 116.64it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.24it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.38it/s]
100%|██████████| 1123/1123 [00:08<00:00, 124.95it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.44it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.80it/s]
100%|██████████| 1123/1123 [00:09<00:00, 119.38it/s]


after 1 rounds of attacking
triggers: tensor([28810, 22641,  6903]) 
the accuracy :0.94242 


100%|██████████| 1123/1123 [00:21<00:00, 51.85it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.95it/s]
100%|██████████| 1123/1123 [00:09<00:00, 117.10it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.69it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.32it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.91it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.33it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.48it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.81it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.90it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.80it/s]
100%|██████████| 1123/1123 [00:09<00:00, 117.44it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.84it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.95it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.80it/s]
100%|██████████| 1123/1123 [00:09<00:00, 117.89it/s]


after 2 rounds of attacking
triggers: tensor([11660, 18948,  7575]) 
the accuracy :0.90472 


100%|██████████| 1123/1123 [00:24<00:00, 45.38it/s]
100%|██████████| 1123/1123 [00:09<00:00, 117.54it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.30it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.26it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.72it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.57it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.22it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.79it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.33it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.00it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.20it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.40it/s]
100%|██████████| 1123/1123 [00:08<00:00, 124.95it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.35it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.69it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.87it/s]


after 3 rounds of attacking
triggers: tensor([25653, 18948,  7575]) 
the accuracy :0.90413 


100%|██████████| 1123/1123 [00:23<00:00, 47.82it/s]
100%|██████████| 1123/1123 [00:09<00:00, 120.56it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.88it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.75it/s]
100%|██████████| 1123/1123 [00:09<00:00, 118.09it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.45it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.53it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.04it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.53it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.91it/s]
100%|██████████| 1123/1123 [00:08<00:00, 124.98it/s]
100%|██████████| 1123/1123 [00:09<00:00, 122.70it/s]
100%|██████████| 1123/1123 [00:08<00:00, 124.88it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.14it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.07it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.44it/s]


after 4 rounds of attacking
triggers: tensor([25653, 18948, 12000]) 
the accuracy :0.90057 


100%|██████████| 1123/1123 [00:24<00:00, 46.57it/s]
100%|██████████| 1123/1123 [00:09<00:00, 117.06it/s]
100%|██████████| 1123/1123 [00:09<00:00, 121.99it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.06it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.45it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.63it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.40it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.91it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.69it/s]
100%|██████████| 1123/1123 [00:08<00:00, 124.81it/s]
100%|██████████| 1123/1123 [00:08<00:00, 126.22it/s]
100%|██████████| 1123/1123 [00:09<00:00, 118.74it/s]
100%|██████████| 1123/1123 [00:08<00:00, 125.56it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.71it/s]
100%|██████████| 1123/1123 [00:09<00:00, 123.36it/s]
100%|██████████| 1123/1123 [00:09<00:00, 124.38it/s]

after 5 rounds of attacking
triggers: tensor([25653,  4248, 12000]) 
the accuracy :0.88127 





(tensor([25653,  4248, 12000]), tensor(0.8813, device='cuda:0'))

In [8]:
# Same search on the contradiction-only test loader: trigger lengths 1, 2 and 3,
# 5 candidates per position, 5 rounds each.
collection_attack(Model, test_iter_con, 5, 5, trigger='<pad>', num_trigger_tokens=1)
collection_attack(Model, test_iter_con, 5, 5, trigger='<pad>', num_trigger_tokens=2)
collection_attack(Model, test_iter_con, 5, 5, trigger='<pad>', num_trigger_tokens=3)

Concatenation location:1


100%|██████████| 1079/1079 [00:07<00:00, 146.55it/s]


Initial trigger tokens state：the accuracy 0.95335


100%|██████████| 1079/1079 [00:23<00:00, 45.28it/s]
100%|██████████| 1079/1079 [00:09<00:00, 116.60it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.13it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.47it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.08it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.76it/s]


after 1 rounds of attacking
triggers: tensor([23690]) 
the accuracy :0.94223 


100%|██████████| 1079/1079 [00:22<00:00, 47.15it/s]
100%|██████████| 1079/1079 [00:09<00:00, 112.56it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.31it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.52it/s]
100%|██████████| 1079/1079 [00:09<00:00, 118.42it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.05it/s]


after 2 rounds of attacking
triggers: tensor([23690]) 
the accuracy :0.94223 


100%|██████████| 1079/1079 [00:23<00:00, 46.44it/s]
100%|██████████| 1079/1079 [00:09<00:00, 118.00it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.79it/s]
100%|██████████| 1079/1079 [00:09<00:00, 116.82it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.43it/s]
100%|██████████| 1079/1079 [00:09<00:00, 118.86it/s]


after 3 rounds of attacking
triggers: tensor([23690]) 
the accuracy :0.94223 


100%|██████████| 1079/1079 [00:21<00:00, 50.77it/s]
100%|██████████| 1079/1079 [00:09<00:00, 117.91it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.67it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.64it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.10it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.99it/s]


after 4 rounds of attacking
triggers: tensor([23690]) 
the accuracy :0.94223 


100%|██████████| 1079/1079 [00:21<00:00, 50.35it/s]
100%|██████████| 1079/1079 [00:08<00:00, 120.74it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.18it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.11it/s]
100%|██████████| 1079/1079 [00:09<00:00, 116.03it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.52it/s]


after 5 rounds of attacking
triggers: tensor([23690]) 
the accuracy :0.94223 
Concatenation location:1


100%|██████████| 1079/1079 [00:07<00:00, 151.00it/s]


Initial trigger tokens state：the accuracy 0.95273


100%|██████████| 1079/1079 [00:22<00:00, 48.56it/s]
100%|██████████| 1079/1079 [00:09<00:00, 118.84it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.23it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.88it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.28it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.09it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.52it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.65it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.52it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.62it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.49it/s]


after 1 rounds of attacking
triggers: tensor([    0, 29833]) 
the accuracy :0.95088 


100%|██████████| 1079/1079 [00:22<00:00, 48.75it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.04it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.79it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.47it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.42it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.12it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.12it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.96it/s]
100%|██████████| 1079/1079 [00:08<00:00, 120.01it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.73it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.69it/s]


after 2 rounds of attacking
triggers: tensor([28439, 26987]) 
the accuracy :0.93636 


100%|██████████| 1079/1079 [00:21<00:00, 51.22it/s]
100%|██████████| 1079/1079 [00:09<00:00, 116.80it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.32it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.56it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.96it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.99it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.45it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.69it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.66it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.90it/s]
100%|██████████| 1079/1079 [00:09<00:00, 118.83it/s]


after 3 rounds of attacking
triggers: tensor([28439, 26987]) 
the accuracy :0.93636 


100%|██████████| 1079/1079 [00:22<00:00, 48.28it/s]
100%|██████████| 1079/1079 [00:09<00:00, 116.95it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.20it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.91it/s]
100%|██████████| 1079/1079 [00:09<00:00, 119.76it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.66it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.26it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.15it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.01it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.70it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.39it/s]


after 4 rounds of attacking
triggers: tensor([28439, 26987]) 
the accuracy :0.93636 


100%|██████████| 1079/1079 [00:23<00:00, 45.53it/s]
100%|██████████| 1079/1079 [00:09<00:00, 117.82it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.38it/s]
100%|██████████| 1079/1079 [00:08<00:00, 120.21it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.73it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.42it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.35it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.71it/s]
100%|██████████| 1079/1079 [00:08<00:00, 120.94it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.49it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.48it/s]


after 5 rounds of attacking
triggers: tensor([28439, 26987]) 
the accuracy :0.93636 
Concatenation location:1


100%|██████████| 1079/1079 [00:06<00:00, 155.57it/s]


Initial trigger tokens state：the accuracy 0.95181


100%|██████████| 1079/1079 [00:23<00:00, 46.71it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.06it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.82it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.47it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.67it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.34it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.08it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.12it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.18it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.39it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.93it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.08it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.32it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.40it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.25it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.93it/s]


after 1 rounds of attacking
triggers: tensor([26340,     0, 25994]) 
the accuracy :0.94995 


100%|██████████| 1079/1079 [00:23<00:00, 45.46it/s]
100%|██████████| 1079/1079 [00:09<00:00, 118.30it/s]
100%|██████████| 1079/1079 [00:08<00:00, 120.78it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.85it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.13it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.28it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.27it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.47it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.53it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.38it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.05it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.29it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.10it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.13it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.33it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.82it/s]


after 2 rounds of attacking
triggers: tensor([29143, 29993, 16309]) 
the accuracy :0.93111 


100%|██████████| 1079/1079 [00:22<00:00, 48.41it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.22it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.10it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.87it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.88it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.64it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.56it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.18it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.08it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.77it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.19it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.13it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.49it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.65it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.25it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.45it/s]


after 3 rounds of attacking
triggers: tensor([22616, 29993, 16309]) 
the accuracy :0.92462 


100%|██████████| 1079/1079 [00:21<00:00, 49.43it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.60it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.87it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.48it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.33it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.32it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.02it/s]
100%|██████████| 1079/1079 [00:08<00:00, 120.21it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.74it/s]
100%|██████████| 1079/1079 [00:08<00:00, 120.00it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.12it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.08it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.98it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.04it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.61it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.87it/s]


after 4 rounds of attacking
triggers: tensor([22616, 29993, 16309]) 
the accuracy :0.92462 


100%|██████████| 1079/1079 [00:22<00:00, 48.87it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.82it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.39it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.57it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.41it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.88it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.10it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.91it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.75it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.01it/s]
100%|██████████| 1079/1079 [00:08<00:00, 125.03it/s]
100%|██████████| 1079/1079 [00:08<00:00, 121.13it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.27it/s]
100%|██████████| 1079/1079 [00:08<00:00, 122.27it/s]
100%|██████████| 1079/1079 [00:08<00:00, 124.87it/s]
100%|██████████| 1079/1079 [00:08<00:00, 123.97it/s]

after 5 rounds of attacking
triggers: tensor([22616, 29993, 16309]) 
the accuracy :0.92462 





(tensor([22616, 29993, 16309]), tensor(0.9246, device='cuda:0'))