In [1]:
import torch
from torch import nn
import os
import random
from torch.utils import data
from tqdm import tqdm
import numpy as np
from copy import deepcopy
from transformers import RobertaTokenizer, RobertaModel

global extracted_grads

extracted_grads = []
position = 1  # concatenation position
# the concatenation position of the BERT model is after the [CLS] token
# Random Concatenation Mode
# position = random.randint(1,500)

tokenize = RobertaTokenizer.from_pretrained("/root/roberta")
model = RobertaModel.from_pretrained("/root/roberta")


# Load model related information

# Print the number of Total Parameters
# total = [param.nelement() for param in Model.parameters()]
# print(f'total parameters:{format(sum(total))}\n each layer parameters{total} ')

  return self.fget.__get__(instance, owner)()


In [2]:
### Load data

def read_data(data_dir, is_train):
    data, labels = [], []
    for label in ('neg', 'pos'):
        data_path = os.path.join(data_dir, 'train' if is_train else 'test', label)
        for file in os.listdir(data_path):
            with open(os.path.join(data_path, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', ' ')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels


def read_test_data_pos(data_dir, is_train):
    data, labels = [], []
    label = 'pos'  # choose a label to attack
    data_path = os.path.join(data_dir, 'train' if is_train else 'test', label)
    for file in os.listdir(data_path):
        with open(os.path.join(data_path, file), 'rb') as f:
            review = f.read().decode('utf-8').replace('\n', ' ')
            data.append(review)
            labels.append(1 if label == 'pos' else 0)
    return data, labels


def read_test_data_neg(data_dir, is_train):
    data, labels = [], []
    label = 'neg'  # choose a label to attack
    data_path = os.path.join(data_dir, 'train' if is_train else 'test', label)
    for file in os.listdir(data_path):
        with open(os.path.join(data_path, file), 'rb') as f:
            review = f.read().decode('utf-8').replace('\n', ' ')
            data.append(review)
            labels.append(1 if label == 'pos' else 0)
    return data, labels


def load_imdb_data_pos(batch_size, num_steps=500):
    data_dir = 'aclImdb'
    train_data = read_data(data_dir, True)
    test_data = read_test_data_pos(data_dir, False)

    train_encoding = tokenize(train_data[0], return_tensors="pt", padding=True, truncation=True, max_length=num_steps)
    test_encoding = tokenize(test_data[0], return_tensors="pt", padding=True, truncation=True, max_length=num_steps)

    train_labels = torch.tensor(train_data[1])
    test_labels = torch.tensor(test_data[1])

    train_dataset = data.TensorDataset(train_encoding['input_ids'], train_encoding['attention_mask'], train_labels)
    test_dataset = data.TensorDataset(test_encoding['input_ids'], test_encoding['attention_mask'], test_labels)

    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader


def load_imdb_data_neg(batch_size, num_steps=500):
    data_dir = 'aclImdb'
    train_data = read_data(data_dir, True)
    test_data = read_test_data_neg(data_dir, False)

    train_encoding = tokenize(train_data[0], return_tensors="pt", padding=True, truncation=True, max_length=num_steps)
    test_encoding = tokenize(test_data[0], return_tensors="pt", padding=True, truncation=True, max_length=num_steps)

    train_labels = torch.tensor(train_data[1])
    test_labels = torch.tensor(test_data[1])

    train_dataset = data.TensorDataset(train_encoding['input_ids'], train_encoding['attention_mask'], train_labels)
    test_dataset = data.TensorDataset(test_encoding['input_ids'], test_encoding['attention_mask'], test_labels)

    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader



train_loader, test_loader_pos = load_imdb_data_pos(batch_size=10)
train_loader, test_loader_neg = load_imdb_data_neg(batch_size=10)
print("reading data finished")

reading data finished


In [3]:
import torch.optim as optim
from tqdm import tqdm

# 设定随机种子，以确保实验结果可复现
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 定义模型
class SentimentClassifier(nn.Module):
    def __init__(self, num_classes):
        super(SentimentClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained("roberta")
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

model = SentimentClassifier(num_classes=2).to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-6)

def train_model(model, train_loader, criterion, optimizer, num_epochs=3):
    train_losses = []
    train_accuracies = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        with tqdm(train_loader, unit="batch") as tepoch:
            tepoch.set_description(f"Epoch {epoch+1}/{num_epochs}")

            for input_ids, attention_mask, labels in tepoch:
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                optimizer.zero_grad()

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)

                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == labels).sum().item()
                total_predictions += labels.size(0)

                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                tepoch.set_postfix(loss=loss.item())

        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = correct_predictions / total_predictions * 100
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_accuracy)

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    return train_losses, train_accuracies

# 训练模型
train_model(model, train_loader, criterion, optimizer, num_epochs=3)

Epoch 1/3: 100%|██████████| 2500/2500 [05:42<00:00,  7.29batch/s, loss=0.0471] 


Epoch 1/3, Loss: 0.2026, Accuracy: 91.82%


Epoch 2/3: 100%|██████████| 2500/2500 [05:43<00:00,  7.27batch/s, loss=0.0863] 


Epoch 2/3, Loss: 0.1218, Accuracy: 95.70%


Epoch 3/3: 100%|██████████| 2500/2500 [05:43<00:00,  7.28batch/s, loss=0.0118] 

Epoch 3/3, Loss: 0.0831, Accuracy: 97.21%





([0.202579672957398, 0.12180150676416233, 0.08305188301452436],
 [91.824, 95.696, 97.208])

In [4]:
torch.save(model, 'roberta_IMDB.bin')

In [6]:
# 测试模型
def test_model(model, test_loader, criterion):
    test_losses = []
    test_accuracies = []
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with tqdm(test_loader, unit="batch") as ttest:
        ttest.set_description(f"Testing")

        for input_ids, attention_mask, labels in ttest:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

            running_loss += loss.item()
            ttest.set_postfix(loss=loss.item())

    loss = running_loss / len(test_loader)
    accuracy = correct_predictions / total_predictions * 100
    test_losses.append(loss)
    test_accuracies.append(accuracy)

    print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.2f}%")

    return test_losses, test_accuracies

# 测试模型
test_losses_pos, test_accuracies_pos = test_model(model, test_loader_pos, criterion)
test_losses_neg, test_accuracies_neg = test_model(model, test_loader_neg, criterion)

Testing: 100%|██████████| 1250/1250 [00:52<00:00, 23.84batch/s, loss=0.000889]


Test Loss: 0.0622, Test Accuracy: 98.01%


Testing: 100%|██████████| 1250/1250 [00:52<00:00, 23.89batch/s, loss=0.0428]  

Test Loss: 0.3126, Test Accuracy: 90.58%





In [3]:
def try_all_gpus():
    devices = [torch.device(f'cuda:{i}')
               for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]


# 定义模型
class SentimentClassifier(nn.Module):
    def __init__(self, num_classes):
        super(SentimentClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained("roberta")
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

device = try_all_gpus()
Model = torch.load('roberta_IMDB.bin')

In [5]:
criterion = nn.CrossEntropyLoss()
### Trigger Token

def init_trigger_tokens(trigger, num_trigger_tokens):
    # Initialize trigger tokens, we use 'the' as initial trigger token
    trigger_token_ids = [0] * num_trigger_tokens  # 1996 means 'the'
    trigger_token_tensor = torch.tensor(trigger_token_ids)
    return trigger_token_tensor


def evaluate(net, test_iter, trigger_token_tensor):
    # evaluate the accuracy of the model after concatenating the initial trigger token
    net = net.to(device[0])
    net.eval()
    valid_accs = []
    n = torch.tensor([0] * len(trigger_token_tensor))
    m = deepcopy(trigger_token_tensor)
    m = m.unsqueeze(0)
    n = n.unsqueeze(0)
    with torch.no_grad():
        for batch in tqdm(test_iter):
            a, b, y = batch
            a = torch.cat((a[:, :position], m.repeat_interleave(a.shape[0], dim=0), a[:, position:]), dim=1)
            b = torch.cat((b[:, :position], n.repeat_interleave(b.shape[0], dim=0), b[:, position:]), dim=1)
            a = a.to(device[0])
            b = b.to(device[0])
            y = y.to(device[0])
            # outputs = net(input_ids=a, token_type_ids=b)
            # acc = (outputs.logits.argmax(dim=-1) == y).float().mean()
            # logits = net(input_ids = a, token_type_ids = b)
            # acc = (logits.argmax(dim=-1) == y).float().mean()
            outputs = net(input_ids=a, attention_mask=b)
            acc = (outputs.argmax(dim=-1) == y).float().mean()
            
            valid_accs.append(acc)
    valid_acc = sum(valid_accs) / len(test_iter)
    return valid_acc

def extract_grad_hook(net, grad_in, grad_out):  # store the gradient in extracted_grads
    extracted_grads.append(grad_out[0].mean(dim=0))


def add_hook(net):
    for module in net.modules():
        if isinstance(module, nn.Embedding):
            hook = module.register_backward_hook(extract_grad_hook)
            break
    return hook


def get_gradient(net, test_iter, trigger_token_tensor):  # Calculate the loss to get the gradient
    net = net.to(device[0])
    net.train()
    m = deepcopy(trigger_token_tensor)
    m = m.unsqueeze(0)
    n = torch.tensor([0] * len(trigger_token_tensor))
    n = n.unsqueeze(0)
    optimizer = torch.optim.AdamW(net.parameters())
    for batch in tqdm(test_iter):
        a, b, y = batch
        a = torch.cat((a[:, :position], m.repeat_interleave(a.shape[0], dim=0), a[:, position:]), dim=1)
        b = torch.cat((b[:, :position], n.repeat_interleave(b.shape[0], dim=0), b[:, position:]), dim=1)
        a = a.to(device[0])
        b = b.to(device[0])
        y = y.to(device[0])
        '''
        outputs = net(input_ids=a, token_type_ids=b)
        l = outputs.loss
        optimizer.zero_grad()
        l.backward()
        '''
        '''
        logits = net(input_ids = a, token_type_ids = b)
        loss = criterion(logits, y)
        optimizer.zero_grad()
        loss.backward()
        '''
        outputs = net(input_ids = a, attention_mask = b)
        loss = criterion(outputs, y)
        optimizer.zero_grad()
        loss.backward()


def process_gradient(length, num_trigger_tokens):  # Process the gradient to get the average gradient
    extracted_grads_copy = extracted_grads
    extracted_grads_copy[0] = extracted_grads_copy[0]
    temp = extracted_grads_copy[0]
    temp = temp.unsqueeze(0)
    for i in range(1, length - 1):
        extracted_grads_copy[i] = extracted_grads_copy[i]
        extracted_grads_copy[i] = extracted_grads_copy[i].unsqueeze(0)
        temp = torch.cat((temp, extracted_grads_copy[i]), dim=0)
    average_grad = temp.mean(dim=0)[position:position + num_trigger_tokens]
    return average_grad


def hotflip_attack(averaged_grad, embedding_matrix,
                   num_candidates=1, increase_loss=False):
    averaged_grad = averaged_grad.cpu()
    embedding_matrix = embedding_matrix.cpu()
    averaged_grad = averaged_grad.unsqueeze(0)
    gradient_dot_embedding_matrix = torch.einsum("bij,kj->bik",
                                                 (averaged_grad, embedding_matrix))
    if not increase_loss:
        gradient_dot_embedding_matrix *= -1
        # lower versus increase the class probability.
    if num_candidates > 1:  # get top k options
        _, best_k_ids = torch.topk(gradient_dot_embedding_matrix, num_candidates, dim=2)
        return best_k_ids.detach().cpu().numpy()[0]  # Return candidates
    _, best_at_each_step = gradient_dot_embedding_matrix.max(2)
    return best_at_each_step[0].detach().cpu().numpy()


def collection_attack(net, test_iter, num_candidates, num_epoch, trigger='the',  # Summarize each function
                      num_trigger_tokens=3):
    trigger_token_tensor = init_trigger_tokens(trigger, num_trigger_tokens)
    print(f'Concatenation location:{position}')
    valid_acc = evaluate(net, test_iter, trigger_token_tensor)
    print(f'Initial trigger tokens state：the accuracy {valid_acc:.5f}')
    embedding_weight = get_embedding_weight(net)
    for i in range(num_epoch):
        extracted_grads.clear()
        hook = add_hook(net)
        get_gradient(net, test_iter, trigger_token_tensor)
        hook.remove()
        average_grad = process_gradient(len(test_iter), num_trigger_tokens)
        hot_token = hotflip_attack(average_grad, embedding_weight, num_candidates, increase_loss=True)
        hot_token_tensor = torch.from_numpy(hot_token)
        trigger_token_tensor, valid_acc = select_best_candid(net, test_iter, hot_token_tensor, trigger_token_tensor,
                                                             valid_acc)
        print(f'after {i + 1} rounds of attacking\ntriggers: {trigger_token_tensor} \nthe accuracy :{valid_acc:.5f} ')
    return trigger_token_tensor, valid_acc  # Return the final trigger tokens (trigger length) and the accuracy after the attack


def get_embedding_weight(net):
    for module in net.modules():
        if isinstance(module, nn.Embedding):
            weight = module.weight
            break
    return weight


def select_best_candid(net, test_iter, candid_trigger, trigger_token, valid_acc):
    # Concatenate each candidate to each input to determine the final trigger token
    n = torch.tensor([0] * len(trigger_token))
    n = n.unsqueeze(0)
    trigger_token = trigger_token.unsqueeze(0)
    net.eval()
    valid_accs = []
    for i in range(candid_trigger.shape[0]):
        trigger_token_temp = deepcopy(trigger_token)
        for j in range(candid_trigger.shape[1]):
            trigger_token_temp[0, i] = candid_trigger[i, j]
            valid_accs = []
            for batch in tqdm(test_iter):
                a, b, y = batch
                a = torch.cat((a[:, :position], trigger_token_temp.repeat_interleave(a.shape[0], dim=0),
                               a[:, position:]), dim=1)
                b = torch.cat((b[:, :position], n.repeat_interleave(b.shape[0], dim=0),
                               b[:, position:]), dim=1)
                a = a.to(device[0])
                b = b.to(device[0])
                y = y.to(device[0])
                #outputs = net(input_ids=a, token_type_ids=b)
                #acc = (outputs.logits.argmax(dim=-1) == y).float().mean()
                # logits = net(input_ids = a, token_type_ids = b)
                # acc = (logits.argmax(dim=-1) == y).float().mean()
                outputs = net(input_ids=a, attention_mask=b)
                acc = (outputs.argmax(dim=-1) == y).float().mean()    
                
                valid_accs.append(acc)
            temp = sum(valid_accs) / len(test_iter)
            if temp < valid_acc:
                valid_acc = temp
                trigger_token[0, i] = candid_trigger[i, j]
    return trigger_token[0], valid_acc  # Return the final trigger token and the accuracy after the attack

#collection_attack(Model, test_loader_pos, 5, 5, trigger='<pad>', num_trigger_tokens=1)
collection_attack(Model, test_loader_pos, 5, 5, trigger='<pad>', num_trigger_tokens=2)
#collection_attack(Model, test_loader_pos, 5, 10, trigger='<pad>', num_trigger_tokens=3)

Concatenation location:1


100%|██████████| 1250/1250 [00:49<00:00, 25.18it/s]


Initial trigger tokens state：the accuracy 0.98048


100%|██████████| 1250/1250 [02:30<00:00,  8.28it/s]
100%|██████████| 1250/1250 [00:50<00:00, 24.99it/s]
 41%|████▏     | 518/1250 [00:20<00:29, 24.97it/s]


KeyboardInterrupt: 