In [1]:
import torch
from torch import nn
import os
import random
from torch.utils import data
from tqdm import tqdm
import numpy as np
from copy import deepcopy
from transformers import BertTokenizer, BertForSequenceClassification,BertConfig
# Module-level state shared by the attack helpers defined further down.
# (The former `global` statements were removed: at module scope they have no
# effect, and `rand_num` was never assigned anywhere.)
extracted_grads = []#gradients appended by extract_grad_hook, one entry per backward pass
position = 1#concatenation position
#the concatenation position of the BERT model is after the [CLS] token


#Random Concatenation Mode
#position = random.randint(0,100)

In [2]:
# Directory that must contain vocab.txt, config.json and pytorch_model.bin.
BERT_path = '/mnt'#path to bert model
# NOTE: despite its verb-like name, `tokenize` is the BertTokenizer instance itself;
# later cells call it directly on text.
tokenize = BertTokenizer.from_pretrained(os.path.join(BERT_path,'vocab.txt'))
model_config = BertConfig.from_pretrained(os.path.join(BERT_path,'config.json'))
# Sequence-classification model built from model_config and initialised from the checkpoint file.
# NOTE(review): the recorded warning below suggests the checkpoint has no classifier-head
# weights, so the head presumably starts untrained until train() is run - confirm.
Model = BertForSequenceClassification.from_pretrained(os.path.join(BERT_path,'pytorch_model.bin'),config = model_config)
#Load model related information

Some weights of the model checkpoint at /mnt/pytorch_model.bin were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

In [3]:
#Print the number of Total Parameters
# `total` holds one element count per parameter tensor, in Model.parameters() order;
# the printout shows their sum followed by the per-tensor list.
total = [param.nelement() for param in Model.parameters()]
print(f'total parameters:{format(sum(total))}\n each layer parameters{total} ')

total parameters:9591554
 each layer parameters[7813632, 131072, 512, 256, 256, 65536, 256, 65536, 256, 65536, 256, 65536, 256, 256, 256, 262144, 1024, 262144, 256, 256, 256, 65536, 256, 65536, 256, 65536, 256, 65536, 256, 256, 256, 262144, 1024, 262144, 256, 256, 256, 65536, 256, 512, 2] 


In [4]:
def read_data(data_dir,is_train):
    """Read every review of one dataset split, for both sentiment labels.

    Files are taken from ``<data_dir>/<split>/neg`` first and then from
    ``<data_dir>/<split>/pos``, where ``<split>`` is ``'train'`` when
    ``is_train`` is truthy and ``'test'`` otherwise. Within a folder the
    files come in ``os.listdir`` order.

    Returns:
        (reviews, labels): the UTF-8 decoded texts with newlines replaced by
        spaces, and a parallel list with 0 for ``neg`` and 1 for ``pos``.
    """
    split = 'train' if is_train else 'test'
    reviews, targets = [], []
    for sentiment, target in (('neg', 0), ('pos', 1)):
        folder = os.path.join(data_dir, split, sentiment)
        for name in os.listdir(folder):
            # Read raw bytes and decode explicitly so the platform default
            # encoding never matters.
            with open(os.path.join(folder, name), 'rb') as handle:
                raw = handle.read()
            reviews.append(raw.decode('utf-8').replace('\n', ' '))
            targets.append(target)
    return reviews, targets

def read_test_data(data_dir,is_train,label='pos'):
    """Read the reviews of a single sentiment class (the class to attack).

    Args:
        data_dir: dataset root containing ``train``/``test`` sub-folders.
        is_train: read from ``train`` when truthy, otherwise from ``test``.
        label: which class folder to read, ``'pos'`` or ``'neg'``. The
            default ``'pos'`` is the value that used to be hard-coded.

    Returns:
        (reviews, labels): the UTF-8 decoded texts with newlines replaced by
        spaces, and a parallel list holding 1 for ``'pos'`` or 0 for ``'neg'``.

    Raises:
        ValueError: if ``label`` is neither ``'pos'`` nor ``'neg'``.
    """
    if label not in ('neg', 'pos'):
        raise ValueError(f"label must be 'neg' or 'pos', got {label!r}")
    split = 'train' if is_train else 'test'
    data_path = os.path.join(data_dir, split, label)
    target = 1 if label == 'pos' else 0
    reviews = []
    for file in os.listdir(data_path):
        # Read raw bytes and decode explicitly so the platform default
        # encoding never matters.
        with open(os.path.join(data_path, file), 'rb') as f:
            reviews.append(f.read().decode('utf-8').replace('\n', ' '))
    return reviews, [target] * len(reviews)

def load_array(data_arrays, batch_size, is_train=True):
    """Wrap parallel tensors in a ``DataLoader``.

    Args:
        data_arrays: iterable of tensors, unpacked into ``TensorDataset``.
        batch_size: number of rows per batch.
        is_train: when truthy the loader shuffles its rows.

    Returns:
        A ``DataLoader`` yielding one tuple of tensor slices per batch.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(tensor_dataset, batch_size=batch_size, shuffle=is_train)
    return loader

def try_all_gpus():
    """Return one ``torch.device`` per visible CUDA device, or ``[cpu]`` if there are none."""
    gpu_count = torch.cuda.device_count()
    if gpu_count == 0:
        return [torch.device('cpu')]
    return [torch.device(f'cuda:{index}') for index in range(gpu_count)]

def load_imdb_data(batch_size, num_steps=500, data_dir='/mnt/aclImdb'):
    """Tokenize the reviews and build the training and attack data iterators.

    Args:
        batch_size: batch size used for both iterators.
        num_steps: ``max_length`` passed to the tokenizer (longer reviews are
            truncated).
        data_dir: dataset root. Defaults to the path that used to be
            hard-coded, so existing calls behave as before.

    Returns:
        (train_iter, test_iter). ``train_iter`` is shuffled and built from
        ``read_data(data_dir, True)``; ``test_iter`` is unshuffled and built
        from ``read_test_data(data_dir, False)``. Each batch is a tuple
        ``(input_ids, token_type_ids, labels)``.

    NOTE(review): only ``input_ids`` and ``token_type_ids`` are kept, so the
    tokenizer's attention mask is dropped and padded positions are presumably
    attended to by the model. The attack functions rely on this 3-tuple
    layout, so it is left unchanged here.
    """
    train_texts, train_labels = read_data(data_dir, True)
    test_texts, test_labels = read_test_data(data_dir, False)
    tokenizer_options = dict(return_tensors="pt", padding=True, truncation=True,
                             max_length=num_steps)
    train_encoding = tokenize(train_texts, **tokenizer_options)
    test_encoding = tokenize(test_texts, **tokenizer_options)
    train_iter = load_array(
        (train_encoding['input_ids'], train_encoding['token_type_ids'],
         torch.tensor(train_labels)),
        batch_size)
    test_iter = load_array(
        (test_encoding['input_ids'], test_encoding['token_type_ids'],
         torch.tensor(test_labels)),
        batch_size,
        is_train=False)
    return train_iter, test_iter

In [5]:
def train(net,train_iter,lr,num_epochs,device):
    """Fine-tune ``net`` with AdamW and print per-epoch loss and accuracy.

    Args:
        net: model whose call ``net(input_ids=..., token_type_ids=..., labels=...)``
            returns an object exposing ``.loss`` and ``.logits``.
        train_iter: sized iterable of ``(input_ids, token_type_ids, labels)`` batches.
        lr: AdamW learning rate.
        num_epochs: number of passes over ``train_iter``.
        device: list of devices; only ``device[0]`` is used.

    The reported loss and accuracy are means of the per-batch values.
    Metrics are accumulated as Python floats via ``.item()``. The previous
    version kept every batch's loss tensor in a list, which held device
    tensors and their autograd references alive for the whole epoch.
    """
    print('---------------------------start---------------------')
    net = net.to(device[0])
    optimizer = torch.optim.AdamW(net.parameters(),lr=lr)
    # Defined up front so the final print also works when num_epochs == 0.
    train_loss = float('nan')
    for epoch in range(num_epochs):
        net.train()
        print(f' epoch {epoch+1}')
        loss_sum = 0.0
        acc_sum = 0.0
        for input_ids, token_type_ids, labels in tqdm(train_iter):
            input_ids = input_ids.to(device[0])
            token_type_ids = token_type_ids.to(device[0])
            labels = labels.to(device[0])
            outputs = net(input_ids = input_ids,token_type_ids = token_type_ids,labels = labels)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
            acc_sum += (outputs.logits.argmax(dim=-1) == labels).float().mean().item()
        print("Learning rate for epoch %d：%f" % (epoch+1,optimizer.param_groups[0]['lr']))
        train_loss = loss_sum / len(train_iter)
        train_acc = acc_sum / len(train_iter)
        print(f"[ Train | {epoch + 1:03d}/{num_epochs:03d} ] loss = {train_loss:.5f}   acc = {train_acc:.5f}")
    print('Training process has finished.')
    print('the loss of model {:.3f}'.format(train_loss))

In [6]:
def init_trigger_tokens(trigger,num_trigger_tokens):
    """Build the initial trigger: ``num_trigger_tokens`` copies of token id 1996.

    Id 1996 is 'the' according to the original author's note. The ``trigger``
    argument is accepted but not used; the id is fixed.

    Returns:
        A 1-D tensor of length ``num_trigger_tokens``.
    """
    the_token_id = 1996#1996---'the'
    return torch.tensor([the_token_id for _ in range(num_trigger_tokens)])

def evaluate(net,test_iter,trigger_token_tensor):
    """Accuracy of ``net`` on ``test_iter`` with the trigger spliced into every input.

    The trigger ids are inserted at column ``position`` (module-level global)
    of each ``input_ids`` row, and the same number of zeros is inserted into
    ``token_type_ids``. Runs in eval mode under ``torch.no_grad()`` on
    ``device[0]``.

    Returns:
        The mean of the per-batch accuracies, as a tensor.
    """
    target = device[0]
    net = net.to(target)
    net.eval()
    # One-row templates, repeated per batch to match the batch size.
    trigger_row = trigger_token_tensor.clone().unsqueeze(0)
    segment_row = torch.tensor([0]*len(trigger_token_tensor)).unsqueeze(0)
    batch_accuracies = []
    with torch.no_grad():
        for input_ids, token_type_ids, labels in tqdm(test_iter):
            input_ids = torch.cat(
                (input_ids[:, :position],
                 trigger_row.repeat_interleave(input_ids.shape[0], dim=0),
                 input_ids[:, position:]),
                dim=1).to(target)
            token_type_ids = torch.cat(
                (token_type_ids[:, :position],
                 segment_row.repeat_interleave(token_type_ids.shape[0], dim=0),
                 token_type_ids[:, position:]),
                dim=1).to(target)
            labels = labels.to(target)
            outputs = net(input_ids=input_ids, token_type_ids=token_type_ids, labels=labels)
            predictions = outputs.logits.argmax(dim=-1)
            batch_accuracies.append((predictions == labels).float().mean())
    return sum(batch_accuracies)/len(test_iter)

def extract_grad_hook(net, grad_in, grad_out):
    """Backward hook: append the first output gradient, averaged over dim 0, to ``extracted_grads``.

    ``net`` and ``grad_in`` are part of the hook signature but are not used.
    """
    batch_mean_grad = grad_out[0].mean(dim = 0)
    extracted_grads.append(batch_mean_grad)
def add_hook(net):
    """Register ``extract_grad_hook`` on the first ``nn.Embedding`` found in ``net``.

    Modules are searched in ``net.modules()`` order and only the first
    embedding is hooked.

    Returns:
        The hook handle; call ``.remove()`` on it to unregister the hook.

    Raises:
        ValueError: if ``net`` contains no ``nn.Embedding``. Previously this
            case failed with an UnboundLocalError on the return statement.
    """
    for module in net.modules():
        if isinstance(module, nn.Embedding):
            # NOTE(review): PyTorch documents register_backward_hook as deprecated in
            # favour of register_full_backward_hook. It is kept here because the
            # rest of this notebook was written against its behaviour.
            return module.register_backward_hook(extract_grad_hook)
    raise ValueError('add_hook: net contains no nn.Embedding module to hook')

def get_gradient(net,test_iter,trigger_token_tensor):
    """Run one forward and backward pass per batch with the trigger inserted.

    The trigger ids are spliced into ``input_ids`` at column ``position``
    (module-level global), with matching zeros in ``token_type_ids``, and the
    model's loss is back-propagated. Nothing is returned and no parameter is
    updated: the gradients of interest are collected by whatever backward
    hook the caller registered beforehand (see ``add_hook``).

    ``net`` is put in train mode and left there, as before.

    Parameter gradients are cleared with ``net.zero_grad()`` before every
    backward pass and once more at the end. The previous version built an
    AdamW optimizer only to call ``zero_grad`` on it, and left the last
    batch's gradients allocated on the parameters.
    """
    target = device[0]
    net = net.to(target)
    net.train()
    trigger_row = trigger_token_tensor.clone().unsqueeze(0)
    segment_row = torch.tensor([0]*len(trigger_token_tensor)).unsqueeze(0)
    for input_ids, token_type_ids, labels in tqdm(test_iter):
        batch_size = input_ids.shape[0]
        input_ids = torch.cat(
            (input_ids[:, :position],
             trigger_row.repeat_interleave(batch_size, dim=0),
             input_ids[:, position:]),
            dim=1).to(target)
        token_type_ids = torch.cat(
            (token_type_ids[:, :position],
             segment_row.repeat_interleave(batch_size, dim=0),
             token_type_ids[:, position:]),
            dim=1).to(target)
        labels = labels.to(target)
        outputs = net(input_ids=input_ids, token_type_ids=token_type_ids, labels=labels)
        net.zero_grad()
        outputs.loss.backward()
    # Release the parameter gradients: only the hooked gradients are needed.
    net.zero_grad()

def process_gradient(length,num_trigger_tokens):
    """Average the collected gradients and return the rows at the trigger positions.

    Args:
        length: number of entries of the global ``extracted_grads`` list to
            average (the caller passes the number of batches).
        num_trigger_tokens: number of rows to return.

    Returns:
        Rows ``position : position + num_trigger_tokens`` of the element-wise
        mean of the first ``length`` collected gradients. ``position`` is the
        module-level global.

    Raises:
        ValueError: if ``length`` is below 1 or fewer than ``length``
            gradients were collected.

    Fixes over the previous version: the loop ``range(1, length-1)`` left the
    last gradient out of the average; the "copy" was an alias, so entries of
    ``extracted_grads`` were reshaped in place; and the tensors were joined
    one ``torch.cat`` at a time.
    """
    if length < 1 or len(extracted_grads) < length:
        raise ValueError(
            f'expected at least {length} collected gradient(s) with length >= 1, '
            f'found {len(extracted_grads)}')
    # Stacking creates a new tensor, so the global list is left untouched.
    stacked = torch.stack(list(extracted_grads[:length]), dim=0)
    average_grad = stacked.mean(dim = 0)[position:position+num_trigger_tokens]
    return average_grad

def hotflip_attack(averaged_grad, embedding_matrix,
                    num_candidates=1,increase_loss=False):
    """Rank vocabulary entries by the dot product of gradient and embedding (Equation 3).

    Args:
        averaged_grad: 2-D tensor with one gradient row per trigger position.
        embedding_matrix: 2-D tensor with one embedding row per vocabulary id.
        num_candidates: how many ids to return per trigger position.
        increase_loss: when False the scores are negated before ranking, so
            the smallest dot products are selected instead of the largest.

    Returns:
        A NumPy array of vocabulary ids. Its shape is
        ``(positions, num_candidates)`` when ``num_candidates > 1`` and
        ``(positions,)`` otherwise.
    """
    grad_batch = averaged_grad.cpu().unsqueeze(0)
    vocab_embeddings = embedding_matrix.cpu()
    # scores[0, i, k] = <gradient at trigger position i, embedding of id k>   #Equation 3
    scores = torch.einsum("bij,kj->bik", grad_batch, vocab_embeddings)
    if not increase_loss:
        # lower versus increase the class probability.
        scores = scores * -1
    if num_candidates > 1: # get top k options
        top_ids = torch.topk(scores, num_candidates, dim=2)[1]
        return top_ids.detach().cpu().numpy()[0]#Return candidates
    best_ids = scores.max(2)[1]
    return best_ids[0].detach().cpu().numpy()

def get_embedding_weight(net):
    """Return the weight of the first ``nn.Embedding`` found in ``net.modules()``.

    The parameter object itself is returned, not a copy.

    Raises:
        ValueError: if ``net`` contains no ``nn.Embedding``. Previously this
            case failed with an UnboundLocalError on the return statement.
    """
    first_embedding = next(
        (module for module in net.modules() if isinstance(module, nn.Embedding)),
        None)
    if first_embedding is None:
        raise ValueError('get_embedding_weight: net contains no nn.Embedding module')
    return first_embedding.weight

#
def select_best_candid(net,test_iter,candid_trigger,trigger_token,valid_acc):
    """Greedily pick, position by position, the candidate that lowers accuracy most.

    Args:
        net: model under attack.
        test_iter: batches used to measure accuracy.
        candid_trigger: 2-D tensor of candidate ids, indexed
            ``[trigger position, candidate]``.
        trigger_token: 1-D tensor with the current trigger ids.
        valid_acc: best (lowest) accuracy reached so far.

    Returns:
        (trigger, accuracy): the final trigger tokens and the accuracy after
        the attack. A candidate replaces the token at its position only when
        it brings the accuracy strictly below the best value so far.

    Changes over the previous version: scoring is delegated to ``evaluate``,
    which performs the same concatenation in eval mode under
    ``torch.no_grad()`` (the old inline loop built autograd graphs it never
    used), and the caller's ``trigger_token`` tensor is no longer modified in
    place.
    """
    best_trigger = trigger_token.clone()
    for i in range(candid_trigger.shape[0]):
        for j in range(candid_trigger.shape[1]):
            # Start from the best trigger so far and change only position i.
            candidate = best_trigger.clone()
            candidate[i] = candid_trigger[i, j]
            candidate_acc = evaluate(net, test_iter, candidate)
            if candidate_acc < valid_acc:
                valid_acc = candidate_acc
                best_trigger = candidate
    return best_trigger,valid_acc#Return the final trigger token and the accuracy after the attack

def collection_attack(net,test_iter,num_candidates,num_epoch,trigger = 'the',
                      num_trigger_tokens=3):
    """Run the full trigger search by chaining the helper functions.

    Each round collects embedding gradients with the current trigger
    inserted, averages them, asks ``hotflip_attack`` for ``num_candidates``
    replacement ids per trigger position, and keeps the replacements that
    lower the accuracy on ``test_iter``.

    Args:
        net: model under attack.
        test_iter: batches of ``(input_ids, token_type_ids, labels)``.
        num_candidates: candidate ids considered per trigger position.
        num_epoch: number of search rounds.
        trigger: forwarded to ``init_trigger_tokens``.
        num_trigger_tokens: trigger length.

    Returns:
        (trigger_token_tensor, valid_acc): the final trigger tokens and the
        accuracy after the attack.

    Changes over the previous version: the backward hook is removed in a
    ``finally`` block so it cannot stay registered if gradient collection
    fails, and a 1-D candidate array (returned by ``hotflip_attack`` when
    ``num_candidates`` is 1) is reshaped to one column before selection.
    """
    trigger_token_tensor = init_trigger_tokens(trigger,num_trigger_tokens)
    valid_acc = evaluate(net,test_iter,trigger_token_tensor)
    print(f'unattacked state：the accuracy {valid_acc:.5f}')
    embedding_weight = get_embedding_weight(net)
    for i in range(num_epoch):
        extracted_grads.clear()
        hook = add_hook(net)
        try:
            get_gradient(net,test_iter,trigger_token_tensor)
        finally:
            hook.remove()
        average_grad = process_gradient(len(test_iter),num_trigger_tokens)
        hot_token = hotflip_attack(average_grad,embedding_weight,num_candidates,increase_loss = True)
        hot_token_tensor = torch.from_numpy(hot_token)
        if hot_token_tensor.dim() == 1:
            # select_best_candid indexes candidates as [position, candidate].
            hot_token_tensor = hot_token_tensor.unsqueeze(1)
        trigger_token_tensor,valid_acc = select_best_candid(net,test_iter,hot_token_tensor,trigger_token_tensor,valid_acc)
        print(f'after {i+1} rounds of attacking\ntriggers: {trigger_token_tensor} \nthe accuracy :{valid_acc:.5f} ')
    return trigger_token_tensor,valid_acc#Return the final trigger tokens (trigger length) and the accuracy after the attack

In [7]:
# Build both data iterators with a batch size of 10. train_iter is built from
# read_data (both labels of the train split); test_iter is built from
# read_test_data on the test split and is not shuffled.
train_iter,test_iter = load_imdb_data(10)
#Data preprocessing and loading

In [8]:
# `device` is a list: every visible CUDA device, or [cpu] when there is none.
# The functions in this notebook only ever use device[0].
device = try_all_gpus()
#Use GPU

In [1]:
# Fine-tune Model for 3 epochs with learning rate 5e-6.
# NOTE(review): this cell and the next both train the same Model object; they look
# like alternative settings, so presumably only one should be run - confirm.
train(Model,train_iter,5e-6,3,device)
#base BERT

In [None]:
# Fine-tune Model for 3 epochs with learning rate 5e-5.
# NOTE(review): running this after the previous cell continues training the same
# Model; presumably it is the alternative setting for other BERT variants - confirm.
train(Model,train_iter,5e-5,3,device)
#else BERT

In [10]:
#The accuracy of the model on the test set when no trigger token is concatenated
def evaluate_no(net,test_iter):
    """Accuracy of ``net`` on ``test_iter`` with the inputs left unmodified.

    Runs in eval mode under ``torch.no_grad()`` on ``device[0]``.

    Returns:
        The mean of the per-batch accuracies, as a tensor.
    """
    target = device[0]
    net = net.to(target)
    net.eval()
    batch_accuracies = []
    with torch.no_grad():
        for input_ids, token_type_ids, labels in tqdm(test_iter):
            input_ids = input_ids.to(target)
            token_type_ids = token_type_ids.to(target)
            labels = labels.to(target)
            outputs = net(input_ids=input_ids, token_type_ids=token_type_ids, labels=labels)
            predictions = outputs.logits.argmax(dim=-1)
            batch_accuracies.append((predictions == labels).float().mean())
    return sum(batch_accuracies)/len(test_iter)

In [2]:
# Baseline: accuracy on test_iter with no trigger inserted.
evaluate_no(Model,test_iter)

In [3]:
# Run the trigger search: 5 candidates per trigger position, 10 rounds, 3 trigger tokens.
collection_attack(Model,test_iter,5,10,trigger = 'the',num_trigger_tokens=3)

In [11]:
def predict_sentiment(net, sequence):
    """The model's prediction for an input

    Args:
        net: model to query. It is moved to ``device[0]`` and switched to
            eval mode.
        sequence: raw text, encoded with the module-level ``tokenize``.

    Returns:
        (logits, label): the logits for the single input, and ``'positive'``
        when the arg-max class is 1, otherwise ``'negative'``.

    Fixes over the previous version: the ``net`` argument is now the model
    that gets called (the global ``Model`` was used regardless of what was
    passed in), and inference runs in eval mode under ``torch.no_grad()`` so
    the result does not depend on the mode an earlier cell left the model in.
    """
    target = device[0]
    encoded = tokenize(sequence)
    # Add the batch dimension the model expects.
    input_ids = torch.tensor(encoded['input_ids']).unsqueeze(0).to(target)
    token_type_ids = torch.tensor(encoded['token_type_ids']).unsqueeze(0).to(target)
    net = net.to(target)
    net.eval()
    with torch.no_grad():
        outputs = net(input_ids = input_ids,token_type_ids = token_type_ids)
    label = torch.argmax(outputs.logits, dim=1)
    return outputs.logits,'positive' if label == 1 else 'negative'

In [4]:
# for Figure 1
# Classify one example review; the call returns (logits, 'positive' or 'negative').
predict_sentiment(Model,"If you had asked me how the movie was throughout the film, I would have told you it was great! However, I left the theatre feeling unsatisfied. After thinking a little about it, I believe the problem was the pace of the ending. I feel that the majority of the movie moved kind of slow, and then the ending developed very fast. So, I would say the ending left me disappointed.<br /><br />I thought that the characters were well developed. Costner and Kutcher both portrayed their roles very well. Yes! Ashton Kutcher can act! Also, the different relationships between the characters seemed very real. Furthermore,I thought that the different plot lines were well developed. Overall, it was a good movie and I would recommend seeing it.<br /><br />In conclusion: Good Characters, Great Plot, Poorly Written/Edited Ending. Still, Go See It!!!")