In [10]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

import torch
from torch import nn
import os
import random
from torch.utils import data
from tqdm import tqdm
import numpy as np
from copy import deepcopy
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
import warnings
import csv
import re
BERT_path = 'PreTrainedModelBert'  # path to bert model
tokenize = BertTokenizer.from_pretrained(os.path.join(BERT_path, 'vocab.txt'))
model_config = BertConfig.from_pretrained(os.path.join(BERT_path, 'config.json'))
Model = BertForSequenceClassification.from_pretrained(os.path.join(BERT_path, 'pytorch_model.bin'), config=model_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at PreTrainedModelBert/pytorch_model.bin and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def generate_synonyms(word):
    synonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

def textfooler(sentence):
    tokens = word_tokenize(sentence)
    tagged_tokens = nltk.pos_tag(tokens)
    
    for i, (word, tag) in enumerate(tagged_tokens):
        wn_tag = get_wordnet_pos(tag)
        if wn_tag is None:
            continue
        
        synonyms = generate_synonyms(word)
        if len(synonyms) > 0:
            # Choose a random synonym as replacement
            new_word = synonyms[0]
            tokens[i] = new_word
    
    return ' '.join(tokens)

# Example usage
original_sentence = "This is a good example."
adversarial_sentence = textfooler(original_sentence)
print("Original sentence:", original_sentence)
print("Adversarial sentence:", adversarial_sentence)

def evaluate_no(net, test_iter):
    net = net.to(device[0])
    net.eval()
    valid_accs = []
    with torch.no_grad():
        for batch in tqdm(test_iter):
            a, b, y = batch
            a = a.to(device[0])
            b = b.to(device[0])
            y = y.to(device[0])
            outputs = net(input_ids=a, token_type_ids=b, labels=y)
            acc = (outputs.logits.argmax(dim=-1) == y).float().mean()
            valid_accs.append(acc)
    valid_acc = sum(valid_accs) / len(test_iter)
    print(valid_acc)
    return valid_acc

Original sentence: This is a good example.
Adversarial sentence: This comprise a unspoiled model .


In [22]:
'''
SNLI Data
'''


### Load data

def extract_text(s):
    # 移除括号
    s = re.sub('\\(', '', s)
    s = re.sub('\\)', '', s)
    # 使用一个空格替换两个以上连续空格
    s = re.sub('\\s{2,}', ' ', s)
    return s.strip()


def read_snli_binary_test_data1(data_dir, is_train):
    """读取SNLI二分类数据集"""
    # label_set = {'entailment': 0, 'contradiction': 1}
    # label_set = {'entailment': 0}
    label_set = {'contradiction': 1}
    file_name = os.path.join(data_dir, 'snli_1.0_train.txt' if is_train else 'snli_1.0_test.txt')
    with open(file_name, 'r') as f:
        rows = [row.split('\t') for row in f.readlines()[1:]]

    # 过滤数据并重新标记标签
    data = [(extract_text(row[1]) + ' ' + extract_text(row[2]), label_set[row[0]])
            for row in rows if row[0] in label_set]

    # 分离文本和标签
    texts, labels = zip(*data)
    modified_texts = []
    for text in texts:
        text = textfooler(text)
        modified_texts.append(text)

    return modified_texts, labels
    return texts, labels


def read_snli_binary_test_data2(data_dir, is_train):
    """读取SNLI二分类数据集"""
    # label_set = {'entailment': 0, 'contradiction': 1}
    # label_set = {'entailment': 0}
    label_set = {'contradiction': 1}
    file_name = os.path.join(data_dir, 'snli_1.0_train.txt' if is_train else 'snli_1.0_test.txt')
    with open(file_name, 'r') as f:
        rows = [row.split('\t') for row in f.readlines()[1:]]

    # 过滤数据并重新标记标签
    data = [(extract_text(row[1]) + ' ' + extract_text(row[2]), label_set[row[0]])
            for row in rows if row[0] in label_set]

    # 分离文本和标签
    texts, labels = zip(*data)
    modified_texts = []
    for text in texts:
        text = textfooler(text)
        text = "##lder greyhound catching " + text
        modified_texts.append(text)

    return modified_texts, labels
    return texts, labels



def load_snli_array(data_arrays, batch_size, is_train=True):
    """Constructs a PyTorch data iterator."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)


def load_snli_data1(test_batch_iter, num_steps=500):
    test_data = read_snli_binary_test_data1('snli_1.0', is_train=False)
    test_encoding = tokenize(test_data[0], return_tensors="pt", padding=True, truncation=True, max_length=num_steps)
    test_iter = load_snli_array(
        (test_encoding['input_ids'], test_encoding['token_type_ids'], torch.tensor(test_data[1])),
        test_batch_iter,
        is_train=False)
    return test_iter


def load_snli_data2(test_batch_iter, num_steps=500):
    test_data = read_snli_binary_test_data2('snli_1.0', is_train=False)
    test_encoding = tokenize(test_data[0], return_tensors="pt", padding=True, truncation=True, max_length=num_steps)
    test_iter = load_snli_array(
        (test_encoding['input_ids'], test_encoding['token_type_ids'], torch.tensor(test_data[1])),
        test_batch_iter,
        is_train=False)
    return test_iter


def try_all_gpus():
    devices = [torch.device(f'cuda:{i}')
               for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

In [6]:
device = try_all_gpus()
model = torch.load('Bert_snli.bin')

In [23]:
test_iter1 = load_snli_data1(3)
test_iter2 = load_snli_data2(3)
print("reading data finished\n")

reading data finished



In [24]:
evaluate_no(model, test_iter1)
evaluate_no(model, test_iter2)

100%|██████████| 1079/1079 [00:07<00:00, 151.70it/s]


tensor(0.8409, device='cuda:0')


100%|██████████| 1079/1079 [00:07<00:00, 149.68it/s]

tensor(0.7760, device='cuda:0')





tensor(0.7760, device='cuda:0')