In [1]:
import torch
from torch import nn
import os
import random
from torch.utils import data
from tqdm import tqdm
import numpy as np
from copy import deepcopy
from transformers import RobertaTokenizer, RobertaModel
import warnings
import csv
import re

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Module-level list; nothing in this part of the notebook reads or writes it.
# NOTE(review): presumably filled by a gradient hook defined elsewhere — confirm.
# (A `global` statement at module scope has no effect, so none is needed here.)
extracted_grads = []

position = 1  # concatenation position
# Index 1 is the slot right after the first special token
# ([CLS] for BERT; presumably <s> for this RoBERTa tokenizer — confirm).
# Random Concatenation Mode
# position = random.randint(1,500)

# Single source of truth for the local checkpoint directory used below.
ROBERTA_PATH = "/root/roberta"

tokenize = RobertaTokenizer.from_pretrained(ROBERTA_PATH)
# Bare encoder; the name `model` is rebound to the classifier in a later cell.
model = RobertaModel.from_pretrained(ROBERTA_PATH)

# Load model related information

# Print the number of Total Parameters
# total = [param.nelement() for param in Model.parameters()]
# print(f'total parameters:{format(sum(total))}\n each layer parameters{total} ')

In [4]:
'''
SNLI Data
'''


### Load data

def extract_text(s):
    """Strip parse-tree punctuation from ``s`` and normalise its spacing.

    Every ``(`` and ``)`` is deleted, each run of two or more whitespace
    characters becomes a single space, and leading/trailing whitespace is
    trimmed.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        ('\\(', ''),        # drop opening parentheses
        ('\\)', ''),        # drop closing parentheses
        ('\\s{2,}', ' '),   # collapse runs of 2+ whitespace characters
    )
    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


def _read_snli_pairs(data_dir, is_train, label_set):
    """Read one SNLI split and keep only the rows whose label is in ``label_set``.

    The file is tab-separated with one header line. Column 0 is the label;
    columns 1 and 2 are cleaned with ``extract_text`` and joined with a single
    space (presumably the two parenthesised parse columns of SNLI — confirm).

    Args:
        data_dir: Directory containing the SNLI text files.
        is_train: Read ``snli_1.0_train.txt`` when true, else ``snli_1.0_test.txt``.
        label_set: Mapping from accepted label string to integer class id.

    Returns:
        ``(texts, labels)``: two equally long tuples of strings and ints.

    Raises:
        ValueError: If no row in the file carries an accepted label.
    """
    file_name = os.path.join(data_dir, 'snli_1.0_train.txt' if is_train else 'snli_1.0_test.txt')
    texts, labels = [], []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_name, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line (tolerates an empty file)
        for line in f:
            row = line.split('\t')
            if row[0] not in label_set:
                continue
            texts.append(extract_text(row[1]) + ' ' + extract_text(row[2]))
            labels.append(label_set[row[0]])

    if not texts:
        # Same exception type the old tuple-unpacking raised, with a usable message.
        raise ValueError(
            f"no rows labelled {sorted(label_set)} found in {file_name}")
    return tuple(texts), tuple(labels)


def read_snli_binary_data(data_dir, is_train):
    """Read the binary SNLI data: entailment -> 0, contradiction -> 1.

    Rows with any other label are dropped. See ``_read_snli_pairs`` for the
    arguments, return value and error behaviour.
    """
    return _read_snli_pairs(data_dir, is_train, {'entailment': 0, 'contradiction': 1})


def read_snli_binary_test_data_ent(data_dir, is_train):
    """Read only the entailment rows (label 0) of the chosen SNLI split.

    See ``_read_snli_pairs`` for the arguments, return value and error behaviour.
    """
    return _read_snli_pairs(data_dir, is_train, {'entailment': 0})


def read_snli_binary_test_data_con(data_dir, is_train):
    """Read only the contradiction rows (label 1) of the chosen SNLI split.

    See ``_read_snli_pairs`` for the arguments, return value and error behaviour.
    """
    return _read_snli_pairs(data_dir, is_train, {'contradiction': 1})


def _build_snli_loaders(read_test_fn, train_batch_size, test_batch_size, num_steps, data_dir):
    """Tokenize the binary SNLI train split plus one test subset into DataLoaders.

    Args:
        read_test_fn: Callable ``(data_dir, is_train=...)`` returning
            ``(texts, labels)`` for the test subset.
        train_batch_size: Batch size of the (shuffled) training loader.
        test_batch_size: Batch size of the (unshuffled) test loader.
        num_steps: ``max_length`` passed to the tokenizer; longer inputs are truncated.
        data_dir: Directory containing the SNLI text files.

    Returns:
        ``(train_loader, test_loader)``; each batch is
        ``(input_ids, attention_mask, labels)``.
    """
    train_texts, train_labels = read_snli_binary_data(data_dir, is_train=True)
    test_texts, test_labels = read_test_fn(data_dir, is_train=False)

    def to_dataset(texts, labels):
        # padding=True pads every example of this split to one common length.
        encoding = tokenize(texts, return_tensors="pt", padding=True,
                            truncation=True, max_length=num_steps)
        return data.TensorDataset(encoding['input_ids'],
                                  encoding['attention_mask'],
                                  torch.tensor(labels))

    train_dataset = to_dataset(train_texts, train_labels)
    test_dataset = to_dataset(test_texts, test_labels)

    train_loader = data.DataLoader(train_dataset, train_batch_size, shuffle=True)
    test_loader = data.DataLoader(test_dataset, test_batch_size, shuffle=False)
    return train_loader, test_loader


def load_snli_data_ent(train_batch_size, test_batch_size, num_steps=500, data_dir='snli_1.0'):
    """Return ``(train_loader, test_loader)`` with an entailment-only test set.

    The training loader covers both classes (entailment and contradiction).

    Args:
        train_batch_size: Batch size of the training loader.
        test_batch_size: Batch size of the test loader.
        num_steps: Maximum token length per example (default 500).
        data_dir: Directory containing the SNLI text files (default ``'snli_1.0'``).
    """
    return _build_snli_loaders(read_snli_binary_test_data_ent,
                               train_batch_size, test_batch_size, num_steps, data_dir)


def load_snli_data_con(train_batch_size, test_batch_size, num_steps=500, data_dir='snli_1.0'):
    """Return ``(train_loader, test_loader)`` with a contradiction-only test set.

    The training loader covers both classes (entailment and contradiction).

    Args:
        train_batch_size: Batch size of the training loader.
        test_batch_size: Batch size of the test loader.
        num_steps: Maximum token length per example (default 500).
        data_dir: Directory containing the SNLI text files (default ``'snli_1.0'``).
    """
    return _build_snli_loaders(read_snli_binary_test_data_con,
                               train_batch_size, test_batch_size, num_steps, data_dir)

In [5]:
# Data preprocessing and loading: training batches of 20, test batches of 3.
# Both helpers build an equivalent training loader, so keep the one from the
# first call and take only the test loader from the second call, rather than
# silently rebinding train_iter.
train_iter, test_iter_ent = load_snli_data_ent(20, 3)
test_iter_con = load_snli_data_con(20, 3)[1]
print("reading data finished\n")
# Number of batches in each loader.
print(len(train_iter))
print(len(test_iter_ent))
print(len(test_iter_con))

reading data finished

18331
1123
1079


In [6]:
import torch.optim as optim
from tqdm import tqdm

# Fix the random seeds so that experiment results are reproducible.
# torch alone is not enough: Python's `random` and NumPy keep separate RNG state.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 定义模型
# Model definition
class SentimentClassifier(nn.Module):
    """RoBERTa encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output classes (width of the final linear layer).
        pretrained_path: Checkpoint name or directory handed to
            ``RobertaModel.from_pretrained``. Defaults to ``"roberta"``, which
            is resolved relative to the working directory.
        dropout: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, num_classes, pretrained_path="roberta", dropout=0.1):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_path)
        self.dropout = nn.Dropout(dropout)
        # The head maps the encoder's hidden size to the class scores.
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for a batch.

        Args:
            input_ids: Token ids, forwarded to the encoder as ``input_ids``.
            attention_mask: Mask forwarded to the encoder as ``attention_mask``.

        Returns:
            Output of the linear head applied to the dropped-out pooled
            encoder output (one row of ``num_classes`` scores per example —
            assuming the encoder pools to one vector per example; confirm).
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Two-class classifier, moved onto the selected device.
model = SentimentClassifier(num_classes=2)
model = model.to(device)

# Loss function and optimizer: cross-entropy, optimised with AdamW.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=5e-6)

def train_model(model, train_loader, criterion, optimizer, num_epochs=3):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Prints one summary line per epoch and shows a tqdm progress bar with the
    latest batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches that also supports ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the updates.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(train_losses, train_accuracies)``: per-epoch mean batch loss and
        per-epoch accuracy in percent.
    """
    loss_history, accuracy_history = [], []

    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        with tqdm(train_loader, unit="batch") as progress:
            progress.set_description(f"Epoch {epoch+1}/{num_epochs}")

            for batch in progress:
                # Move the three batch tensors onto the training device.
                input_ids, attention_mask, labels = (t.to(device) for t in batch)

                optimizer.zero_grad()

                scores = model(input_ids, attention_mask)
                loss = criterion(scores, labels)

                # Index of the highest score per row is the predicted class.
                predicted = torch.max(scores, 1)[1]
                n_correct += (predicted == labels).sum().item()
                n_seen += labels.size(0)

                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                loss_sum += batch_loss
                progress.set_postfix(loss=batch_loss)

        # Loss is averaged over batches; accuracy over individual examples.
        epoch_loss = loss_sum / len(train_loader)
        epoch_accuracy = n_correct / n_seen * 100
        loss_history.append(epoch_loss)
        accuracy_history.append(epoch_accuracy)

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    return loss_history, accuracy_history

# Train the model for three epochs on the SNLI training loader.
# As the cell's last expression, the returned (losses, accuracies) tuple is shown as output.
train_model(model, train_iter, criterion, optimizer, num_epochs=3)

Epoch 1/3: 100%|██████████| 18331/18331 [18:15<00:00, 16.73batch/s, loss=0.254]  


Epoch 1/3, Loss: 0.1265, Accuracy: 95.26%


Epoch 2/3: 100%|██████████| 18331/18331 [18:14<00:00, 16.74batch/s, loss=0.024]   


Epoch 2/3, Loss: 0.0796, Accuracy: 97.17%


Epoch 3/3: 100%|██████████| 18331/18331 [18:05<00:00, 16.89batch/s, loss=0.00257] 


Epoch 3/3, Loss: 0.0589, Accuracy: 97.92%


([0.12650721268822113, 0.0795707771738208, 0.058891038116733005],
 [95.25672184897559, 97.17105424669192, 97.92473056685297])

In [7]:
# Write the fine-tuned classifier to "roberta_snli.bin" in the working directory.
# NOTE(review): this serialises the whole module object rather than its state_dict,
# so loading it later presumably needs the SentimentClassifier class to be
# importable — confirm against whatever consumes this file.
torch.save(model, "roberta_snli.bin")

In [8]:
# 测试模型
def test_model(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` and print the loss and accuracy.

    The model is switched to eval mode (and left there) and the whole pass
    runs under ``torch.no_grad()``, so no autograd graph is built.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches that also supports ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        ``(test_losses, test_accuracies)``: two single-element lists holding
        the mean per-batch loss and the accuracy in percent.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no batches.
    """
    test_losses = []
    test_accuracies = []
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    # Gradients are never needed for evaluation; disabling them saves memory and time.
    with torch.no_grad(), tqdm(test_loader, unit="batch") as ttest:
        ttest.set_description("Testing")

        for input_ids, attention_mask, labels in ttest:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Index of the highest score per row is the predicted class.
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

            batch_loss = loss.item()
            running_loss += batch_loss
            ttest.set_postfix(loss=batch_loss)

    # Loss is averaged over batches; accuracy over individual examples.
    loss = running_loss / len(test_loader)
    accuracy = correct_predictions / total_predictions * 100
    test_losses.append(loss)
    test_accuracies.append(accuracy)

    print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.2f}%")

    return test_losses, test_accuracies

# Evaluate the model separately on the entailment-only and contradiction-only test loaders.
# NOTE(review): the `_pos` names hold the entailment results and the `_neg` names the contradiction results.
test_losses_pos, test_accuracies_pos = test_model(model, test_iter_ent, criterion)
test_losses_neg, test_accuracies_neg = test_model(model, test_iter_con, criterion)

Testing: 100%|██████████| 1123/1123 [00:09<00:00, 120.86batch/s, loss=0.0016] 


Test Loss: 0.0726, Test Accuracy: 97.48%


Testing: 100%|██████████| 1079/1079 [00:08<00:00, 120.25batch/s, loss=0.000342]

Test Loss: 0.0616, Test Accuracy: 97.96%



