In [29]:
# Optional one-time setup, kept disabled: clone the CPC-tag repository and move
# its contents into the working directory. Later cells read
# ./data/algo_problems.csv and import utils.base64_decode, which presumably
# come from that repository — TODO confirm.
# !git clone https://github.com/shiquda/CPC-tag.git
# !mv CPC-tag/* .
# !rm -rf CPC-tag

In [30]:
import pandas as pd
import base64
import json
import os

from utils import base64_decode

## 加载并初步处理 CSV 数据

In [31]:
# Column names to assign, since the CSV is read without a header row.
header = ['id', 'text', 'tags']

# Load the raw problem table and display it.
df = pd.read_csv(
    './data/algo_problems.csv',
    names=header,
    header=None,
)
df

Unnamed: 0,id,text,tags
0,1,Cui2hee6p+eOm+S4vea4uOaIjwrpopjnm67og4zmma8K5p...,[2]
1,2,CkErQiBQcm9ibGVtCumimOebruiDjOaZrwrlvLrng4jmjq...,[1]
2,3,CltOT0lQMjAwMiDmma7lj4rnu4RdIOi/h+ays+WNkgrpop...,"[3, 82]"
3,4,CltOT0lQMjAxMSDmj5Dpq5jnu4RdIOmTuuWcsOavrwrpop...,"[1, 83, 111]"
4,5,CltOT0lQMjAwMCDmj5Dpq5jnu4RdIOaWueagvOWPluaVsA...,"[3, 54, 204]"
...,...,...,...
5464,5465,CltUSFVQQyAyMDI0IOWInei1m10g5YuH6Zev5pyr5pel5a...,"[8, 107]"
5465,5466,CltUSFVQQyAyMDI0IOWInei1m10g5L2g6K+05b6X5a+577...,"[2, 390]"
5466,5467,CltVU0FDTzIzREVDXSBDYW5keSBDYW5lIEZlYXN0IEIK6a...,"[1, 60]"
5467,5468,CltVU0FDTzIzREVDXSBDb3dudGFjdCBUcmFjaW5nIDIgQg...,"[7, 45, 60]"


In [32]:
# Decode the base64-encoded problem statements into plain text.
df['text'] = df['text'].apply(base64_decode)

# Parse each JSON-encoded tag string into a Python list of tag ids.
# NOTE: this cell rewrites `df` in place, so it is meant to run once per load.
df['tags'] = df['tags'].apply(json.loads)

df

Unnamed: 0,id,text,tags
0,1,\n超级玛丽游戏\n题目背景\n本题是洛谷的试机题目，可以帮助了解洛谷的使用。\n\n建议完...,[2]
1,2,\nA+B Problem\n题目背景\n强烈推荐[新用户必读帖](/discuss/sho...,[1]
2,3,\n[NOIP2002 普及组] 过河卒\n题目描述\n棋盘上 $A$ 点有一个过河卒，需要...,"[3, 82]"
3,4,\n[NOIP2011 提高组] 铺地毯\n题目描述\n为了准备一个独特的颁奖典礼，组织者在...,"[1, 83, 111]"
4,5,\n[NOIP2000 提高组] 方格取数\n题目背景\nNOIP 2000 提高组 T4\...,"[3, 54, 204]"
...,...,...,...
5464,5465,\n[THUPC 2024 初赛] 勇闯末日塔\n题目背景\n安宁顷刻今将逝，末日黑云伺隙来...,"[8, 107]"
5465,5466,\n[THUPC 2024 初赛] 你说得对，但是 AIGC\n题目背景\n你说得对，但是*...,"[2, 390]"
5466,5467,\n[USACO23DEC] Candy Cane Feast B\n题目描述\nFarme...,"[1, 60]"
5467,5468,\n[USACO23DEC] Cowntact Tracing 2 B\n题目描述\nFar...,"[7, 45, 60]"


## 准备数据和标签

In [33]:
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer
import torch

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw (not yet tokenized) texts and their tag lists for
# testing; the fixed random_state keeps the split reproducible.
(train_texts, test_texts,
 train_labels, test_labels) = train_test_split(
    df['text'],
    df['tags'],
    random_state=42,
    test_size=0.2,
)


### 标签二值化处理

In [35]:
# Learn the tag vocabulary from the training split only, then encode both
# splits as multi-hot indicator matrices over that vocabulary.
mlb = MultiLabelBinarizer().fit(train_labels)
train_labels = mlb.transform(train_labels)
test_labels = mlb.transform(test_labels)



### 初始化 BERT tokenizer

In [36]:

# Load the WordPiece tokenizer that matches the checkpoint used by the model.
# NOTE(review): the problem statements are largely Chinese, while
# 'bert-base-uncased' is an English checkpoint. The sample batch printed further
# down contains many token id 100 entries, which is presumably this vocabulary's
# unknown-token id — TODO confirm, and consider a Chinese or multilingual
# checkpoint (the model cell must then load the same checkpoint).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')




In [37]:
# Set to a truthy value to ignore any cached encodings and tokenize again.
re_encoding = 0

# Cache locations for the tokenized train / test splits.
train_encodings_path = './data/train_encodings.pt'
test_encodings_path = './data/test_encodings.pt'

# Reuse the cached encodings only when both files exist and no refresh was
# requested; otherwise tokenize both splits and write the cache.
# NOTE: the cache is not keyed on the data or the split, so set re_encoding
# whenever the CSV, the split parameters or the tokenizer change.
if (not re_encoding) and os.path.exists(train_encodings_path) and os.path.exists(test_encodings_path):
    print("Loading saved encodings...")
    # The cache holds pickled tokenizer output objects, not bare tensors, so it
    # needs the full unpickler. PyTorch 2.6+ defaults torch.load to
    # weights_only=True, which rejects such objects, hence the explicit flag.
    # Only load caches written by this notebook: unpickling an untrusted file
    # can execute arbitrary code.
    train_encodings = torch.load(train_encodings_path, weights_only=False)
    test_encodings = torch.load(test_encodings_path, weights_only=False)
else:
    print("Tokenizing and saving encodings...")
    # Truncate to 512 tokens and pad each split to its longest sequence.
    train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
    test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
    torch.save(train_encodings, train_encodings_path)
    torch.save(test_encodings, test_encodings_path)


Tokenizing and saving encodings...


## 创建 Dataset 和 DataLoader

In [38]:
from torch.utils.data import Dataset, DataLoader

class AlgoDataset(Dataset):
    """Pairs tokenizer encodings with per-sample label vectors.

    Args:
        encodings: mapping from field name to an indexable batch of values;
            each field is indexed with the sample index.
        labels: indexable, sized collection holding one label vector per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus a float ``labels`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample


In [39]:

# Training split: wrap the encodings and labels, shuffling on every pass.
train_dataset = AlgoDataset(train_encodings, train_labels)
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=8,
)

# Test split: same batch size, original order preserved.
test_dataset = AlgoDataset(test_encodings, test_labels)
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=8,
)

# Show the first training batch as a sanity check, then stop iterating.
for batch in train_dataloader:
    print(batch)
    break


{'input_ids': tensor([[  101,  1031,  8040,  ...,   100,   100,   102],
        [  101,  1643, 24665,  ...,   100,   100,   102],
        [  101,   100,  1979,  ...,   100,  1000,   102],
        ...,
        [  101,  1031,  2053,  ...,  1636,   100,   102],
        [  101,  1031, 24582,  ...,   100,   100,   102],
        [  101,   100,   100,  ...,  1035,  1045,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
      

## 加载 BERT 模型并训练

In [40]:
from transformers import BertForSequenceClassification
# transformers' own AdamW is deprecated in favour of the PyTorch optimizer, so
# import the same name from torch.optim instead.
from torch.optim import AdamW
import torch

# Initialise BERT with one output unit per tag. problem_type is given explicitly
# so the multi-label objective does not depend on the label dtype being inferred.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=train_labels.shape[1],
    problem_type='multi_label_classification',
)

# Move the model to the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Define the optimizer. eps and weight_decay are set to the values the
# deprecated transformers AdamW used by default (torch defaults differ:
# eps=1e-8, weight_decay=1e-2), so the optimisation settings stay the same.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
from tqdm import tqdm

# Number of full passes over the training set.
num_epochs = 3

# Training loop: one optimizer step per batch, mean loss reported per epoch.
for epoch in range(num_epochs):
    # Put the model into training mode at the start of every epoch.
    model.train()
    total_train_loss = 0

    progress = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for batch in progress:
        # Move the tensors the model consumes onto the target device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass; the loss is read from the model output.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass followed by the parameter update.
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} finished, Average Training Loss: {avg_train_loss:.4f}")


Epoch 1/3: 100%|██████████| 547/547 [11:19<00:00,  1.24s/it]


Epoch 1 finished, Average Training Loss: 0.2453


Epoch 2/3:  95%|█████████▍| 519/547 [09:46<00:30,  1.11s/it]

In [None]:
# Save the model: only its state dict is written, to ./model.pth.
torch.save(model.state_dict(), './model.pth')

## 评估

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Switch the model to evaluation mode and reset the accumulators.
model.eval()
total_val_loss = 0
all_preds, all_labels = [], []

# Gradients are not needed while scoring the held-out split.
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Forward pass; the loss is read from the model output.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_val_loss += loss.item()

        # Sigmoid turns each logit into an independent per-tag probability.
        preds = torch.sigmoid(outputs.logits).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Mean loss over all evaluation batches.
avg_val_loss = total_val_loss / len(test_dataloader)
print(f"Average Validation Loss: {avg_val_loss:.4f}")

# Turn probabilities into 0/1 decisions: a tag is predicted when its
# probability is strictly above the threshold.
threshold = 0.5
binary_preds = [[int(prob > threshold) for prob in row] for row in all_preds]

# Micro-averaged F1 over all (sample, tag) pairs, and accuracy as computed by
# accuracy_score on the full per-sample label rows.
f1 = f1_score(all_labels, binary_preds, average='micro')
accuracy = accuracy_score(all_labels, binary_preds)
print(f"Validation F1 Score: {f1:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
