In [1]:
import pandas as pd
import base64
import json
import os

from utils import base64_decode

## 加载并初步处理 CSV 数据

In [2]:
header = ['id', 'text', 'tags']

df = pd.read_csv('./data/algo_problems.csv', header=None, names=header)
df

Unnamed: 0,id,text,tags
0,1,Cui2hee6p+eOm+S4vea4uOaIjwrpopjnm67og4zmma8K5p...,[2]
1,2,CkErQiBQcm9ibGVtCumimOebruiDjOaZrwrlvLrng4jmjq...,[1]
2,3,CltOT0lQMjAwMiDmma7lj4rnu4RdIOi/h+ays+WNkgrpop...,"[3, 82]"
3,4,CltOT0lQMjAxMSDmj5Dpq5jnu4RdIOmTuuWcsOavrwrpop...,"[1, 83, 111]"
4,5,CltOT0lQMjAwMCDmj5Dpq5jnu4RdIOaWueagvOWPluaVsA...,"[3, 54, 204]"
...,...,...,...
5464,5465,CltUSFVQQyAyMDI0IOWInei1m10g5YuH6Zev5pyr5pel5a...,"[8, 107]"
5465,5466,CltUSFVQQyAyMDI0IOWInei1m10g5L2g6K+05b6X5a+577...,"[2, 390]"
5466,5467,CltVU0FDTzIzREVDXSBDYW5keSBDYW5lIEZlYXN0IEIK6a...,"[1, 60]"
5467,5468,CltVU0FDTzIzREVDXSBDb3dudGFjdCBUcmFjaW5nIDIgQg...,"[7, 45, 60]"


In [3]:
# base64转换
df['text'] = df['text'].apply(base64_decode)

# 标签转换

df['tags'] = df['tags'].apply(lambda x: json.loads(x))

df

Unnamed: 0,id,text,tags
0,1,\n超级玛丽游戏\n题目背景\n本题是洛谷的试机题目，可以帮助了解洛谷的使用。\n\n建议完...,[2]
1,2,\nA+B Problem\n题目背景\n强烈推荐[新用户必读帖](/discuss/sho...,[1]
2,3,\n[NOIP2002 普及组] 过河卒\n题目描述\n棋盘上 $A$ 点有一个过河卒，需要...,"[3, 82]"
3,4,\n[NOIP2011 提高组] 铺地毯\n题目描述\n为了准备一个独特的颁奖典礼，组织者在...,"[1, 83, 111]"
4,5,\n[NOIP2000 提高组] 方格取数\n题目背景\nNOIP 2000 提高组 T4\...,"[3, 54, 204]"
...,...,...,...
5464,5465,\n[THUPC 2024 初赛] 勇闯末日塔\n题目背景\n安宁顷刻今将逝，末日黑云伺隙来...,"[8, 107]"
5465,5466,\n[THUPC 2024 初赛] 你说得对，但是 AIGC\n题目背景\n你说得对，但是*...,"[2, 390]"
5466,5467,\n[USACO23DEC] Candy Cane Feast B\n题目描述\nFarme...,"[1, 60]"
5467,5468,\n[USACO23DEC] Cowntact Tracing 2 B\n题目描述\nFar...,"[7, 45, 60]"


## 准备数据和标签

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "d:\ProgramData\anaconda3\envs\cpc\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "d:\ProgramData\anaconda3\envs\cpc\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "d:\ProgramData\anaconda3\envs\cpc\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\ProgramData\anaconda3\envs\cpc\lib\site-packages\traitlets\config\application.py", line 

In [5]:
# 初始化 BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 将标签转换为多标签二值矩阵
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['tags'])

# 检查文件是否存在
if os.path.exists('./data/encodings.pt'):
    # 如果存在，则读取文件
    print("Loading saved encodings...")
    encodings = torch.load('./data/encodings.pt')
else:
    # 如果不存在，则进行分词并保存
    print("Tokenizing and saving encodings...")
    encodings = tokenizer(df['text'].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
    torch.save(encodings, './data/encodings.pt')

# 打印 BERT 编码后的样本
print("Sample input_ids:", encodings['input_ids'][0])
print("Sample labels:", labels[0])




Loading saved encodings...
Sample input_ids: tensor([  101,   100,   100,   100,   100,   100,   100,   100,  1918,   100,
          100,  1876,   100,   100,   100,  1951,  1916,   100,   100,   100,
         1918,  1989,   100,   100,   100,   100,   100,   100,   100,  1951,
         1916,   100,   100,  1636,   100,   100,   100,  1854,  1876,   100,
         1918,   100,   100,   100,   100,   100,  1031,  1052, 18613,  2487,
         1033,  1006,  1013,  3291,  1013,  1052, 18613,  2487,  1007,  1635,
         1031,  1052, 18613,  2620,  1033,  1006,  1013,  3291,  1013,  1052,
        18613,  2620,  1007,  1636,   100,  1809,   100,   100,   100,   100,
         1031,  1862,   100,   100,   100,   100,   100,  1033,  1006,  1013,
         6848,  1013,  2265,  1013, 22343, 21472,  2487,  1007,   100,  1918,
          100,   100,   100,   100,   100,   100,   100,  1740,   100,   100,
          100,   100,   100,  1916,   100,   100,  1636,   100,   100,   100,
          100,   10

  encodings = torch.load('./data/encodings.pt')


## 创建 Dataset 和 DataLoader

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

class AlgoDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# 创建 Dataset 和 DataLoader
dataset = AlgoDataset(encodings, labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# 打印 DataLoader 中的一个批次样本
for batch in dataloader:
    print(batch)
    break


{'input_ids': tensor([[ 101, 1031, 1920,  ..., 7393, 1006,  102],
        [ 101,  100,  100,  ...,    0,    0,    0],
        [ 101, 1031, 1799,  ..., 1794, 1916,  102],
        ...,
        [ 101,  100, 1829,  ...,    0,    0,    0],
        [ 101, 1643, 3781,  ...,  100,  100,  102],
        [ 101,  100,  100,  ...,  100, 1775,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
  

## 加载 BERT 模型并训练

In [7]:
from transformers import BertForSequenceClassification, AdamW
from tqdm import tqdm

# 初始化BERT模型，指定多标签分类的输出单元数量
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=labels.shape[1])

# 使用AdamW优化器
optimizer = AdamW(model.parameters(), lr=1e-5)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
epochs_count = 3 



# 训练模型
for epoch in range(epochs_count):  # 为epochs添加进度条
    total_loss = 0  # 用于累积每个epoch的损失
    for batch in tqdm(dataloader, desc="Batches", leave=True):  # 为dataloader添加进度条
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # 前向传播
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(dataloader)  # 计算每个epoch的平均损失
    print(f"Epoch {epoch + 1}/{epochs_count} finished, Average Loss: {avg_loss:.4f}")

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]