In [6]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB_DISABLED"] = "true"
from pathlib import Path
import pickle
import random
import uuid
import datetime
import json

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer, DebertaV2Tokenizer

class cfg:
    """Experiment configuration for the MLM pretraining run, stored as plain class attributes."""

    # --- bookkeeping ---
    exp_id = "1001"  # experiment identifier
    seed = 42  # random seed

    # --- inputs ---
    data_path = "./datasets/patient_notes.csv"  # CSV file holding the patient notes
    # Local checkpoint directory. Hub ids the author noted as downloadable alternatives:
    # microsoft/deberta-base, microsoft/deberta-large, microsoft/deberta-v3-large.
    pretrained_checkpoint = "./deberta-base"

    # --- optimisation ---
    lr = 1e-5  # learning rate
    batch_size = 10  # per-device batch size
    epochs = 10  # number of training epochs

    # --- checkpointing / masking ---
    save_total_limit = 2  # maximum number of checkpoints to keep
    mlm_prob = 0.2  # probability of masking a token for the MLM objective

def seed_everything(seed=42):
    """Seed every random number generator used here so experiments can be reproduced.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these seeders takes the seed as its single argument.
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

# Seed all random number generators up front with the seed from the experiment config.
seed_everything(cfg.seed)

In [7]:
# Load the patient-notes table from the path set in the experiment config.
df = pd.read_csv(cfg.data_path)

# Load the tokenizer that belongs to the pretrained checkpoint.
# NOTE(review): trim_offsets=False presumably keeps leading whitespace inside the
# character offsets the tokenizer reports — confirm against the tokenizer docs.
tokenizer = AutoTokenizer.from_pretrained(
    cfg.pretrained_checkpoint,
    trim_offsets=False,
)


In [8]:
# Sanity check: tokenize a short sample sentence and display the resulting encoding.
tokenizer("i love you")

{'input_ids': [1, 118, 657, 47, 2], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [9]:
# Preview the first rows of the notes table.
df.head()

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [10]:
class LineByLineTextDataset(Dataset):
    """Dataset that tokenizes a list of texts once and serves one example per text.

    Each example is a dict with a single key, ``"input_ids"``, holding a
    ``torch.long`` tensor of token ids.

    Args:
        tokenizer: Callable invoked as ``tokenizer(lines, add_special_tokens=True,
            truncation=True, max_length=block_size)``; its result must expose
            ``"input_ids"`` as a sequence of token-id lists.
        lines: The texts to encode.
        block_size: Maximum length passed to the tokenizer for truncation.
    """

    def __init__(self, tokenizer, lines, block_size):
        # Encode everything in one call; special tokens are added and truncation is on.
        encoded = tokenizer(
            lines,
            add_special_tokens=True,
            truncation=True,
            max_length=block_size,
        )
        # Keep only the token ids, one tensor per text.
        self.examples = [
            {"input_ids": torch.tensor(token_ids, dtype=torch.long)}
            for token_ids in encoded["input_ids"]
        ]

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the example dict stored at index ``i``."""
        return self.examples[i]

# Build the training dataset from the note texts, truncating each note to 512 tokens.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    lines=df['pn_history'].tolist(),
    block_size=512,
)

In [11]:
# Hyperparameters for the masked-language-model (MLM) pretraining run.
args = TrainingArguments(
    # Fixed: this was a plain string, so checkpoints were written to a directory literally
    # named "{cfg.exp_id}". The f-string interpolates the experiment id as intended.
    output_dir=f"./output/{cfg.exp_id}",
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    learning_rate=cfg.lr,
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=cfg.batch_size,
    num_train_epochs=cfg.epochs,
    warmup_ratio=0.2,  # fraction of total training steps used for learning-rate warmup
    fp16=True,  # mixed-precision training
    dataloader_num_workers=0,  # 0 = load data in the main process (no worker subprocesses)
    group_by_length=True,  # batch samples of similar length together to reduce padding
    run_name=cfg.exp_id,  # run name is the experiment id
    # A non-positive limit in cfg means "keep every checkpoint" (None).
    save_total_limit=cfg.save_total_limit if cfg.save_total_limit > 0 else None,
    seed=cfg.seed,
)

# Load the pretrained checkpoint with a masked-language-modeling head.
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_checkpoint)

trainer = Trainer(
    model,
    args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    # The collator pads each batch and masks tokens with probability cfg.mlm_prob.
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=cfg.mlm_prob),
)

# NOTE(review): the run recorded in this notebook aborted with "CUDA out of memory" on a
# 12 GiB GPU. Lowering cfg.batch_size or the dataset block size is probably needed
# before this cell can finish — confirm on the target hardware.
trainer.train()  # start MLM pretraining

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of the model checkpoint at ./deberta-base were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassi

Step,Training Loss


RuntimeError: CUDA out of memory. Tried to allocate 384.00 MiB (GPU 0; 12.00 GiB total capacity; 10.19 GiB already allocated; 0 bytes free; 10.45 GiB reserved in total by PyTorch)