# 导包

In [1]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

# 加载数据

In [2]:
dataset = load_dataset("csv", data_files="./问答.csv", split="train")
dataset = dataset.filter(lambda x: x["answer"] is not None)
dataset

Dataset({
    features: ['id', 'question', 'answer'],
    num_rows: 27320
})

In [3]:
datasets = dataset.train_test_split(test_size=0.1)
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer'],
        num_rows: 24588
    })
    test: Dataset({
        features: ['id', 'question', 'answer'],
        num_rows: 2732
    })
})

In [4]:
datasets['train'][:2]

{'id': [19989, 18114],
 'question': ['患反流性食管炎三年没好寻求治疗  ', '突发性耳聋耳鸣的诊断方法  '],
 'answer': ['需要进行腹腔镜胃底折叠手术，贲门狭窄的部分需要通过球囊进行扩张。 二炮总医院-胃食管反流病中心-吴继敏副主任医师  查看更多答案&gt;&gt;',
  '你好，如果是突发性耳聋一定要第一时间去医院治疗，越早越好。']}

# 数据集预处理

In [5]:
tokenizer = AutoTokenizer.from_pretrained("./glm-4-9b-chat", trust_remote_code=True)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ChatGLM4Tokenizer(name_or_path='./glm-4-9b-chat', vocab_size=151329, model_max_length=128000, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '[MASK]', '[gMASK]', '[sMASK]', '<sop>', '<eop>', '<|system|>', '<|user|>', '<|assistant|>', '<|observation|>', '<|begin_of_image|>', '<|end_of_image|>', '<|begin_of_video|>', '<|end_of_video|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151329: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151330: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151331: AddedToken("[gMASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151332: AddedToken("[sMASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151333: Added

In [6]:
def process_func(example):
    MAX_LENGTH = 256
    input_ids, attention_mask, labels = [], [], []
    instruction = example["question"].strip()     # query
    instruction = tokenizer.apply_chat_template([{"role": "user", "content": instruction}],
                                       add_generation_prompt=True,
                                       tokenize=True,
                                       return_tensors="pt",
                                       return_dict=True
                                       )      # '[gMASK] <sop> <|user|> \nquery <|assistant|>'
    
    response = tokenizer("\n" + example["answer"], add_special_tokens=False)        # \n response, 缺少eos token
    input_ids = instruction["input_ids"][0].numpy().tolist() + response["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = instruction["attention_mask"][0].numpy().tolist() + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"][0].numpy().tolist()) + response["input_ids"] + [tokenizer.eos_token_id]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [7]:
tokenized_ds = datasets['train'].map(process_func, remove_columns=['id', 'question', 'answer'])
tokenized_ts = datasets['test'].map(process_func, remove_columns=['id', 'question', 'answer'])
tokenized_ds

Map:   0%|          | 0/24588 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 24588
})

In [8]:
tokenizer.decode(tokenized_ds[1]["input_ids"])

'[gMASK] <sop> <|user|> \n突发性耳聋耳鸣的诊断方法 <|assistant|> \n你好，如果是突发性耳聋一定要第一时间去医院治疗，越早越好。 <|endoftext|>'

In [9]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_ds[1]["labels"])))

'\n你好，如果是突发性耳聋一定要第一时间去医院治疗，越早越好。 <|endoftext|>'

# 创建模型

In [10]:
import torch
model = AutoModelForCausalLM.from_pretrained("./glm-4-9b-chat", trust_remote_code=True, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
for name, param in model.named_parameters():
    print(name)

transformer.embedding.word_embeddings.weight
transformer.encoder.layers.0.input_layernorm.weight
transformer.encoder.layers.0.self_attention.query_key_value.weight
transformer.encoder.layers.0.self_attention.query_key_value.bias
transformer.encoder.layers.0.self_attention.dense.weight
transformer.encoder.layers.0.post_attention_layernorm.weight
transformer.encoder.layers.0.mlp.dense_h_to_4h.weight
transformer.encoder.layers.0.mlp.dense_4h_to_h.weight
transformer.encoder.layers.1.input_layernorm.weight
transformer.encoder.layers.1.self_attention.query_key_value.weight
transformer.encoder.layers.1.self_attention.query_key_value.bias
transformer.encoder.layers.1.self_attention.dense.weight
transformer.encoder.layers.1.post_attention_layernorm.weight
transformer.encoder.layers.1.mlp.dense_h_to_4h.weight
transformer.encoder.layers.1.mlp.dense_4h_to_h.weight
transformer.encoder.layers.2.input_layernorm.weight
transformer.encoder.layers.2.self_attention.query_key_value.weight
transformer.enco

# Lora 

# step1 配置文件

In [12]:
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

config = LoraConfig(target_modules=["query_key_value"], modules_to_save=["post_attention_layernorm"])
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False, r=8, target_modules={'query_key_value'}, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=['post_attention_layernorm'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

# step2 创建模型

In [13]:
model = get_peft_model(model, config)

In [14]:
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='./glm-4-9b-chat', revision=None, task_type=None, inference_mode=False, r=8, target_modules={'query_key_value'}, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=['post_attention_layernorm'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [15]:
for name, parameter in model.named_parameters():
    print(name)

base_model.model.transformer.embedding.word_embeddings.weight
base_model.model.transformer.encoder.layers.0.input_layernorm.weight
base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.base_layer.weight
base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.base_layer.bias
base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.lora_A.default.weight
base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.lora_B.default.weight
base_model.model.transformer.encoder.layers.0.self_attention.dense.weight
base_model.model.transformer.encoder.layers.0.post_attention_layernorm.original_module.weight
base_model.model.transformer.encoder.layers.0.post_attention_layernorm.modules_to_save.default.weight
base_model.model.transformer.encoder.layers.0.mlp.dense_h_to_4h.weight
base_model.model.transformer.encoder.layers.0.mlp.dense_4h_to_h.weight
base_model.model.transformer.encoder.layers.1.input_layernorm.weight
ba

In [16]:
model.print_trainable_parameters()

trainable params: 2,949,120 || all params: 9,402,900,480 || trainable%: 0.0314


In [17]:
model

PeftModel(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (embedding): Embedding(
          (word_embeddings): Embedding(151552, 4096)
        )
        (rotary_pos_emb): RotaryEmbedding()
        (encoder): GLMTransformer(
          (layers): ModuleList(
            (0-39): 40 x GLMBlock(
              (input_layernorm): RMSNorm()
              (self_attention): SelfAttention(
                (query_key_value): lora.Linear(
                  (base_layer): Linear(in_features=4096, out_features=4608, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4608, bias=False)
                  )
                  (

# 配置训练参数

In [18]:
args = TrainingArguments(
    output_dir="./chatbot",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    logging_steps=100,
    num_train_epochs=10,
    learning_rate=1e-4,
    remove_unused_columns=False,
    save_strategy="epoch"
)

# 创建训练器

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds.select(range(10000)),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


# 模型训练

In [None]:
trainer.train()