In [6]:
import torch
import warnings
warnings.filterwarnings('ignore')
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq,AutoTokenizer,AutoModel
model_dir ='./model/gpt2_chinese'
# Transformers加载模型权重
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir,  torch_dtype=torch.bfloat16)

In [7]:
import json
import pandas as pd
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from peft import LoraConfig, TaskType, get_peft_model
import os


def dataset_jsonl_transfer(origin_path, new_path):
    """
    将原始数据集转换为大模型微调所需数据格式的新数据集
    """
    messages = []

    # 读取旧的JSON文件
    with open(origin_path, "r", encoding="utf-8") as file:
        try:
            data = json.load(file)  # 直接加载整个JSON数组
            if isinstance(data, list): # 检查是否为数组
                for item in data:
                    context = item["prompt"]
                    label = item["completion"]
                    message = {
                        "instruction": context,
                        "input": "",
                        "output": label,
                    }
                    messages.append(message)
            else:
                print("错误: JSON 文件根对象不是一个数组.")
                return # 退出函数
        except json.JSONDecodeError as e:
            print(f"JSON 解析错误: {e}")
            return # 退出函数

    # 保存重构后的JSONL文件
    with open(new_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")

dataset_jsonl_transfer('single_turn_dataset_1.json','data.json')

In [8]:
def process_func(example):
    """
    将数据集进行预处理
    """
    MAX_LENGTH = 384
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(
        f"<|im_start|>system\n现在你是一个心理专家，我有一些心理问题，请你用专业的知识帮我解决。<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = (
        instruction["attention_mask"] + response["attention_mask"] + [1]
    )
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [9]:

def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print(response)


In [10]:
model.enable_input_require_grads()  # 开启梯度检查点时，要执行该方法
model1.enable_input_require_grads()

In [11]:
train_ds = Dataset.from_json('data.json')

Generating train split: 14041 examples [00:00, 166462.37 examples/s]


In [12]:
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

Map:   0%|          | 45/14041 [00:00<00:31, 438.25 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3791 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 14041/14041 [00:57<00:00, 244.26 examples/s]


In [13]:
model 

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21129, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=21129, bias=False)
)

In [14]:
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_proj"],
    inference_mode=False,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=32,  # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.5,  # Dropout 比例
    )

In [19]:
model = get_peft_model(model, config)

In [16]:
args = TrainingArguments(
    output_dir="output/gpt2",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=1,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",)


In [24]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),

)

In [25]:
trainer.train()

TypeError: GPT2Model.forward() got an unexpected keyword argument 'labels'

### LORA

In [None]:
from peft import PeftModel
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained('./model/gpt2_chinese',  torch_dtype=torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained('./model/gpt2_chinese', use_fast=False, trust_remote_code=True)

model = PeftModel.from_pretrained(model, model_id="output/gpt2/checkpoint-300")
def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=False)
tokenizer.chat_template = """
{% for message in messages %}
{% if message['role'] == 'system' %}
{{ 'System: ' + message['content'] + '\\n' }}
{% elif message['role'] == 'user' %}
{{ 'User: ' + message['content'] + '\\n' }}
{% elif message['role'] == 'assistant' %}
{{ 'Assistant: '  + message['content'] + '\\n' }}
{% endif %}
{% endfor %}
"""

prompt = "你是谁？"
message = [{"role": "system", "content": "现在你是一名心理医生"},
    {"role": "user", "content": prompt}]

response = predict(message, model, tokenizer)
print(response)
                    

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:21128 for open-end generation.


: 现 在 你 是 一 名 心 理 医 生 : 你 是 谁 ？???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????


In [44]:
input_text = '''

 -现在你是一个心理专家，我有一些心理问题，请你用专业的知识帮我解决
 User:一想到事情就紧张……?\n我最近因为考试，觉得心情很不好，自己没有及格，而且，明明觉得自己比室友付出的多，努力的多，平时做题也是一样的，可是不知道为什么只有我不及格，感觉很苦恼。而且我只要一有什么事情就十分紧张，考试也是，心跳加速，手心出汗，而且大脑一片空白，有时甚至会头疼，高考以及现在的考试都是这样。而且我觉得一情绪激动的时候就会咳嗽，想哭哭不出来，感觉很难受，头也会疼。就是觉得心情不好，但是有不敢表现出来。我该怎么办呢？
 '''
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
 
# 生成
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,  # 可调整生成长度
        do_sample=True,
        top_p=0.95,
        temperature=0.9,
        num_return_sequences=1
    )
 
# 解码生成的文本
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)


print("生成的文本:", generated_text.split('[SEP]')[1])

Setting `pad_token_id` to `eos_token_id`:21128 for open-end generation.


生成的文本:  [UNK] [UNK]? 我 有 一 点 点 不 忍 慢 慢 的 ， 一 想 到 很 难 受 ， 心 跳 加 速 ， 手 心 出 汗 ， 而 且 大 脑 一 片 空 白 ， 有 时 甚 至 会 头 疼 。 那 我


P-turning

In [1]:
def use_ptuning(model):
    from peft import PromptEncoderConfig, TaskType, get_peft_model, PromptEncoderReparameterizationType
    # 默认 encoder_reparameterization_type=<PromptEncoderReparameterizationType.MLP: 'MLP'>
    config = PromptEncoderConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=10,
                                 encoder_reparameterization_type=PromptEncoderReparameterizationType.LSTM,
                                 encoder_dropout=0., encoder_num_layers=1, encoder_hidden_size=1024)
    model = get_peft_model(model, config)
    return model

Prefix-Tuning

In [2]:
def use_prefix_tuning(model):
    from peft import PrefixTuningConfig, get_peft_model, TaskType
    config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM,
                                num_virtual_tokens=10,
                                prefix_projection=True)  # 多加了两层全连接层
    model = get_peft_model(model, config)
    print(model.prompt_encoder)
    print(model.print_trainable_parameters())
    return model

LORA

In [3]:
def use_lora(model):
    from peft import LoraConfig, TaskType, get_peft_model
    config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                        target_modules=["query_key_value"],  # target_modules指定哪些位置使用lora的方法
                        modules_to_save=[
                            "word_embeddings"])  # modules_to_save除了lora的位置需要训练，模型还有哪些地方也指定可以训练，如果不指定该参数，默认模型不训练
    model = get_peft_model(model, config)
    return model

IA3

In [4]:
def use_IA3(model):
    from peft import IA3Config, TaskType, get_peft_model
    config = IA3Config(task_type=TaskType.CAUSAL_LM)
    model = get_peft_model(model, config)  # 根据论文，使用该方法，将学习率调整到3e-3这时候收敛效果比较好，这个学习率可以在配置参数里面进行调整
    return model