# Fine-tune Baichuan2 with DeepSpeed on SageMaker
syshen@amazon.com

repo_id="baichuan-inc/Baichuan2-7B-Base"

repo_id="baichuan-inc/Baichuan2-13B-Base"

时间:20~40分钟

1.环境设置

2.调优模型

3.推理测试

---------


In [None]:
# Identifier of the base model to fine-tune; it is forwarded to the training job
# as the `model_name_or_path` hyperparameter.
repo_id="baichuan-inc/Baichuan2-7B-Base"

## 1.环境设置

In [None]:
!pip install -qU transformers datasets[s3] sagemaker 


In [None]:
# SageMaker environment setup: resolve the execution role, session, default
# bucket, region and account id used by the following cells.
import sagemaker
import boto3
import os
import shutil
import sagemaker.huggingface
from sagemaker.huggingface import HuggingFace

role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name
# `account_id` is a method on the session object; call it so that the variable
# holds the account id itself rather than a bound method.
account_id = sagemaker_session.account_id()

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")

In [None]:
# Prepare a clean local directory for the files that are passed to the
# estimator as `source_dir`.
source_dir = 'source_dir'
entry_point = 'ds_launcher.py'

# Remove leftovers from a previous run, then recreate the directory.
# os.makedirs is used instead of a `!mkdir` shell magic so the cell is plain Python.
if os.path.exists(source_dir):
    shutil.rmtree(source_dir)
os.makedirs(source_dir)

In [None]:
%%writefile $source_dir/requirements.txt
numpy
transformers==4.28.1
sentencepiece
tokenizers
accelerate>=0.23.0
deepspeed>=0.8.3
xformers
scipy
bitsandbytes
peft

In [None]:
%%writefile $source_dir/$entry_point
import sys
import os
import subprocess
import json
import sys
import logging
from argparse import ArgumentParser

logger = logging.getLogger(__name__)


def parse_args():
    """Split the command line into the training script and its own arguments.

    Only ``--training_script`` is consumed here; every other token is left
    untouched so that it can be forwarded to the training script.

    Returns:
        A ``(training_script, remaining_args)`` tuple. ``training_script`` is
        ``None`` when the option was not given; ``remaining_args`` is the list
        of command-line tokens this parser did not recognise.
    """
    launcher_parser = ArgumentParser(
        description="SageMaker DeepSpeed Launch helper utility that will spawn deepspeed training scripts"
    )
    launcher_parser.add_argument(
        "--training_script",
        type=str,
        help="Path to the training program/script to be run in parallel, can be either absolute or relative",
    )

    # parse_known_args returns the unrecognised options instead of rejecting them.
    known, passthrough = launcher_parser.parse_known_args()
    return known.training_script, passthrough


def main():
    """Build the ``deepspeed`` command line for this host and run it.

    Reads ``SM_NUM_GPUS``, ``SM_HOSTS`` and ``SM_CURRENT_HOST`` from the
    environment, combines them with the training script and pass-through
    arguments returned by :func:`parse_args`, prints the resulting command and
    hands it to :func:`deepspeed_launch`.

    Raises:
        ValueError: If ``--training_script`` was not supplied.
    """
    import shlex

    # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/launcher/launch.py
    num_gpus = int(os.environ.get("SM_NUM_GPUS", 0))
    # Fall back to an empty JSON list: `hosts.index` below needs a list, so the
    # previous "{}" default (a dict) could never work.
    hosts = json.loads(os.environ.get("SM_HOSTS", "[]"))
    num_nodes = len(hosts)
    current_host = os.environ.get("SM_CURRENT_HOST")
    # rank is informational only (it is printed, not passed to deepspeed);
    # default to 0 when the current host is unknown instead of raising.
    rank = hosts.index(current_host) if current_host in hosts else 0
    print(f"num_gpus = {num_gpus}, num_nodes = {num_nodes}, current_host = {current_host}, rank = {rank}")

    # Set os.environ['NCCL_DEBUG'] = 'INFO' here when debugging communication issues.

    train_script, args = parse_args()
    if train_script is None:
        raise ValueError("--training_script is required")

    # Quote every token: the command is executed through a shell, so values
    # containing spaces or shell metacharacters must not be split or expanded.
    command = f"deepspeed --num_gpus={num_gpus} {shlex.join([train_script, *args])}"
    print(f"command = {command}")
    # launch deepspeed training
    deepspeed_launch(command)


def deepspeed_launch(command):
    """Run ``command`` through the shell and propagate a failing exit status.

    Args:
        command: Shell command line to execute.

    Raises:
        SystemExit: With the child's return code when the command finishes with
            a non-zero status, so that a failed run is not reported as success
            by this process.
        Exception: Any error raised while starting the command is logged and
            re-raised.
    """
    log = logging.getLogger(__name__)
    try:
        completed = subprocess.run(command, shell=True)
    except Exception:
        log.exception("failed to run command: %s", command)
        raise

    if completed.returncode != 0:
        log.error("command exited with status %s: %s", completed.returncode, command)
        # Mirror the child's exit status in this process.
        sys.exit(completed.returncode)

if __name__ == "__main__":
    main()


In [None]:
%%writefile $source_dir/fine-tune.py
import os
import math
import pathlib
from typing import Optional, Dict
from dataclasses import dataclass, field
import json

import torch
from torch.utils.data import Dataset
import transformers
from transformers.training_args import TrainingArguments


@dataclass
class ModelArguments:
    """Command-line arguments selecting the model to fine-tune."""

    # Passed to `from_pretrained` for both the model and the tokenizer in train().
    model_name_or_path: Optional[str] = field(default="baichuan-inc/Baichuan2-7B-Base")


@dataclass
class DataArguments:
    """Command-line arguments describing the training data."""

    # Path handed to SupervisedDataset in train(), which reads it with json.load.
    # NOTE(review): annotated as `str` but defaults to None; SupervisedDataset
    # will fail to open it if the argument is not supplied.
    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """`transformers.TrainingArguments` extended with script-specific options.

    NOTE(review): this class shadows the `TrainingArguments` name imported from
    `transformers.training_args` at the top of the script.
    """

    # Forwarded as `cache_dir` to both `from_pretrained` calls in train().
    cache_dir: Optional[str] = field(default=None)
    # Presumably overrides the default of the base class's `optim` field — TODO confirm.
    optim: str = field(default="adamw_torch")
    # Used as the tokenizer's `model_max_length` and as the fixed length every
    # example is truncated/padded to in SupervisedDataset.
    model_max_length: int = field(
        default=512,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # When true, train() wraps the model with a PEFT LoRA adapter before training.
    use_lora: bool = field(default=False)


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning on multi-turn conversations.

    The data file is a JSON list of examples. Each example has a
    ``"conversations"`` list whose items carry a ``"from"`` field and a
    ``"value"`` field. Items whose ``"from"`` is ``"human"`` are treated as user
    turns; every other item is treated as an assistant turn.

    Each example is converted to fixed-length ``input_ids``, ``labels`` and
    ``attention_mask`` tensors of length ``model_max_length``.
    """

    def __init__(
        self,
        data_path,
        tokenizer,
        model_max_length,
        user_tokens=None,
        assistant_tokens=None,
    ):
        """Load the JSON data and print a decoded preview of the first example.

        Args:
            data_path: Path to the JSON file with the conversations.
            tokenizer: Object providing ``encode``, ``decode``, ``eos_token_id``
                and ``pad_token_id``.
            model_max_length: Length every example is truncated or padded to.
            user_tokens: Token ids placed before each user turn. Defaults to
                ``[195]``.
            assistant_tokens: Token ids placed before each assistant turn.
                Defaults to ``[196]``.
        """
        super().__init__()
        # Close the file deterministically and read it as UTF-8 regardless of
        # the locale of the machine running the training.
        with open(data_path, encoding="utf-8") as data_file:
            self.data = json.load(data_file)
        self.tokenizer = tokenizer
        self.model_max_length = model_max_length
        # None sentinels avoid sharing one mutable default list between instances.
        self.user_tokens = [195] if user_tokens is None else list(user_tokens)
        self.assistant_tokens = (
            [196] if assistant_tokens is None else list(assistant_tokens)
        )
        self.ignore_index = -100
        # The preview is a debugging aid only; skip it for an empty dataset
        # instead of failing with an IndexError.
        if self.data:
            self._print_preview(self.data[0])

    def _print_preview(self, example):
        """Print the decoded inputs and supervised labels of one example."""
        item = self.preprocessing(example)
        print("input:", self.tokenizer.decode(item["input_ids"]))
        supervised_ids = [
            id_ for id_ in item["labels"].tolist() if id_ != self.ignore_index
        ]
        print("label:", self.tokenizer.decode(supervised_ids))

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def preprocessing(self, example):
        """Tokenize one example into fixed-length training tensors.

        Args:
            example: Mapping with a ``"conversations"`` list of messages.

        Returns:
            Dict with ``input_ids`` and ``labels`` (``LongTensor`` of length
            ``model_max_length``) and ``attention_mask`` (true where
            ``input_ids`` differs from the pad token id).
        """
        input_ids = []
        labels = []

        for message in example["conversations"]:
            from_ = message["from"]
            value_ids = self.tokenizer.encode(message["value"])

            if from_ == "human":
                # User turn: the marker position is labelled with EOS and the
                # user's own text is excluded from the loss.
                input_ids += self.user_tokens + value_ids
                labels += [self.tokenizer.eos_token_id] + [self.ignore_index] * len(
                    value_ids
                )
            else:
                # Assistant turn: only the reply text is supervised.
                input_ids += self.assistant_tokens + value_ids
                labels += [self.ignore_index] + value_ids

        input_ids.append(self.tokenizer.eos_token_id)
        labels.append(self.tokenizer.eos_token_id)

        # Truncate, then right-pad both sequences to exactly model_max_length.
        input_ids = input_ids[: self.model_max_length]
        labels = labels[: self.model_max_length]
        input_ids += [self.tokenizer.pad_token_id] * (
            self.model_max_length - len(input_ids)
        )
        labels += [self.ignore_index] * (self.model_max_length - len(labels))

        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        """Return the preprocessed tensors for the example at ``idx``."""
        return self.preprocessing(self.data[idx])


def train():
    """Parse the command line, load model and tokenizer, and run fine-tuning.

    The arguments are parsed into ModelArguments, DataArguments and
    TrainingArguments. When ``use_lora`` is set the model is wrapped with a
    PEFT LoRA adapter targeting the ``W_pack`` modules. After training, the
    trainer state and the model are saved to ``output_dir``.
    """
    arg_parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    model_args, data_args, training_args = arg_parser.parse_args_into_dataclasses()

    checkpoint = model_args.model_name_or_path
    max_length = training_args.model_max_length

    model = transformers.AutoModelForCausalLM.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        cache_dir=training_args.cache_dir,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        checkpoint,
        use_fast=False,
        trust_remote_code=True,
        model_max_length=max_length,
        cache_dir=training_args.cache_dir,
    )

    if training_args.use_lora:
        # Imported lazily so that peft is only needed for LoRA runs.
        from peft import LoraConfig, TaskType, get_peft_model

        lora_settings = {
            "task_type": TaskType.CAUSAL_LM,
            "target_modules": ["W_pack"],
            "inference_mode": False,
            "r": 1,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
        }
        lora_config = LoraConfig(**lora_settings)
        model.enable_input_require_grads()
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    train_dataset = SupervisedDataset(data_args.data_path, tokenizer, max_length)
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()

In [None]:
%%writefile $source_dir/ds_config.json
{
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu" :"auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": 1.0,
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "stage3_gather_16bit_weights_on_model_save": true
    },
    "flops_profiler": {
        "enabled": false,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 1,
        "detailed": true,
        "output_file": null
    }
}

In [None]:
# %%writefile $source_dir/hostfile
# algo-1 slots=4
# algo-2 slots=4

In [None]:
# Upload the local training file to the SageMaker bucket and define the data
# locations used by the training job.
!aws s3 cp data/belle_chat_ramdon_10k.json s3://$bucket/datasets-baichuan2/belle_chat_ramdon_10k.json

# S3 prefix that is later passed to the job as the "train" input channel.
data_path_s3 = f"s3://{bucket}/datasets-baichuan2/"
print(f"data_path_s3 : {data_path_s3}")
# Path given to the training script as `data_path`; presumably where the
# "train" channel is made available inside the training container — TODO confirm.
data_path = "/opt/ml/input/data/train/belle_chat_ramdon_10k.json"

In [None]:
# Hyperparameters passed to the training job. `training_script` is consumed by
# the launcher entry point; the remaining keys are forwarded to that script.
deepspeed_parameters = {
    # --- launcher / DeepSpeed ---
    "deepspeed": "ds_config.json",  # DeepSpeed config file
    "training_script": "fine-tune.py",  # real training script, not the entry point
    "report_to": "none",
    # --- data and model ---
    "data_path": data_path,
    "model_name_or_path": repo_id,
    "output_dir": "/opt/ml/model",
    "model_max_length": 512,
    # --- schedule ---
    "num_train_epochs": 1,
    "per_device_train_batch_size": 16,
    "gradient_accumulation_steps": 1,
    "save_strategy": "epoch",
    # --- optimizer ---
    "learning_rate": 2e-5,
    "lr_scheduler_type": "constant",
    "adam_beta1": 0.9,
    "adam_beta2": 0.98,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "weight_decay": 1e-4,
    "warmup_ratio": 0.0,
    # --- logging, memory and precision ---
    "logging_steps": 1,
    "gradient_checkpointing": True,
    "bf16": True,
    "tf32": True,
    # --- parameter-efficient fine-tuning ---
    "use_lora": True,
}

## 2.调优模型

In [None]:
import time

# Timestamped base name for the training job.
launch_stamp = time.strftime("%Y%m%d-%H%M%S", time.localtime())
job_name = f'baichuan2-hf-deepspeed-{launch_stamp}'

# Create the estimator that describes the training job.
huggingface_estimator = HuggingFace(
    entry_point=entry_point,  # DeepSpeed launcher script
    source_dir=source_dir,  # directory with all the files needed for training
    instance_type='ml.p4d.24xlarge',  # instance type used for the training job
    instance_count=1,  # number of instances used for training
    base_job_name=job_name,  # name of the training job
    role=role,  # IAM role used by the training job to access AWS resources, e.g. S3
    volume_size=300,  # size of the EBS volume in GB
    transformers_version='4.28.1',  # transformers version used in the training job
    pytorch_version='2.0.0',  # PyTorch version used in the training job
    py_version='py310',  # Python version used in the training job
    hyperparameters=dict(deepspeed_parameters),  # hyperparameters for the training job
    disable_output_compression=True,
    # Optional settings, currently disabled:
    #     checkpoint_s3_uri=f's3://{bucket}/baichuan2checkpoints',
    #     use_spot_instances=True,
    #     max_wait=36000,
    #     max_run=7200,
)

In [None]:
from sagemaker.inputs import TrainingInput

# Describe the uploaded S3 data as an input channel for the training job.
train_input = TrainingInput(
    s3_data=data_path_s3,
    input_mode="File",  # Available options: File | Pipe | FastFile
)

# Start the training job with the uploaded dataset as the "train" channel.
huggingface_estimator.fit({"train": train_input}, wait=True)

## 合并lora和base模型

注意:下面操作需要在带有GPU显卡的SageMaker Notebook instance或EC2(Deep Learning AMI GPU PyTorch 2.0.1 (Amazon Linux 2) 20231003 AMI名称)上操作

In [None]:
#需要git clone baichuan2 github上的文件
!git clone https://github.com/baichuan-inc/Baichuan2.git

In [None]:
#登录到机器上, 需要使用Torch2.0,安装下列requirements.txt内容
numpy
transformers==4.28.1
sentencepiece
tokenizers
accelerate>=0.23.0
deepspeed>=0.8.3
xformers
scipy
bitsandbytes
peft

#进入到相应目录
#./lora目录存放finetune训练后的模型,从上面训练好后的S3文件夹中下载下来 
#pytorch_model.bin改名为adapter_model.bin; 
#添加adapter_config.json文件,内容如下
{
  "base_model_name_or_path": "baichuan-inc/Baichuan2-7B-Base",
  "bias": "none",
  "enable_lora": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "merge_weights": false,
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 1,
  "target_modules": [
    "W_pack"
  ],
  "task_type": "CAUSAL_LM"
}


In [None]:
# ./base目录存放base模型 可从huggingface或已存放的s3桶中下载


In [None]:
# Create a file named merge.py with the following content. It loads the base
# model from ./base, applies the LoRA adapter from ./lora and writes the merged
# model to ./merge.
from peft import PeftModel, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import AutoPeftModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained(
    "base",
    local_files_only=True,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "base",
    local_files_only=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Optional sanity check of the base model before merging:
# inputs = tokenizer('登鹳雀楼->王之涣\n夜雨寄北->', return_tensors='pt')
# inputs = inputs.to('cuda:0')
# pred = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.1)
# print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))

# Attach the adapter stored in ./lora to the base model.
model_lora = PeftModel.from_pretrained(model, 'lora', torch_dtype=torch.bfloat16)

# Merge the adapter into the base model and save the result to ./merge.
base_model = model_lora.merge_and_unload()
base_model.save_pretrained('merge')


In [None]:
#然后python merge.py,
#./merge目录即合并后的模型, 
#然后拷贝./merge/*.bin到./base覆盖原有文件或一起都拷贝到新目录, 然后测试, 然后上传到s3桶, 最后按之前的部署方式进行部署测试.

## 3.推理测试

In [None]:
#按之前的部署方式 baichuan2-LMI-HuggingFaceAccelerateModel.ipynb 进行部署测试.
