### SageMaker fine tune baichuan

#### 准备
1. 升级boto3, sagemaker python sdk  
2. 准备requirements.txt

In [None]:
!pip install --upgrade boto3
!pip install --upgrade sagemaker

In [2]:
import boto3
import sagemaker

account = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

print(role)
print(bucket)



arn:aws:iam::687912291502:role/service-role/AmazonSageMaker-ExecutionRole-20211013T113123
sagemaker-us-west-2-687912291502


### baichuan fine tune 
 deepspeed+QLoRA

In [37]:
# 我们在原始的LLaMA-Efficient-Tuning.git 基础上做了一些魔改 1.每次save同步到S3，2.调整训练参数
!rm -rf ./LLaMA-Efficient-Tuning
!git clone https://github.com/hiyouga/LLaMA-Efficient-Tuning.git
!cp ./s5cmd ./LLaMA-Efficient-Tuning/

Cloning into 'LLaMA-Efficient-Tuning'...
remote: Enumerating objects: 1100, done.[K
remote: Counting objects: 100% (570/570), done.[K
remote: Compressing objects: 100% (171/171), done.[K
remote: Total 1100 (delta 471), reused 455 (delta 398), pack-reused 530[K
Receiving objects: 100% (1100/1100), 72.44 MiB | 13.17 MiB/s, done.
Resolving deltas: 100% (730/730), done.
Updating files: 100% (98/98), done.


## prepare docker images，如果准备好就跳过

In [3]:
%%writefile Dockerfile
## You should change below region code to the region you used, here sample is use us-west-2
From 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04 
#From pytorch/pytorch:1.5-cuda10.1-cudnn7-runtime

ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
RUN pip3 uninstall -y deepspeed && pip3 install deepspeed
#RUN pip install -U git+https://github.com/ssbuild/deep_training.git


Overwriting Dockerfile


In [4]:
## You should change below region code to the region you used, here sample is use us-west-2
!aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


**Build image and push to ECR.**

In [10]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sagemaker-baichuan_finetuning"

In [6]:
%%script env repo_name=$repo_name bash

#!/usr/bin/env bash

# This script shows how to build the Docker image and push it to ECR to be ready for use
# by SageMaker.

# The argument to this script is the image name. This will be used as the image on the local
# machine and combined with the account and region to form the repository name for ECR.
# The name of our algorithm
algorithm_name=${repo_name}

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

Login Succeeded
Sending build context to Docker daemon  81.59GB
Step 1/5 : From 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04
 ---> c5a6ef695006
Step 2/5 : ENV LANG=C.UTF-8
 ---> Using cache
 ---> af49cfa7feae
Step 3/5 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 287106637dc6
Step 4/5 : ENV PYTHONDONTWRITEBYTECODE=TRUE
 ---> Using cache
 ---> 773b4cf30c90
Step 5/5 : RUN pip3 uninstall -y deepspeed && pip3 install deepspeed
 ---> Using cache
 ---> ce72201e73cd
Successfully built ce72201e73cd
Successfully tagged sagemaker-baichuan_finetuning:latest
The push refers to repository [687912291502.dkr.ecr.us-west-2.amazonaws.com/sagemaker-baichuan_finetuning]
02a87473f68b: Preparing
f8dae5c3df1e: Preparing
e3221f18601a: Preparing
b6f286626882: Preparing
76fe97d80cdb: Preparing
f5f76489fff8: Preparing
621c3f07daa7: Preparing
9b484bb42e11: Preparing
54c7c0b58471: Preparing
c34adc3ab668: Preparing
bbf651e48b84

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



## train GPT4 baichuan

In [None]:
!pip install huggingface_hub

In [7]:
from huggingface_hub import snapshot_download
from pathlib import Path

local_model_path = Path("./LLM_baichuan_model_13b")
local_model_path.mkdir(exist_ok=True)

#commit_hash = "ba9db8ed916eb8c4d4349d40ef7a0b6b68a0b930"
model_name = 'baichuan-inc/Baichuan-13B-Chat'
model_cache_path = local_model_path
#snapshot_download(repo_id=model_name, revision=commit_hash,cache_dir=local_model_path)
#snapshot_download(repo_id=model_name,cache_dir=model_cache_path)

In [8]:
s3_model_prefix = "llm/models/LLM_baichuan_model_13b"  # folder where model checkpoint will go
model_snapshot_path = list(local_model_path.glob("**/snapshots/*"))[0]
s3_code_prefix = "llm/models/LLM_baichuan_deploy_code"
print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: llm/models/LLM_baichuan_deploy_code
model_snapshot_path: LLM_baichuan_model_13b/models--baichuan-inc--Baichuan-13B-Chat/snapshots/ff1fbc5e10eb514c3ee54aeff36a4e703b8d9e9a


In [None]:
!aws s3 rm s3://{bucket}/{s3_model_prefix} --recursive
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

In [9]:
####下载的baichuan model s3路径 ###########
# 这里修改成自己的S3地址
model_s3_path="s3://sagemaker-us-west-2-687912291502/llm/models/LLM_baichuan_model_13b/"

# base_model_path = 's3://sagemaker-us-west-2-960661357527/llm/models/LLM_baichuan_model/'

## 准备好镜像

In [10]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sagemaker-baichuan_finetuning"

In [11]:
## The image uri which is build and pushed above
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account, region, repo_name)
image_uri

'687912291502.dkr.ecr.us-west-2.amazonaws.com/sagemaker-baichuan_finetuning:latest'

In [22]:
%%writefile ./LLaMA-Efficient-Tuning/requirements.txt
#transformers>=4.29.1
#datasets>=2.12.0
#accelerate>=0.19.0
#peft>=0.3.0
#trl>=0.4.4
#sentencepiece
#jieba
#rouge-chinese
#nltk
#gradio
#mdtex2html
#uvicorn
#fastapi
#sse-starlette
#transformers_stream_generator
#deepspeed
#xformers
#wandb
transformers>=4.29.2
datasets>=2.12.0
accelerate>=0.19.0
peft>=0.3.0
trl>=0.4.4
sentencepiece
jieba
rouge-chinese
nltk
gradio
mdtex2html
uvicorn
fastapi
sse-starlette
transformers_stream_generator

Overwriting ./LLaMA-Efficient-Tuning/requirements.txt


In [23]:
%%writefile ./LLaMA-Efficient-Tuning/train.sh
#!/bin/bash

chmod +x ./s5cmd
#chmod +x ./monitor.sh

pip install -U -r requirements.txt

apt install -y inotify-tools
cp ./s5cmd /usr/bin



./s5cmd sync $MODEL_S3_PATH* /tmp/baichun-13b/
cp tests/modeling_baichuan.py /tmp/baichuan-13b/

# This is secret and shouldn't be checked into version control
export  WANDB_API_KEY="64f29a79439a4153b2b9f42f05eba7c3c5ca7b95"
# Name and notes optional
export WANDB_PROJECT="sm-baichuan7b-sft"


export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
accelerate launch --config_file ds_zero2.yaml  src/train_bash.py \
    --model_name_or_path "/tmp/baichun-13b/" \
    --do_train \
    --dataset zhetian_sft \
    --finetuning_type lora \
    --output_dir /tmp/ouput/ \
    --overwrite_cache \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 100 \
    --learning_rate 3e-4 \
    --num_train_epochs 2 \
    --plot_loss \
    --bf16 \
    --report_to wandb \
    --padding_side right \
    --prompt_template baichuan \
    --lora_rank 64 \
    --lora_alpha 128 \
    --max_source_length 1024 \
    --lora_target "W_pack,o_proj,gate_proj,up_proj,down_proj"

#./s5cmd sync /tmp/ouput/ $MODEL_S3_PATH/models/baichuan_finetuning/output/$cur_date/


Overwriting ./LLaMA-Efficient-Tuning/train.sh


In [24]:
model_s3_path

's3://sagemaker-us-west-2-687912291502/llm/models/LLM_baichuan_model_13b/'

In [25]:
use_spot_instances = False
max_run = 3600*24
max_wait = 3600*24*2 if use_spot_instances else None

In [None]:
import time
from sagemaker.estimator import Estimator

environment = {
              'MODEL_S3_PATH': model_s3_path # The bucket to store pretrained model and fine-tune model
}

base_job_name = 'baichuan-finetuning'

# instance_type = 'ml.g5.12xlarge'

# instance_type = 'ml.g5.48xlarge'

instance_type = 'ml.p4d.24xlarge'


estimator = Estimator(role=role,
                      entry_point='train.sh',
                      source_dir='./LLaMA-Efficient-Tuning/',
                      base_job_name=base_job_name,
                      instance_count=1,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      disable_profiler=True,
                      debugger_hook_config=False,
                      max_run=max_run,
                      use_spot_instances=use_spot_instances,
                      max_wait=max_wait
                     )

estimator.fit()

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: baichuan-finetuning-2023-07-19-15-00-41-256


2023-07-19 15:01:03 Starting - Starting the training job......
2023-07-19 15:01:49 Starting - Preparing the instances for training.....................
2023-07-19 15:05:30 Downloading - Downloading input data...
2023-07-19 15:05:45 Training - Downloading the training image........................
2023-07-19 15:09:36 Training - Training image download completed. Training in progress.....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-07-19 15:10:35,403 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-07-19 15:10:35,464 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-07-19 15:10:35,473 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-07-19 15:10:35,475 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[3

In [None]:
import os
import subprocess

In [None]:
# os.system('./s5cmd sync monitor.sh s3://sagemaker-us-west-2-960661357527/llm/models/LLM_baichuan_model/')

subprocess.run('./s5cmd sync Dockerfile s3://sagemaker-us-west-2-960661357527/llm/models/LLM_baichuan_model/', shell=True)

In [None]:
import datetime

In [None]:
datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

In [None]:
!pip install accelerate

## !accelerate config

In [90]:
import transformers

In [91]:
from transformers import AutoConfig

In [93]:
config = AutoConfig.from_pretrained('baichuan-inc/Baichuan-13B-Chat', trust_remote_code=True)

A new version of the following files was downloaded from https://huggingface.co/baichuan-inc/Baichuan-13B-Chat:
- configuration_baichuan.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [94]:
config

BaichuanConfig {
  "_from_model_config": true,
  "_name_or_path": "baichuan-inc/Baichuan-13B-Chat",
  "architectures": [
    "BaichuanForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "baichuan-inc/Baichuan-13B-Chat--configuration_baichuan.BaichuanConfig",
    "AutoModelForCausalLM": "baichuan-inc/Baichuan-13B-Chat--modeling_baichuan.BaichuanForCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "gradient_checkpointing": [
    false
  ],
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 13696,
  "model_max_length": 4096,
  "model_type": "baichuan",
  "num_attention_heads": 40,
  "num_hidden_layers": 40,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.29.0",
  "use_cache": true,
  "vocab_size": 64000
}

128.0