### SageMaker fine tune ChatGLM

#### 准备
1. 升级boto3, sagemaker python sdk  
2. 准备requirements.txt
3. 准备s5cmd utility

In [None]:
!pip install --upgrade boto3
!pip install --upgrade sagemaker

In [None]:
# Download the s5cmd v2.0.0 Linux binary, unpack it in the current directory and move
# the executable into ChatGLM-6B/ptuning/ (the directory used as source_dir by the
# training jobs in this notebook).
# NOTE(review): the `mv` needs ChatGLM-6B/ptuning/ to exist already — clone the
# ChatGLM-6B repository first if it is not present.
# Optional check of previously stored models (run after the session cell has defined the bucket):
#print('s3://{}/llm/models/'.format(sagemaker_session.default_bucket()))
#!aws s3 ls s3://<your-default-bucket>/llm/models/
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz && mv s5cmd ChatGLM-6B/ptuning/

In [None]:
import time

import boto3
import sagemaker
from sagemaker.huggingface import HuggingFace

# Identify the AWS account and region behind the credentials this notebook runs with.
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity().get('Account')
boto_session = boto3.session.Session()
region_name = boto_session.region_name

# SageMaker session, its default S3 bucket, and the execution role handed to training jobs.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

# Show the role and bucket the rest of the notebook relies on.
print(role, bucket, sep='\n')

### chatglm 官方P-tuning v2方式（单机单卡）
1:安装依赖lib   
2:准备数据集(本例以 ADGEN 文本生成数据集为例，将解压后的 AdvertiseGen 目录放到 ChatGLM-6B/ptuning/ 目录下)  
3:修改并bash运行 train.sh  

In [None]:
# One-time local setup (uncomment on a fresh environment): install extra Python
# packages, clone the ChatGLM-6B repository, and install its requirements.
#!pip install rouge_chinese nltk jieba datasets
#!git clone https://github.com/THUDM/ChatGLM-6B.git
#!pip install -r ChatGLM-6B/requirements.txt
# Copy requirements.txt next to the training scripts in ChatGLM-6B/ptuning/.
# NOTE(review): presumably so the training container installs it from source_dir — confirm.
!cp ChatGLM-6B/requirements.txt ChatGLM-6B/ptuning/

In [None]:
# One-time dataset download (uncomment if AdvertiseGen is not present locally):
# fetch the archive, give it a proper file name, and unpack it inside ChatGLM-6B/ptuning/.
#!cd ChatGLM-6B/ptuning/ && wget "https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1"
#!cd ChatGLM-6B/ptuning/ && mv "index.html?dl=1" dataset.tar.gz
#!cd ChatGLM-6B/ptuning/ && tar -xvf dataset.tar.gz
# Upload the local AdvertiseGen directory to the default bucket with s5cmd; `bucket`
# is interpolated by IPython from the Python variable of the same name.
!./ChatGLM-6B/ptuning/s5cmd sync ChatGLM-6B/ptuning/AdvertiseGen/ s3://{bucket}/llm/chatglm/datasets/ 
# Optional cleanup of the downloaded archive.
#!rm -rf ChatGLM-6B/ptuning/dataset.tar.gz


In [None]:
# Launch the single-instance P-tuning v2 training job on SageMaker.
import time
from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch import PyTorch  # only used by the commented-out alternative below

instance_type = "ml.g4dn.2xlarge"
instance_count = 1

# SageMaker appends its own timestamp to `base_job_name` (and truncates the result), so
# the prefix is kept static here and the fully timestamped name is passed to fit().
base_job_name = 'huggingface-chatglm-simple'
job_name = f'{base_job_name}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# S3 prefix that will store the trained model assets.
# Note: replace it with your own S3 path if the default bucket is not wanted.
model_s3_path = f's3://{bucket}/llm/models/chatglm/simple/'
output_dir = '/opt/ml/model/adgen-chatglm-6b-ft'
model_name_or_path = 'THUDM/chatglm-6b'

# Environment variables set inside the training container for the training scripts.
environment = {
    'MODEL_S3_PATH'          : model_s3_path,
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:32',
    #'LD_LIBRARY_PATH'        : '${LD_LIBRARY_PATH}:/opt/conda/lib/',
    'TRAIN_DATASET'          : '/opt/ml/input/data/AdvertiseGen/train.json',
    'TEST_DATASET'           : '/opt/ml/input/data/AdvertiseGen/dev.json',
    'PROMPT_COLUMN'          : 'content',
    'RESPONSE_COLUMN'        : 'summary',
    'MODEL_NAME_OR_PATH'     : model_name_or_path,
    'OUTPUT_DIR'             : output_dir,
    'MODEL_OUTPUT_S3_PATH'   : model_s3_path,
    'TRAIN_STEPS'            : '100'
}

# The channel name becomes the directory under /opt/ml/input/data/ in the container,
# which is why TRAIN_DATASET / TEST_DATASET point at .../AdvertiseGen/.
inputs = {
    'AdvertiseGen': f"s3://{bucket}/llm/chatglm/datasets/"
}

# Alternative: run the same entry point with the plain PyTorch estimator.
#huggingface_estimator = PyTorch(
#                            entry_point          = 'start_simple.py',          # entry point script
#                            source_dir           = 'ChatGLM-6B/ptuning',
#                            role=role,
#                            framework_version='1.13',
#                            py_version='py39',
#                            script_mode=True,
#                            instance_count=1,  # 1 or 2 or ...
#                            instance_type=instance_type,
#                            environment = environment)

# Create the estimator.
huggingface_estimator = HuggingFace(
    entry_point          = 'start_simple.py',     # entry point script executed in the container
    source_dir           = 'ChatGLM-6B/ptuning',  # directory with all files needed for training
    instance_type        = instance_type,         # instance type used for the training job
    instance_count       = instance_count,        # number of instances used for training
    base_job_name        = base_job_name,         # static prefix; the exact name is given to fit()
    role                 = role,                  # IAM role the job uses to access AWS resources, e.g. S3
    script_mode          = True,
    transformers_version = '4.26',                # transformers version used in the training job
    pytorch_version      = '1.13',                # PyTorch version used in the training job
    py_version           = 'py39',                # Python version used in the training job
    environment          = environment,
)

# Start training under the exact, timestamped job name (blocks until the job finishes).
huggingface_estimator.fit(inputs=inputs, job_name=job_name)

In [None]:
# Display the estimator's model_data attribute (location of the trained model artifact).
huggingface_estimator.model_data

In [None]:
!aws s3 ls s3://sagemaker-us-west-2-687912291502/huggingface-chatglm-simple-2023-05-06-1-2023-05-06-14-24-12-728/output/model.tar.gz --recursive --human-readable --summarize

For local test only

In [None]:
# Run train.sh with bash from inside ChatGLM-6B/ptuning/ on this machine (no SageMaker job).
# NOTE(review): presumably needs a local GPU and the dependencies installed — confirm.
!cd ChatGLM-6B/ptuning/&& bash train.sh

### chatglm 官方deepspeed方式（全参数的Finetune,单机多卡）
1: 准备deepspeed lib，并修改deepspeed.json    
2：数据集（与上文一致）  
3：entrypoint start-single-node.py,设置num-gpus  
4：触发bash ds_train_finetune_single_node.sh 

In [None]:
# Instance type used by the DeepSpeed training jobs.
instance_type = 'ml.p4d.24xlarge'

# Number of processes to start per host, keyed by instance type.
_PROCESSES_BY_INSTANCE = {
    "ml.p3.16xlarge": 8,
    "ml.p3dn.24xlarge": 8,
    "ml.g5.48xlarge": 8,
    "ml.p4d.24xlarge": 8,
    "ml.p2.16xlarge": 16,
}

# Instance types missing from the table fall back to 4.
# NOTE(review): verify the fallback against the real GPU count of the chosen instance.
processes_per_host = _PROCESSES_BY_INSTANCE.get(instance_type, 4)

print("processes_per_host is set to:", processes_per_host)

In [None]:
# Launch the single-node, multi-GPU DeepSpeed full-parameter fine-tuning job.
# Uses `instance_type` and `processes_per_host` from the cell above.

# SageMaker appends its own timestamp to `base_job_name` (and truncates the result), so
# the prefix is kept static here and the fully timestamped name is passed to fit().
base_job_name = 'huggingface-chatglm-deepspeed'
job_name = f'{base_job_name}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# S3 prefix that will store the trained model assets.
# Note: replace it with your own S3 path if the default bucket is not wanted.
model_s3_path = f's3://{bucket}/llm/models/chatglm/deepspeed/'
output_dir = '/tmp/model/adgen-chatglm-6b-ft'
model_name_or_path = 'THUDM/chatglm-6b'

instance_count = 1

# Environment variables set inside the training container for the training scripts.
environment = {
    'MODEL_S3_PATH'          : model_s3_path,
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:32',
    #'LD_LIBRARY_PATH'        : '${LD_LIBRARY_PATH}:/opt/conda/lib/',
    'NUM_GPUS'               : str(processes_per_host),
    'TRAIN_DATASET'          : '/opt/ml/input/data/AdvertiseGen/train.json',
    'TEST_DATASET'           : '/opt/ml/input/data/AdvertiseGen/dev.json',
    'PROMPT_COLUMN'          : 'content',
    'RESPONSE_COLUMN'        : 'summary',
    'MODEL_NAME_OR_PATH'     : model_name_or_path,
    'OUTPUT_DIR'             : output_dir,
    'MODEL_OUTPUT_S3_PATH'   : model_s3_path,
    'TRAIN_STEPS'            : '50'
}

# The channel name becomes the directory under /opt/ml/input/data/ in the container,
# which is why TRAIN_DATASET / TEST_DATASET point at .../AdvertiseGen/.
inputs = {
    'AdvertiseGen': f"s3://{bucket}/llm/chatglm/datasets/"
}

# Create the estimator.
huggingface_estimator = HuggingFace(
    entry_point          = 'start-single-node.py',  # entry point script executed in the container
    source_dir           = 'ChatGLM-6B/ptuning',    # directory with all files needed for training
    instance_type        = instance_type,           # instance type used for the training job
    instance_count       = instance_count,          # number of instances used for training
    base_job_name        = base_job_name,           # static prefix; the exact name is given to fit()
    role                 = role,                    # IAM role the job uses to access AWS resources, e.g. S3
    script_mode          = True,
    transformers_version = '4.26',                  # transformers version used in the training job
    pytorch_version      = '1.13',                  # PyTorch version used in the training job
    py_version           = 'py39',                  # Python version used in the training job
    environment          = environment,
)

# Start training under the exact, timestamped job name (blocks until the job finishes).
huggingface_estimator.fit(inputs=inputs, job_name=job_name)

In [None]:
# Display the estimator's model_data attribute (location of the trained model artifact).
huggingface_estimator.model_data

### chatglm deepspeed 多机多卡改造
1: 准备deepspeed lib，并修改deepspeed.json    
2：数据集（与上文一致）  
3：entrypoint 为 start.py，配置 torch distributed launch 参数  
4：触发bash ds_train_finetune.sh 

In [None]:
# Launch the multi-node, multi-GPU DeepSpeed fine-tuning job.
# Uses `instance_type` and `processes_per_host` from the processes-per-host cell above.

# SageMaker appends its own timestamp to `base_job_name` (and truncates the result), so
# the prefix is kept static here and the fully timestamped name is passed to fit().
base_job_name = 'huggingface-chatglm-deepspeed'
job_name = f'{base_job_name}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# S3 prefix that will store the trained model assets.
# Note: replace it with your own S3 path if the default bucket is not wanted.
# NOTE(review): this is the same prefix as the single-node DeepSpeed job — confirm that
# sharing it is intended.
model_s3_path = f's3://{bucket}/llm/models/chatglm/deepspeed/'
output_dir = '/tmp/model/adgen-chatglm-6b-ft'
model_name_or_path = 'THUDM/chatglm-6b'

instance_count = 2

# Environment variables set inside the training container for the training scripts.
environment = {
    'NODE_NUMBER'            : str(instance_count),
    'MODEL_S3_PATH'          : model_s3_path,
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:32',
    #'LD_LIBRARY_PATH'        : '${LD_LIBRARY_PATH}:/opt/conda/lib/',
    'NUM_GPUS'               : str(processes_per_host),
    'TRAIN_DATASET'          : '/opt/ml/input/data/AdvertiseGen/train.json',
    'TEST_DATASET'           : '/opt/ml/input/data/AdvertiseGen/dev.json',
    'PROMPT_COLUMN'          : 'content',
    'RESPONSE_COLUMN'        : 'summary',
    'MODEL_NAME_OR_PATH'     : model_name_or_path,
    'OUTPUT_DIR'             : output_dir,
    'MODEL_OUTPUT_S3_PATH'   : model_s3_path,
    'TRAIN_STEPS'            : '50'
}

# The channel name becomes the directory under /opt/ml/input/data/ in the container,
# which is why TRAIN_DATASET / TEST_DATASET point at .../AdvertiseGen/.
inputs = {
    'AdvertiseGen': f"s3://{bucket}/llm/chatglm/datasets/"
}

# Create the estimator.
huggingface_estimator = HuggingFace(
    entry_point          = 'start.py',            # entry point script executed in the container
    source_dir           = 'ChatGLM-6B/ptuning',  # directory with all files needed for training
    instance_type        = instance_type,         # instance type used for the training job
    instance_count       = instance_count,        # number of instances used for training
    base_job_name        = base_job_name,         # static prefix; the exact name is given to fit()
    role                 = role,                  # IAM role the job uses to access AWS resources, e.g. S3
    script_mode          = True,
    transformers_version = '4.26',                # transformers version used in the training job
    pytorch_version      = '1.13',                # PyTorch version used in the training job
    py_version           = 'py39',                # Python version used in the training job
    environment          = environment,
)

# Start training under the exact, timestamped job name (blocks until the job finishes).
huggingface_estimator.fit(inputs=inputs, job_name=job_name)