# 1. 准备环境(仅新建笔记本实例时初始化使用)

## 1.1 准备Python SDK

In [None]:
!pip install --upgrade boto3
!pip install --upgrade sagemaker

In [None]:
!pip install huggingface_hub

## 1.2 克隆ChatGLM2代码

In [3]:
%%script bash
rm -rf ChatGLM2-6B
git clone https://github.com/THUDM/ChatGLM2-6B.git

Cloning into 'ChatGLM2-6B'...


## 1.3 准备s5cmd
除了给笔记本实例使用外，还要提供给训练实例使用

In [4]:
%%script bash
sudo rm /usr/local/bin/s5cmd
mkdir s5cmd_download
cd s5cmd_download
curl -L https://github.com/peak/s5cmd/releases/download/v2.1.0/s5cmd_2.1.0_Linux-64bit.tar.gz | tar -xz
chmod 777 ./s5cmd
cp ./s5cmd ../ChatGLM2-6B/ptuning/
chmod 777 ./s5cmd
cp ./s5cmd ../code
sudo mv s5cmd /usr/local/bin/
cd ..
rm -rf s5cmd_download

rm: cannot remove ‘/usr/local/bin/s5cmd’: No such file or directory
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 4425k  100 4425k    0     0  7742k      0 --:--:-- --:--:-- --:--:-- 7742k


## 1.4 获取Runtime资源配置

In [1]:
import boto3
import sagemaker
from sagemaker import get_execution_role

sess = sagemaker.Session()
role = get_execution_role()
sagemaker_default_bucket = sess.default_bucket()

account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name

print("Sagemaker Execution Role:")
print(role)
print("Sagemaker Default Bucket:")
print(sagemaker_default_bucket)
print("Sagemaker Boto Account:")
print(account)
print("Sagemaker Boto Region:")
print(region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Sagemaker Execution Role:
arn:aws:iam::687912291502:role/service-role/AmazonSageMaker-ExecutionRole-20211013T113123
Sagemaker Default Bucket:
sagemaker-us-west-2-687912291502
Sagemaker Boto Account:
687912291502
Sagemaker Boto Region:
us-west-2


## 1.5 下载ChatGLM2原始模型

In [4]:
from huggingface_hub import snapshot_download
from pathlib import Path


local_cache_path = Path("./model")
local_cache_path.mkdir(exist_ok=True)

model_name = "THUDM/chatglm2-6b"

# Only download pytorch checkpoint files
allow_patterns = ["*.json", "*.pt", "*.bin", "*.model", "*.py"]

model_download_path = snapshot_download(
    repo_id=model_name,
    cache_dir=local_cache_path,
    allow_patterns=allow_patterns,
)

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

Downloading (…)l-00002-of-00007.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

Downloading (…)l-00001-of-00007.bin:   0%|          | 0.00/1.83G [00:00<?, ?B/s]

Downloading (…)l-00005-of-00007.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)l-00004-of-00007.bin:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

Downloading (…)/modeling_chatglm.py:   0%|          | 0.00/50.7k [00:00<?, ?B/s]

Downloading (…)2620ce9f/config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading (…)iguration_chatglm.py:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading (…)l-00006-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)l-00007-of-00007.bin:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Downloading (…)ce9f/quantization.py:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading (…)enization_chatglm.py:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

In [5]:
# Get the model files path
import os
from glob import glob

local_model_path = None

paths = os.walk(r'./model')
for root, dirs, files in paths:
    for file in files:
        if file == 'config.json':
            # print(os.path.join(root, file))
            local_model_path = str(os.path.join(root, file))[0:-11]
            print(local_model_path)
if local_model_path == None:
    print("Model download may failed, please check prior step!")

./model/models--THUDM--chatglm2-6b/snapshots/b1502f4f75c71499a3d566b14463edd62620ce9f/


## 1.6 将ChatGLM2原始模型拷贝到 S3

In [25]:
!s5cmd sync s3://sagemaker-us-west-2-687912291502/llm/datasets/chatglm2/KuaFuTrainData/* ./

cp s3://sagemaker-us-west-2-687912291502/llm/datasets/chatglm2/KuaFuTrainData/dev.json dev.json
cp s3://sagemaker-us-west-2-687912291502/llm/datasets/chatglm2/KuaFuTrainData/train.json train.json


In [6]:
%%script env sagemaker_default_bucket=$sagemaker_default_bucket local_model_path=$local_model_path bash

s5cmd sync ${local_model_path} s3://${sagemaker_default_bucket}/llm/models/chatglm2/original-6B/

# rm -rf model

cp model/models--THUDM--chatglm2-6b/snapshots/b1502f4f75c71499a3d566b14463edd62620ce9f/pytorch_model.bin.index.json s3://sagemaker-us-west-2-687912291502/llm/models/chatglm2/original-6B/pytorch_model.bin.index.json
cp model/models--THUDM--chatglm2-6b/snapshots/b1502f4f75c71499a3d566b14463edd62620ce9f/tokenization_chatglm.py s3://sagemaker-us-west-2-687912291502/llm/models/chatglm2/original-6B/tokenization_chatglm.py
cp model/models--THUDM--chatglm2-6b/snapshots/b1502f4f75c71499a3d566b14463edd62620ce9f/configuration_chatglm.py s3://sagemaker-us-west-2-687912291502/llm/models/chatglm2/original-6B/configuration_chatglm.py
cp model/models--THUDM--chatglm2-6b/snapshots/b1502f4f75c71499a3d566b14463edd62620ce9f/tokenizer.model s3://sagemaker-us-west-2-687912291502/llm/models/chatglm2/original-6B/tokenizer.model
cp model/models--THUDM--chatglm2-6b/snapshots/b1502f4f75c71499a3d566b14463edd62620ce9f/config.json s3://sagemaker-us-west-2-687912291502/llm/models/chatglm2/original-6B/config.json
cp 

# 2. 准备数据集

## 2.1 准备数据目录

夸父健康数据集:根据输入,输出广告词,如下所示:
```JSON
{
  "content":"需求:完成用户的运动建议的意见修改|数据信息:增加核心训练次数;参考信息:undefined",
  "summary":"根据您的修改意见，我增加了核心训练次数，以帮助您更好地改善体态异常。"
}
```
建立`KuaFuTrainData` 目录放到本目录下，并上传 `train.json` 与 `dev.json`。

In [8]:
# 创建文件夹
!mkdir KuaFuTrainData

# 自行上传数据

mkdir: cannot create directory ‘KuaFuTrainData’: File exists


## 2.2 将数据集到S3
也可以忽略2.1与2.2，在本地直接将数据上传至 s3://${sagemaker_default_bucket}/llm/datasets/chatglm2/KuaFuTrainData/

In [7]:
%%script env sagemaker_default_bucket=$sagemaker_default_bucket bash

s5cmd sync ./KuaFuTrainData/ s3://${sagemaker_default_bucket}/llm/datasets/chatglm2/KuaFuTrainData/

# rm -rf KuaFuTrainData

# 3. 开始微调模型准备

## 3.1 准备微调代码
### 复写 ChatGLM2-6B/ptuning/arguments.py 文件
与原始文件相比，增加了 ```model_output_s3_path``` 参数

In [11]:
%%writefile ChatGLM2-6B/ptuning/arguments.py

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """
        
    model_output_s3_path: str = field(
        metadata={"help": "Path to model saved in s3 path using s5cmd utily"}
    )
    
        
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    ptuning_checkpoint: str = field(
        default=None, metadata={"help": "Path to p-tuning v2 checkpoints"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    resize_position_embeddings: Optional[bool] = field(
        default=None,
        metadata={
            "help": (
                "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
                "the model's position embeddings."
            )
        },
    )
    quantization_bit: Optional[int] = field(
        default=None
    )
    pre_seq_len: Optional[int] = field(
        default=None
    )
    prefix_projection: bool = field(
        default=False
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    prompt_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
    )
    response_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
    )
    history_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the history of chat."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
            )
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    train_simple: bool = field(
        default=False, metadata={"help": "whether use single node single GPU fine tuning"}
    )
    train_mutipl: bool = field(
        default=False, metadata={"help": "whether use mutiple node mutiple GPU fine tuning"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )

    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to force as the first generated token after the decoder_start_token_id."
                "Useful for multilingual models like mBART where the first generated token"
                "needs to be the target language token (Usually it is the target language token)"
            )
        },
    )

    

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None and self.test_file is None:
            raise ValueError("Need either a dataset name or a training/validation/test file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length

Overwriting ChatGLM2-6B/ptuning/arguments.py


### 增加 ChatGLM2-6B/ptuning/main_tuning.py

In [12]:
%%writefile ChatGLM2-6B/ptuning/main_tuning.py

#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.

import logging
import os
import sys
import json

import numpy as np
from datasets import load_dataset
import jieba 
from rouge_chinese import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch
import deepspeed
import torch.distributed as dist

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainingArguments,
    set_seed,
)
from trainer_seq2seq import Seq2SeqTrainer

from arguments import ModelArguments, DataTrainingArguments

logger = logging.getLogger(__name__)

def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
        
    ## if mutil-node train , set torch.distribute initial ###########
    #if data_args.train_mutipl:
    #    # Environment variables set by torch.distributed.launch
    #    LOCAL_RANK = int(os.environ['LOCAL_RANK'])
    #    WORLD_SIZE = int(os.environ['WORLD_SIZE'])
    #    WORLD_RANK = int(os.environ['RANK'])
    #    
    #    dist.init_process_group(backend='nccl', rank=WORLD_RANK, world_size=WORLD_SIZE)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    # datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load dataset
    data_files = {}
    if data_args.train_file is not None:
        data_files["train"] = data_args.train_file
        extension = data_args.train_file.split(".")[-1]
    if data_args.validation_file is not None:
        data_files["validation"] = data_args.validation_file
        extension = data_args.validation_file.split(".")[-1]
    if data_args.test_file is not None:
        data_files["test"] = data_args.test_file
        extension = data_args.test_file.split(".")[-1]

    raw_datasets = load_dataset(
        extension,
        data_files=data_files,
        cache_dir=model_args.cache_dir,
        #use_auth_token=True if model_args.use_auth_token else None,
    )
    
    # if s3 path model, use s5cmd to download the model to /tmp/orignal/ for model load
    if model_args.model_name_or_path.startswith("s3"):
        #Note: if deepspeed fine tuning ,we just use the rank 0 process to download the model assets to S3 by s5cmd command.
        if data_args.train_simple == False:
            WORLD_RANK = int(os.environ['RANK'])
            print("world_rank==="+str(WORLD_RANK))
            if WORLD_RANK == 0:
                os.system("cp ./s5cmd  /tmp/ && chmod +x /tmp/s5cmd")	
                os.system("/tmp/s5cmd sync {0} {1}".format(model_args.model_name_or_path + "*", "/tmp/orignal/"))

                print("sync s3 model " + model_args.model_name_or_path + "to /tmp/orignal:")
                model_args.model_name_or_path = "/tmp/orignal/"
                print(os.listdir(model_args.model_name_or_path))
            else: 
                model_args.model_name_or_path = "/tmp/orignal/"
            #Note: we should sync with every ranker and ensure rank 0 uploading the model assets successfully. 
            torch.distributed.barrier()
        else:
            os.system("cp ./s5cmd  /tmp/ && chmod +x /tmp/s5cmd")	
            os.system("/tmp/s5cmd sync {0} {1}".format(model_args.model_name_or_path + "*", "/tmp/orignal/"))

            print("sync s3 model " + model_args.model_name_or_path + "to /tmp/orignal:")
            model_args.model_name_or_path = "/tmp/orignal/"
            print(os.listdir(model_args.model_name_or_path))

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
    config.pre_seq_len = model_args.pre_seq_len
    config.prefix_projection = model_args.prefix_projection

    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)

    if model_args.ptuning_checkpoint is not None:
        # Evaluation
        # Loading extra state dict of prefix encoder
        model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)
        prefix_state_dict = torch.load(os.path.join(model_args.ptuning_checkpoint, "pytorch_model.bin"))
        new_prefix_state_dict = {}
        for k, v in prefix_state_dict.items():
            if k.startswith("transformer.prefix_encoder."):
                new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
        model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
    else:
        model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)
        #model = AutoModel.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)

    if model_args.quantization_bit is not None:
        print(f"Quantized to {model_args.quantization_bit} bit")
        model = model.quantize(model_args.quantization_bit)
    if model_args.pre_seq_len is not None:
        # P-tuning v2
        model = model.half()
        model.transformer.prefix_encoder.float()
    else:
        # Finetune
        model = model.float()

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    elif training_args.do_predict:
        column_names = raw_datasets["test"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
        return

    # Get the column names for input/target.
    prompt_column = data_args.prompt_column
    response_column = data_args.response_column
    history_column = data_args.history_column
    
    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length

    def preprocess_function_eval(examples):
        inputs, targets = [], []
        for i in range(len(examples[prompt_column])):
            if examples[prompt_column][i] and examples[response_column][i]:
                query = examples[prompt_column][i]
                history = examples[history_column][i] if history_column is not None else None
                prompt = tokenizer.build_prompt(query, history)
                inputs.append(prompt)
                targets.append(examples[response_column][i])

        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, truncation=True, padding=True)
        labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

        if data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]
        model_inputs["labels"] = labels["input_ids"]

        return model_inputs

    def preprocess_function_train(examples):
        max_seq_length = data_args.max_source_length + data_args.max_target_length + 1

        model_inputs = {
            "input_ids": [],
            "labels": [],
        }
        for i in range(len(examples[prompt_column])):
            if examples[prompt_column][i] and examples[response_column][i]:
                query, answer = examples[prompt_column][i], examples[response_column][i]

                history = examples[history_column][i] if history_column is not None else None
                prompt = tokenizer.build_prompt(query, history)

                prompt = prefix + prompt
                a_ids = tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True,
                                         max_length=data_args.max_source_length)
                b_ids = tokenizer.encode(text=answer, add_special_tokens=False, truncation=True,
                                         max_length=data_args.max_target_length)

                context_length = len(a_ids)
                input_ids = a_ids + b_ids + [tokenizer.eos_token_id]
                labels = [tokenizer.pad_token_id] * context_length + b_ids + [tokenizer.eos_token_id]
                
                pad_len = max_seq_length - len(input_ids)
                input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
                labels = labels + [tokenizer.pad_token_id] * pad_len
                if data_args.ignore_pad_token_for_loss:
                    labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]

                model_inputs["input_ids"].append(input_ids)
                model_inputs["labels"].append(labels)

        return model_inputs
    
    def print_dataset_example(example):
        print("input_ids", example["input_ids"])
        print("inputs", tokenizer.decode(example["input_ids"]))
        print("label_ids", example["labels"])
        print("labels", tokenizer.decode(example["labels"]))

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function_train,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        print_dataset_example(train_dataset[0])

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function_eval,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )
        print_dataset_example(eval_dataset[0])

    if training_args.do_predict:
        max_target_length = data_args.val_max_target_length
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(range(max_predict_samples))
        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
            predict_dataset = predict_dataset.map(
                preprocess_function_eval,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )
        print_dataset_example(predict_dataset[0])

    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=None,
        padding=False
    )

    # Metric
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if data_args.ignore_pad_token_for_loss:
            # Replace -100 in the labels as we can't decode them.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        score_dict = {
            "rouge-1": [],
            "rouge-2": [],
            "rouge-l": [],
            "bleu-4": []
        }
        for pred, label in zip(decoded_preds, decoded_labels):
            hypothesis = list(jieba.cut(pred))
            reference = list(jieba.cut(label))
            rouge = Rouge()
            scores = rouge.get_scores(' '.join(hypothesis) , ' '.join(reference))
            result = scores[0]
            
            for k, v in result.items():
                score_dict[k].append(round(v["f"] * 100, 4))
            bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
            score_dict["bleu-4"].append(round(bleu_score * 100, 4))

        for k, v in score_dict.items():
            score_dict[k] = float(np.mean(v))
        return score_dict

    # Override the decoding parameters of Seq2SeqTrainer
    training_args.generation_max_length = (
        training_args.generation_max_length
        if training_args.generation_max_length is not None
        else data_args.val_max_target_length
    )
    training_args.generation_num_beams = (
        data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams
    )
    # Initialize our Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.predict_with_generate else None,
        save_changed=model_args.pre_seq_len is not None
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        # elif last_checkpoint is not None:
        #     checkpoint = last_checkpoint
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        # trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        
        print("------saving model!-----")	
        	
        save_model_dir = os.environ['OUTPUT_DIR']	
        tokenizer.save_pretrained(save_model_dir)	
        trainer.save_model(save_model_dir)	
        print("save_model_dir : {}".format(save_model_dir))	
        print("------model is saved!-----")
        
        print("model sync to : {}".format(os.environ['MODEL_OUTPUT_S3_PATH']))
        
        #Note: if deepspeed fine tuning ,we just use the rank 0 process to upload the trained model assets to S3 by s5cmd command.
        if data_args.train_simple == False:
            WORLD_RANK = int(os.environ['RANK'])
            print("world_rank==="+str(WORLD_RANK))
            if WORLD_RANK == 0:
                os.system("./s5cmd sync {0} {1}".format(save_model_dir, os.environ['MODEL_OUTPUT_S3_PATH']))
            #Note: we should sync with every ranker and ensure rank 0 uploading the model assets successfully. 
            torch.distributed.barrier()
        else:
            os.system("./s5cmd sync {0} {1}".format(save_model_dir, os.environ['MODEL_OUTPUT_S3_PATH']))

        

    # Evaluation
    results = {}
    max_seq_length = data_args.max_source_length + data_args.max_target_length + 1
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(metric_key_prefix="eval", do_sample=True, top_p=0.7, max_length=max_seq_length, temperature=0.95)
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")
        predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict", max_length=max_seq_length, do_sample=True, top_p=0.7, temperature=0.95)
        metrics = predict_results.metrics
        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        if trainer.is_world_process_zero():
            if training_args.predict_with_generate:
                predictions = tokenizer.batch_decode(
                    predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                predictions = [pred.strip() for pred in predictions]
                labels = tokenizer.batch_decode(
                    predict_results.label_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                labels = [label.strip() for label in labels]
                output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
                with open(output_prediction_file, "w", encoding="utf-8") as writer:
                    for p, l in zip(predictions, labels):
                        res = json.dumps({"labels": l, "predict": p}, ensure_ascii=False)
                        writer.write(f"{res}\n")
    return results


def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


if __name__ == "__main__":
    main()

Overwriting ChatGLM2-6B/ptuning/main_tuning.py


### 增加 ChatGLM2-6B/ptuning/deepspeed.json 文件


In [None]:
%%writefile ChatGLM2-6B/ptuning/deepspeed.json
{
  "train_micro_batch_size_per_gpu": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "initial_scale_power": 16,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": false,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "contiguous_gradients" : true
  }
}

### 增加 ChatGLM2-6B/ptuning/sm_fulltuning_train.sh 文件
官方 LR=1e-4  
CUDA_VISIBLE_DEVICES 为GPU参数  
参考以下文件：  
CChatGLM2-6B/ptuning/ds_train_finetune.sh

In [33]:
%%writefile ChatGLM2-6B/ptuning/sm_fulltuning_train.sh

LR=1e-4
MASTER_PORT="23456"

deepspeed --num_gpus=$NUM_GPUS  --master_port $MASTER_PORT main_tuning.py \
    --deepspeed deepspeed.json \
    --do_train \
    --train_file $TRAIN_DATASET \
    --validation_file $TEST_DATASET \
    --prompt_column ${PROMPT_COLUMN} \
    --response_column ${RESPONSE_COLUMN} \
    --overwrite_cache \
    --model_name_or_path ${MODEL_NAME_OR_PATH}   \
    --output_dir ${OUTPUT_DIR} \
    --model_output_s3_path ${MODEL_OUTPUT_S3_PATH} \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 64 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --predict_with_generate \
    --max_steps ${TRAIN_STEPS} \
    --logging_steps 10 \
    --save_steps ${TRAIN_STEPS} \
    --learning_rate $LR \
    --fp16

Overwriting ChatGLM2-6B/ptuning/sm_fulltuning_train.sh


### 增加 sm_fulltune_train.py 文件
此文件为启动训练的 Entry Point

In [15]:
%%writefile ChatGLM2-6B/ptuning/sm_fulltune_train.py

import os

if __name__ == "__main__":
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = str(os.environ['PYTORCH_CUDA_ALLOC_CONF'])
    os.environ['LD_LIBRARY_PATH'] = str(os.environ['LD_LIBRARY_PATH'])
    os.environ['TRAIN_DATASET'] = str(os.environ['TRAIN_DATASET'])
    os.environ['TEST_DATASET'] = str(os.environ['TEST_DATASET'])
    os.environ['PROMPT_COLUMN'] = str(os.environ['PROMPT_COLUMN'])
    os.environ['RESPONSE_COLUMN'] = str(os.environ['RESPONSE_COLUMN'])
    os.environ['MODEL_NAME_OR_PATH'] = str(os.environ['MODEL_NAME_OR_PATH'])
    os.environ['OUTPUT_DIR'] = str(os.environ['OUTPUT_DIR'])
    os.environ['MODEL_OUTPUT_S3_PATH'] = str(os.environ['MODEL_OUTPUT_S3_PATH'])

    # os.system("chmod +x ./s5cmd")
    os.system("/bin/bash sm_fulltuning_train.sh")

Writing ChatGLM2-6B/ptuning/sm_fulltune_train.py


### 增加 ChatGLM2-6B/ptuning/requirements.txt 文件

In [73]:
%%writefile ChatGLM2-6B/ptuning/requirements.txt

protobuf
#git+https://github.com/huggingface/transformers.git@68d640f7c368bcaaaecfc678f11908ebbd3d6176
transformers==4.28.1
cpm_kernels
#torch>=2.0
gradio
mdtex2html
sentencepiece
accelerate>=0.20.1
sse-starlette
streamlit>=1.24.0
datasets
huggingface
jieba
rouge_chinese
nltk
deepspeed==0.9.5

Overwriting ChatGLM2-6B/ptuning/requirements.txt


## 3.2 获取Runtime资源配置(未执行 1.4 可执行这里)

In [17]:
import boto3
import sagemaker
from sagemaker import get_execution_role

sess = sagemaker.Session()
role = get_execution_role()
sagemaker_default_bucket = sess.default_bucket()

account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name

print("Sagemaker Execution Role:")
print(role)
print("Sagemaker Default Bucket:")
print(sagemaker_default_bucket)
print("Sagemaker Boto Account:")
print(account)
print("Sagemaker Boto Region:")
print(region)

Sagemaker Execution Role:
arn:aws:iam::687912291502:role/service-role/AmazonSageMaker-ExecutionRole-20211013T113123
Sagemaker Default Bucket:
sagemaker-us-west-2-687912291502
Sagemaker Boto Account:
687912291502
Sagemaker Boto Region:
us-west-2


## 3.2 Full-Tuning 训练
### 定义微调参数

In [28]:
# Define Training Job Name
import time
from sagemaker.huggingface import HuggingFace

job_name = f'huggingface-chatglm-finetune-fulltuning-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

instance_type  = "ml.p4d.24xlarge"
instance_count = 1
processes_per_host = 8

# 基础模型存放地址
model_name_or_path = 's3://{}/llm/models/chatglm2/original-6B/'.format(sagemaker_default_bucket)

# 微调模型输出地址
output_dir         = '/opt/ml/model/adgen-chatglm2-6b-ft'
model_s3_path      = 's3://{}/llm/models/chatglm2/finetune-fulltuning-adgen/'.format(sagemaker_default_bucket)

# 模型环境变量设置
environment = {
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:32',
    'NUM_GPUS'               : str(processes_per_host),
    'TRAIN_DATASET'          : '/opt/ml/input/data/KuaFuTrainData/train.json',
    'TEST_DATASET'           : '/opt/ml/input/data/KuaFuTrainData/dev.json',
    'PROMPT_COLUMN'          : 'content',
    'RESPONSE_COLUMN'        : 'summary',
    'MODEL_NAME_OR_PATH'     : model_name_or_path,
    'OUTPUT_DIR'             : output_dir,
    'MODEL_OUTPUT_S3_PATH'   : model_s3_path,
    'TRAIN_STEPS'            : '50'
}

inputs = {
   'KuaFuTrainData': f"s3://{sagemaker_default_bucket}/llm/datasets/chatglm2/KuaFuTrainData/"
}

In [29]:
environment

{'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:32',
 'NUM_GPUS': '8',
 'TRAIN_DATASET': '/opt/ml/input/data/KuaFuTrainData/train.json',
 'TEST_DATASET': '/opt/ml/input/data/KuaFuTrainData/dev.json',
 'PROMPT_COLUMN': 'content',
 'RESPONSE_COLUMN': 'summary',
 'MODEL_NAME_OR_PATH': 's3://sagemaker-us-west-2-687912291502/llm/models/chatglm2/original-6B/',
 'OUTPUT_DIR': '/opt/ml/model/adgen-chatglm2-6b-ft',
 'MODEL_OUTPUT_S3_PATH': 's3://sagemaker-us-west-2-687912291502/llm/models/chatglm2/finetune-fulltuning-adgen/',
 'TRAIN_STEPS': '50'}

### 启动微调训练

In [30]:
# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'sm_fulltune_train.py',
    source_dir           = './ChatGLM2-6B/ptuning',
    instance_type        = instance_type,
    instance_count       = instance_count,
    base_job_name        = job_name,
    role                 = role,
    script_mode          = True,
    transformers_version = '4.28',
    pytorch_version      = '2.0',
    py_version           = 'py310',
    environment          = environment
)

In [31]:
huggingface_estimator.fit(inputs=inputs)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-chatglm-finetune-fulltuning-2024-09-26-10-18-23-050


2024-09-26 10:18:26 Starting - Starting the training job
2024-09-26 10:18:26 Pending - Training job waiting for capacity......
2024-09-26 10:19:09 Pending - Preparing the instances for training........................
2024-09-26 10:23:28 Downloading - Downloading input data...
2024-09-26 10:23:48 Downloading - Downloading the training image........................
2024-09-26 10:27:46 Training - Training image download completed. Training in progress......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-09-26 10:28:39,312 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-09-26 10:28:39,427 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-09-26 10:28:39,434 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-09-26 10:28:39,436 sagemaker_pyto

# 4. 模型部署

## 4.1 获取Runtime资源配置

In [9]:
import boto3
import sagemaker
from sagemaker import get_execution_role

sess                     = sagemaker.Session()
role                     = get_execution_role()
sagemaker_default_bucket = sess.default_bucket()

account                  = sess.boto_session.client("sts").get_caller_identity()["Account"]
region                   = sess.boto_session.region_name

print("Sagemaker Execution Role:")
print(role)
print("Sagemaker Default Bucket:")
print(sagemaker_default_bucket)
print("Sagemaker Boto Account:")
print(account)
print("Sagemaker Boto Region:")
print(region)

Sagemaker Execution Role:
arn:aws:iam::687912291502:role/service-role/AmazonSageMaker-ExecutionRole-20211013T113123
Sagemaker Default Bucket:
sagemaker-us-west-2-687912291502
Sagemaker Boto Account:
687912291502
Sagemaker Boto Region:
us-west-2


## 4.2 准备Dummy模型

In [29]:
!touch dummy
!tar czvf model.tar.gz dummy
assets_dir = 's3://{0}/{1}/assets/'.format(sagemaker_default_bucket, 'chatglm2')
model_data = 's3://{0}/{1}/assets/model.tar.gz'.format(sagemaker_default_bucket, 'chatglm2')
!aws s3 cp model.tar.gz $assets_dir
!rm -f dummy model.tar.gz

dummy
upload: ./model.tar.gz to s3://sagemaker-us-west-2-687912291502/chatglm2/assets/model.tar.gz


## 4.3 配置模型参数

In [None]:
model_name                  = None
entry_point                 = 'chatglm2-inference-finetune.py'
framework_version           = '2.0'
py_version                  = 'py310'
base_model_name_or_path     = 's3://{}/llm/models/chatglm2/original-6B/'.format(sagemaker_default_bucket)
finetune_model_name_or_path = 's3://{}/llm/models/chatglm2/finetune-ptuning-adgen/adgen-chatglm2-6b-ft/checkpoint-1000/pytorch_model.bin'.format(sagemaker_default_bucket)

# 模型环境变量设置
model_environment  = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT': '600',
    'SAGEMAKER_MODEL_SERVER_WORKERS': '1',
    'MODEL_NAME_OR_PATH'            : base_model_name_or_path,
    'FINETUNE_MODEL_NAME_OR_PATH'   : finetune_model_name_or_path,
}


In [None]:
from sagemaker.pytorch.model import PyTorchModel

model = PyTorchModel(
    name              = model_name,
    model_data        = model_data,
    entry_point       = entry_point,
    source_dir        = './code',
    role              = role,
    framework_version = framework_version, 
    py_version        = py_version,
    env               = model_environment
)

In [None]:
! chmod +x /usr/local/bin/s5cmd
! cp /usr/local/bin/s5cmd code/

## BYOC 方式打包训练

In [63]:
%%writefile Dockerfile
## You should change below region code to the region you used, here sample is use us-west-2
#From 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04 
From 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker
#From pytorch/pytorch:1.5-cuda10.1-cudnn7-runtime

ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE

#RUN pip3 uninstall -y deepspeed && pip3 install deepspeed

## Make all local GPUs visible
ENV NVIDIA_VISIBLE_DEVICES="all"

Overwriting Dockerfile


In [64]:
## You should change below region code to the region you used, here sample is use us-west-2
!aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [10]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sagemaker-chatglm2-demo"

In [8]:
%%script env repo_name=$repo_name bash

#!/usr/bin/env bash

# This script shows how to build the Docker image and push it to ECR to be ready for use
# by SageMaker.

# The argument to this script is the image name. This will be used as the image on the local
# machine and combined with the account and region to form the repository name for ECR.
# The name of our algorithm
algorithm_name=${repo_name}

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded
Process is interrupted.


In [11]:
%%writefile ./ChatGLM2-6B/ptuning/sm_fulltuning_train.sh
#!/bin/bash
#echo "Y"|pip uninstall deepspeed
#pip install pydantic==1.9.2
#pip3 install --force-reinstall torch  --index-url https://download.pytorch.org/whl/cu118
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(dirname $(find / -name libcudart.so 2>/dev/null | head -n 1))
#export DS_BUILD_FUSED_ADAM=1 && pip install deepspeed==0.9.5
git clone https://github.com/microsoft/DeepSpeed.git
cd DeepSpeed
DS_BUILD_UTILS=1 DS_BUILD_FUSED_ADAM=1 pip install .
cd .. && pip install -r ./requirements.txt
LR=1e-4
MASTER_PORT="23456"

deepspeed --num_gpus=$NUM_GPUS  --master_port $MASTER_PORT main_tuning.py \
    --deepspeed deepspeed.json \
    --do_train \
    --train_file $TRAIN_DATASET \
    --validation_file $TEST_DATASET \
    --prompt_column ${PROMPT_COLUMN} \
    --response_column ${RESPONSE_COLUMN} \
    --overwrite_cache \
    --model_name_or_path ${MODEL_NAME_OR_PATH}   \
    --output_dir ${OUTPUT_DIR} \
    --model_output_s3_path ${MODEL_OUTPUT_S3_PATH} \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 64 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --predict_with_generate \
    --max_steps ${TRAIN_STEPS} \
    --logging_steps 10 \
    --save_steps ${TRAIN_STEPS} \
    --learning_rate $LR \
    --fp16

#./s5cmd sync /tmp/output/ s3://$MODEL_S3_BUCKET/models/chatglm-lora/output/$(date +%Y-%m-%d-%H-%M-%S)/

Overwriting ./ChatGLM2-6B/ptuning/sm_fulltuning_train.sh


In [12]:
%%writefile ChatGLM2-6B/ptuning/requirements.txt
protobuf
#git+https://github.com/huggingface/transformers.git@68d640f7c368bcaaaecfc678f11908ebbd3d6176
transformers==4.30.1
cpm_kernels
#torch>=2.0
gradio
mdtex2html
sentencepiece
accelerate==0.23.0
sse-starlette
streamlit>=1.24.0
datasets
huggingface
jieba
rouge_chinese
nltk

Overwriting ChatGLM2-6B/ptuning/requirements.txt


In [13]:
# Define Training Job Name
import time
from sagemaker.huggingface import HuggingFace
from sagemaker.estimator import Estimator

job_name = f'huggingface-chatglm-finetune-fulltuning-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

instance_type  = "ml.p4d.24xlarge"
#instance_type  = "ml.g5.48xlarge"
instance_count = 1
processes_per_host = 8

# 基础模型存放地址
model_name_or_path = 's3://{}/llm/models/chatglm2/original-6B/'.format(sagemaker_default_bucket)

# 微调模型输出地址
#output_dir         = '/opt/ml/model/adgen-chatglm2-6b-ft'
output_dir         = '/tmp/model/adgen-chatglm2-6b-ft/'
model_s3_path      = 's3://{}/llm/models/chatglm2/finetune-fulltuning-adgen/'.format(sagemaker_default_bucket)

# 模型环境变量设置
environment = {
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:32',
    'NUM_GPUS'               : str(processes_per_host),
    'TRAIN_DATASET'          : '/opt/ml/input/data/KuaFuTrainData/train.json',
    'TEST_DATASET'           : '/opt/ml/input/data/KuaFuTrainData/dev.json',
    'PROMPT_COLUMN'          : 'content',
    'RESPONSE_COLUMN'        : 'summary',
    'MODEL_NAME_OR_PATH'     : model_name_or_path,
    'OUTPUT_DIR'             : output_dir,
    'MODEL_OUTPUT_S3_PATH'   : model_s3_path,
    'TRAIN_STEPS'            : '50'
}

inputs = {
   'KuaFuTrainData': f"s3://{sagemaker_default_bucket}/llm/datasets/chatglm2/KuaFuTrainData/"
}

In [14]:
## The image uri which is build and pushed above
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account, region, repo_name)
image_uri

'687912291502.dkr.ecr.us-west-2.amazonaws.com/sagemaker-chatglm2-demo:latest'

In [15]:
# create the Estimator
huggingface_estimator = Estimator(
    entry_point          = 'sm_fulltuning_train.sh',
    source_dir           = './ChatGLM2-6B/ptuning',
    image_uri            = image_uri,
    instance_type        = instance_type,
    instance_count       = instance_count,
    base_job_name        = job_name,
    role                 = role,
    environment          = environment
)

In [16]:
huggingface_estimator.fit(inputs=inputs)

INFO:sagemaker:Creating training-job with name: huggingface-chatglm-finetune-fulltuning-2024-09-28-13-51-23-173


2024-09-28 13:51:25 Starting - Starting the training job
2024-09-28 13:51:25 Pending - Training job waiting for capacity......
2024-09-28 13:52:18 Pending - Preparing the instances for training..............................
2024-09-28 13:57:12 Downloading - Downloading input data...
2024-09-28 13:57:37 Downloading - Downloading the training image.....................
2024-09-28 14:01:19 Training - Training image download completed. Training in progress......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-09-28 14:02:11,425 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-09-28 14:02:11,538 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-09-28 14:02:11,545 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-09-28 14:02:11,547 sagemaker_p

## 4.4 部署微调模型

In [None]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

endpoint_name         = None
instance_type         = 'ml.g4dn.2xlarge'
instance_count        = 1

predictor = model.deploy(
    endpoint_name          = endpoint_name,
    instance_type          = instance_type, 
    initial_instance_count = instance_count,
    serializer             = JSONSerializer(),
    deserializer           = JSONDeserializer()
)

## 4.5 测试微调模型

In [None]:
# Wait model loading

import time

time.sleep(3)

In [None]:
inputs = {
    "ask": "需求:完成用户的高低肩分析报告|数据信息:性别女 身高174cm 年龄23;体型:标准;高低肩数值: 1.2 可能存在左高"

}

response = predictor.predict(inputs)
print(response["answer"])


In [None]:
inputs = {
    "ask": "告诉我，你是谁？您能干什么？"

}

response = predictor.predict(inputs)
print(response["answer"])


In [None]:
inputs = {
    "ask": "对于ChatGLM的数据微调工作，你有什么想说的吗？"

}

response = predictor.predict(inputs)
print(response["answer"])


In [None]:
inputs = {
    "ask": "需求:完成用户的体型分析报告|数据信息:性别男 身高175cm 年龄35;体型:标准;身体成分数据:体重 71.9kg [60.6-82], 肌肉质量 58.7kg [48.7-59.5], 体脂肪 10.3kg [8.1-16.2], 体脂率 14.3% [10-20], BMI 23.5 [18.5-24], 腰臀比 0.82 [0.8-0.9], 内脏脂肪等级 3 [0.9-10]"
}

response = predictor.predict(inputs)
print(response["answer"])


## 4.6 清除资源

In [None]:
predictor.delete_endpoint()

In [4]:
!pip install huggingface==0.99999

[31mERROR: Could not find a version that satisfies the requirement huggingface==0.99999 (from versions: 0.0.1)[0m[31m
[0m[31mERROR: No matching distribution found for huggingface==0.99999[0m[31m
[0m