## initial sagemaker env

In [1]:
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
account_id = boto3.client('sts').get_caller_identity().get('Account')
region_name = boto3.session.Session().region_name
images_s3uri = 's3://{0}/hunyuan-lora-train/dataset/'.format(bucket)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [10]:
import os
train_image_dir = "./images"
docker_file_dir = "./dockerfile"
os.makedirs(train_image_dir, exist_ok=True)
os.makedirs(docker_file_dir, exist_ok=True)

## Prepare training datasets and Dockerfile(docker image for training job)

#### 准备你自己的png/txt pair 打标文件放到images路径下

In [11]:
%%writefile ./dataset_clear.sh
#如果是视频连续帧的打标，最好随机drop一些catiion：


percentage=30 ## drop百分比
directory="./images"  # 默认在当前目录执行，你可以修改为其他目录

# 获取所有的png文件
png_files=($(find "$directory" -maxdepth 1 -type f -name "*.png"))

# 计算需要删除的文件数量
num_files=${#png_files[@]}
num_to_delete=$((num_files * percentage / 100))

if [ $num_to_delete -eq 0 ]; then
    echo "No files to delete with the given percentage."
    exit 0
fi

# 随机选择要删除的文件
files_to_delete=($(shuf -n $num_to_delete -e "${png_files[@]}"))

# 删除选中的png文件和对应的txt文件
for png_file in "${files_to_delete[@]}"; do
    # 提取文件名（不包含扩展名）
    file_name=$(basename "$png_file" .png)
    
    # 构造对应的txt文件名
    txt_file="${file_name}.txt"
    
    # 删除文件
    rm "$directory/$png_file"
    rm "$directory/$txt_file"
    
    echo "Deleted: $png_file and $txt_file"
done

echo "Deletion completed."

Overwriting ./dataset_clear.sh


In [12]:
%%writefile ./images/dataset.toml
# resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
# otherwise, the default values will be used for each item

# general configurations
[general]
resolution = [720,1280]
caption_extension = ".txt"
batch_size = 1
enable_bucket = true
bucket_no_upscale = false

[[datasets]]
image_directory = "/opt/ml/input/data/lora_hunyuan/"
cache_directory = "/opt/ml/input/data/lora_hunyuan/"
num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.


Overwriting ./images/dataset.toml


## prepare train bootstrap scripts

In [19]:
%%writefile ./train_hunyuan_lora.sh

## upgrade lib
cd /tmp/ && git clone https://github.com/thu-ml/SageAttention.git && cd SageAttention && pip install -e .

## download models
mkdir -p /tmp/models && cd /tmp/models/
git clone https://github.com/Tencent/HunyuanVideo.git
huggingface-cli download tencent/HunyuanVideo --local-dir /tmp/models/hunyuan_ckpts

#### llava-llama
cd /tmp/models/hunyuan_ckpts
huggingface-cli download xtuner/llava-llama-3-8b-v1_1-transformers --local-dir ./llava-llama-3-8b-v1_1-transformers

#### 只需要llava-llama的text encoder
cd /tmp/models/hunyuan_ckpts
python /tmp/models/HunyuanVideo/hyvideo/utils/preprocess_text_encoder_tokenizer_utils.py \
       --input_dir /tmp/models/hunyuan_ckpts/llava-llama-3-8b-v1_1-transformers \
       --output_dir /tmp/models/hunyuan_ckpts/text_encoder


cd /tmp/models/hunyuan_ckpts
huggingface-cli download openai/clip-vit-large-patch14 --local-dir /tmp/models/hunyuan_ckpts/text_encoder_2

#### vae
mkdir -p /tmp/models/hunyuan_ckpts/vae \
         && cd /tmp/models/hunyuan_ckpts/vae/ \
         && wget https://huggingface.co/tencent/HunyuanVideo/resolve/main/hunyuan-video-t2v-720p/vae/pytorch_model.pt


##cache captions & image latents
cd /opt/ml/code/
python cache_latents.py --dataset_config /opt/ml/input/data/lora_hunyuan/dataset.toml \
                        --vae /tmp/models/hunyuan_ckpts/vae/pytorch_model.pt \
                        --vae_chunk_size 32 --vae_tiling


python cache_text_encoder_outputs.py --dataset_config /opt/ml/input/data/lora_hunyuan/dataset.toml  \
                        --text_encoder1 /tmp/models/hunyuan_ckpts/text_encoder \
                        --text_encoder2 /tmp/models/hunyuan_ckpts/text_encoder_2 \
                        --batch_size 16

## start train
accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py \
    --dit /tmp/models/hunyuan_ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states_fp8.pt \
    --dataset_config /opt/ml/input/data/lora_hunyuan/dataset.toml --sage_attn --split_attn --mixed_precision bf16 --fp8_base \
    --optimizer_type adamw --learning_rate 6e-4 --lr_scheduler cosine_with_restarts  --gradient_checkpointing  \
    --max_data_loader_n_workers 4 --persistent_data_loader_workers  \
    --network_module networks.lora --network_dim 32 --network_alpha 32 \
    --timestep_sampling sigmoid --discrete_flow_shift 1 \
    --max_train_epochs 300 --save_every_n_epochs 100 --seed 0 \
    --output_dir /opt/ml/model/lora_hunyuan --output_name hunyuan-lora

Overwriting ./train_hunyuan_lora.sh


## Prepare docker image

In [20]:
%%writefile ./Dockerfile
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.6.0-gpu-py312-cu126-ubuntu22.04-sagemaker

ENV PATH="/opt/ml/code:${PATH}"
ENV SAGEMAKER_SUBMIT_DIRECTORY /opt/ml/code
ENV DEBIAN_FRONTEND noninteractive

RUN git clone https://github.com/kohya-ss/musubi-tuner /opt/ml/code
RUN pip install --upgrade huggingface_hub

WORKDIR /opt/ml/code

COPY ./train_hunyuan_lora.sh /opt/ml/code/train_hunyuan_lora.sh
RUN pip install -r requirements.txt
RUN pip install wandb

RUN pip install -U --force-reinstall torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 
#RUN git clone https://github.com/thu-ml/SageAttention.git && cd SageAttention && pip install -e .
#RUN git clone -b v2.0.1 https://github.com/Dao-AILab/flash-attention.git&&cd flash-attention&&python setup.py install

Overwriting ./Dockerfile


## Build docker image and push to ECR

In [21]:
## You should change below region code to the region you used, here sample is use us-west-2
!aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [22]:
%%sh
algorithm_name=hunyuan-lora-taining-job

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.

aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
if [ $? -ne 0 ]
then
aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

#load public ECR image
#aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws

# Log into Docker
pwd=$(aws ecr get-login-password --region ${region})
docker login --username AWS -p ${pwd} ${account}.dkr.ecr.${region}.amazonaws.com

docker build -t ${algorithm_name} ./ -f ./Dockerfile
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded
Sending build context to Docker daemon  6.885MB
Step 1/11 : FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.6.0-gpu-py312-cu126-ubuntu22.04-sagemaker
 ---> d540f57b9239
Step 2/11 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 25ca7b5f09e5
Step 3/11 : ENV SAGEMAKER_SUBMIT_DIRECTORY /opt/ml/code
 ---> Using cache
 ---> 709fe84f7a93
Step 4/11 : ENV DEBIAN_FRONTEND noninteractive
 ---> Using cache
 ---> 0fe43d2f5952
Step 5/11 : RUN git clone https://github.com/kohya-ss/musubi-tuner /opt/ml/code
 ---> Using cache
 ---> 73745f290baf
Step 6/11 : RUN pip install --upgrade huggingface_hub
 ---> Using cache
 ---> f94c8a8f4988
Step 7/11 : WORKDIR /opt/ml/code
 ---> Using cache
 ---> 05e07fc6ec4f
Step 8/11 : COPY ./train_hunyuan_lora.sh /opt/ml/code/train_hunyuan_lora.sh
 ---> 8b5cd2255e2c
Step 9/11 : RUN pip install -r requirements.txt
 ---> Running in 16fbb888279f
Collecting accelerate==1.6.0 (from -r requirements.txt (line 1))
  Downloading accele

## Train models with SageMaker training job

In [23]:
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
account_id = boto3.client('sts').get_caller_identity().get('Account')
region_name = boto3.session.Session().region_name
images_s3uri = 's3://{0}/hunyuan-lora-train/dataset/'.format(bucket)

In [24]:
# Copy training dataset to S3 bucket
!aws s3 cp images $images_s3uri --recursive

upload: images/.ipynb_checkpoints/2-checkpoint.txt to s3://sagemaker-us-west-2-687912291502/hunyuan-lora-train/dataset/.ipynb_checkpoints/2-checkpoint.txt
upload: images/.ipynb_checkpoints/3-checkpoint.txt to s3://sagemaker-us-west-2-687912291502/hunyuan-lora-train/dataset/.ipynb_checkpoints/3-checkpoint.txt
upload: images/.ipynb_checkpoints/4-checkpoint.txt to s3://sagemaker-us-west-2-687912291502/hunyuan-lora-train/dataset/.ipynb_checkpoints/4-checkpoint.txt
upload: images/2.txt to s3://sagemaker-us-west-2-687912291502/hunyuan-lora-train/dataset/2.txt
upload: images/5.txt to s3://sagemaker-us-west-2-687912291502/hunyuan-lora-train/dataset/5.txt
upload: images/.ipynb_checkpoints/4-checkpoint.jpg to s3://sagemaker-us-west-2-687912291502/hunyuan-lora-train/dataset/.ipynb_checkpoints/4-checkpoint.jpg
upload: images/.ipynb_checkpoints/dataset-checkpoint.toml to s3://sagemaker-us-west-2-687912291502/hunyuan-lora-train/dataset/.ipynb_checkpoints/dataset-checkpoint.toml
upload: images/4.jpg 

***You need to provide your own "wandb_api_key" for below scripts***

In [25]:
docker_image_uri = '{0}.dkr.ecr.{1}.amazonaws.com/hunyuan-lora-taining-job'.format(account_id, region_name)
instance_type = 'ml.g6e.xlarge'

environment = {'LD_LIBRARY_PATH': "${LD_LIBRARY_PATH}:/opt/conda/lib/python3.11/site-packages/nvidia/nvjitlink/lib/"}


In [26]:
docker_image_uri

'687912291502.dkr.ecr.us-west-2.amazonaws.com/hunyuan-lora-taining-job'

In [27]:
from sagemaker.estimator import Estimator

inputs = {
    'lora_hunyuan': images_s3uri,
}

estimator = Estimator(
    entry_point="train_hunyuan_lora.sh",
    role = role,
    instance_count=1,
    instance_type = instance_type,
    image_uri = docker_image_uri,
    environment=environment,
    disable_output_compression = True,
    sagemaker_session=sagemaker_session,
    enable_remote_debug=True,
    
)
estimator.fit(inputs=inputs,wait=True)

2025-06-03 04:58:54 Starting - Starting the training job
2025-06-03 04:58:54 Pending - Training job waiting for capacity..................
2025-06-03 05:01:31 Pending - Preparing the instances for training...
2025-06-03 05:02:15 Downloading - Downloading the training image.................................
2025-06-03 05:07:42 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34mCUDA compat package should be installed for NVIDIA driver smaller than 560.35.05[0m
[34mCurrent installed NVIDIA driver version is 550.163.01[0m
[34mAdding CUDA compat to LD_LIBRARY_PATH[0m
[34m/usr/local/cuda/compat:${LD_LIBRARY_PATH}:/opt/conda/lib/python3.11/site-packages/nvidia/nvjitlink/lib/[0m
[34msed: can't read changehostname.c: No such file or directory[0m
[34mcc1: fatal error: changehostname.c: No such file or directory[0m
[34mcompilation term

In [None]:
### remote debug连接training job ssm容器实例
import time
job_name="lora-lora-taining-job-"+str(int(time.time()))
training_job_info = sagemaker_session.describe_training_job(job_name)
print(training_job_info)
#!aws ssm start-session --target sagemaker-training-job:${job_name}_algo-1

In [None]:
model_data = estimator.model_data
model_s3_path = model_data['S3DataSource']['S3Uri']
print("Model artifact saved at:", "\n"+model_s3_path+"\n")
!aws s3 ls {model_s3_path}