## 1. 导入 boto3, sagemaker python SDK

In [None]:
!pip install huggingface_hub

In [28]:
import sagemaker
import boto3
from sagemaker.pytorch import PyTorch
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
account_id = boto3.client('sts').get_caller_identity().get('Account')
region_name = boto3.session.Session().region_name

images_s3uri = 's3://{0}/dreambooth-xl/images/'.format(bucket)
models_s3uri = 's3://{0}/stable-diffusion/models/'.format(bucket)
dreambooth_s3uri = 's3://{0}/stable-diffusion/dreambooth/'.format(bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


#### 2. 构建 xl fine-tuning 以及webui推理的docker 镜像

In [29]:
!mkdir -p sd_xl_finetune_and_inference
!cd sd_xl_finetune_and_inference && git clone https://github.com/huggingface/diffusers

fatal: destination path 'diffusers' already exists and is not an empty directory.


In [30]:
%%writefile Dockerfile_train_and_inference
## You should change below region code to the region you used, here sample is use us-west-2

From 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04

################ stable diffusion webui ##########################
ENV DEBIAN_FRONTEND noninteractive
ENV PATH="/opt/ml/code:${PATH}"
ENV PYTHONPATH="/opt/ml/code"
ENV COMMANDLINE_ARGS="--skip-torch-cuda-test"

# webui dependency packages
RUN apt-get update && \
    apt-get install --assume-yes apt-utils vim wget git libgl1-mesa-glx -y && \
    rm -rf /var/lib/apt/lists/* && \
    pip install \
        opencv-python-headless \
        sagemaker-training \
        boto3 \
        uvicorn \
        sagemaker \
        diffusers==0.14.0 \
        accelerate==0.17.0 \
        controlnet_aux \
        wheel bitsandbytes \
        GPUtil \
        nvidia-ml-py \
        pynvml \
        clip-interrogator==0.6.0 \
        spacy \
        retrying \
        piexif \
        supervision==0.6.0 \
        roboflow \
        sagemaker-ssh-helper \
        chardet



############ dreambooth fine tune #################################
RUN pip install wandb
  ##############xforms0.0.21以上，默认开启flash attention v2######
RUN pip install xformers==0.0.21 --no-deps
#RUN echo "Y"|pip uninstall torchvision
#RUN pip install -U torchvision
RUN pip install bitsandbytes
  #############sagemaker 训练镜像torch版本较低（2.0.0），可以考虑升级到最新preview版本
RUN echo "Y"|pip uninstall torch
RUN echo "Y"|pip uninstall torchvision
RUN pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118


ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE

Overwriting Dockerfile_train_and_inference


* build & push docker镜像

In [31]:
## You should change below region code to the region you used, here sample is use us-west-2
!aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [32]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sd_xl_finetuning_and_inference"

In [33]:
%%script env repo_name=$repo_name bash

#!/usr/bin/env bash

# This script shows how to build the Docker image and push it to ECR to be ready for use
# by SageMaker.

# The argument to this script is the image name. This will be used as the image on the local
# machine and combined with the account and region to form the repository name for ECR.
# The name of our algorithm
algorithm_name=${repo_name}

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} -f ./Dockerfile_train_and_inference ./
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded
Sending build context to Docker daemon  144.4MB
Step 1/12 : From 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04
 ---> 1f37d018af76
Step 2/12 : ENV DEBIAN_FRONTEND noninteractive
 ---> Using cache
 ---> d71b63870aab
Step 3/12 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> dd100dd013bf
Step 4/12 : ENV PYTHONPATH="/opt/ml/code"
 ---> Using cache
 ---> fb39f021ffa0
Step 5/12 : ENV COMMANDLINE_ARGS="--skip-torch-cuda-test"
 ---> Using cache
 ---> 91be63bd8408
Step 6/12 : RUN apt-get update &&     apt-get install --assume-yes apt-utils vim wget git libgl1-mesa-glx -y &&     rm -rf /var/lib/apt/lists/* &&     pip install         opencv-python-headless         sagemaker-training         boto3         uvicorn         sagemaker         diffusers==0.14.0         accelerate==0.17.0         controlnet_aux         wheel bitsandbytes         GPUtil         nvidia-ml-py         pynvml         cli

* 准备训练图像

In [34]:
from huggingface_hub import snapshot_download

local_dir = "./dog"
snapshot_download(
    "diffusers/dog-example",
    local_dir=local_dir, repo_type="dataset",
    ignore_patterns=".gitattributes",
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

'/home/ec2-user/SageMaker/sd/dog'

In [35]:
!mkdir sd_xl_finetune_and_inference && ./cp s5cmd ./sd_xl_finetune_and_inference/
!chmod -R 777 ./sd_xl_finetune_and_inference
!./sd_xl_finetune_and_inference/s5cmd sync ./dog/ $images_s3uri

mkdir: cannot create directory ‘sd_xl_finetune_and_inference’: File exists
cp dog/alvan-nee-9M0tSjb-cpA-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-9M0tSjb-cpA-unsplash.jpeg
cp dog/alvan-nee-Id1DBHv4fbg-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-Id1DBHv4fbg-unsplash.jpeg
cp dog/alvan-nee-brFsZ7qszSY-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-brFsZ7qszSY-unsplash.jpeg
cp dog/alvan-nee-eoqnr8ikwFE-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-eoqnr8ikwFE-unsplash.jpeg
cp dog/alvan-nee-bQaAJCbNq3g-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-bQaAJCbNq3g-unsplash.jpeg


#### 3. 模型微调

   * image_uri: ecr仓库中的 docker 镜像地址
   * instance_type: 用于训练任务的实例大小 , 建议使用 ml.g4dn.xlarge, ml.g5.xlarge
   * class_prompt: 提示词类别
   * instance_prompt: 用于你的图片的关键词
   * model_name: 预训练的模型名称
   

In [41]:
%%writefile ./sd_xl_finetune_and_inference/train.sh

export WANDB_PROJECT="finetune_and_inference" 
export WANDB_API_KEY="298b59ce8a416fd45b5fa9ffc17fe72327854e0c"
export WANDB_WATCH="all"
export WANDB_ENTITY="121102723"

mkdir -p /tmp/dog
mkdir -p /tmp/output

chmod 777 ./s5cmd


cd diffusers && pip install -e .
cd examples/dreambooth/ && pip install -r requirements_sdxl.txt

cp -r /opt/ml/input/data/images/* /tmp/dog/
ls -lt /tmp/dog/

export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export INSTANCE_DIR="/tmp/dog/"
export OUTPUT_DIR="/tmp/ouput/"
#export OUTPUT_DIR="/opt/ml/model/"
export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
export dreambooth_s3uri="s3://sagemaker-us-west-2-687912291502/stable-diffusion/dreambooth/"

accelerate launch train_dreambooth_lora_sdxl.py \
  --gradient_checkpointing \
  --use_8bit_adam \
  --pretrained_model_name_or_path=$MODEL_NAME  \
  --instance_data_dir=$INSTANCE_DIR \
  --pretrained_vae_model_name_or_path=$VAE_PATH \
  --output_dir=$OUTPUT_DIR \
  --instance_prompt="a photo of sks dog" \
  --resolution=512 \
  --train_batch_size=1 \
  --gradient_accumulation_steps=2 \
  --learning_rate=1e-5 \
  --report_to="wandb" \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --max_train_steps=100 \
  --checkpointing_steps=100 \
  --validation_prompt="A photo of sks dog in a bucket" \
  --validation_epochs=1 \
  --seed="0" \
  --mixed_precision="fp16"
  #--enable_xformers_memory_efficient_attention
echo "train finished!"

ls -lt /tmp/output/
/opt/ml/code/s5cmd sync /tmp/ouput/ $dreambooth_s3uri/output/$(date +%Y-%m-%d-%H-%M-%S)/
echo "upload finished!"

Overwriting ./sd_xl_finetune_and_inference/train.sh


In [42]:
%%writefile ./sd_xl_finetune_and_inference/start_sd_webui.py
import subprocess
import os
import time
import requests
import json
import logging
import io
import base64
from PIL import Image, PngImagePlugin
from tenacity import retry, wait_exponential

logging.basicConfig(level=logging.INFO, filename='./webui.log', filemode='a')
logger = logging.getLogger(__name__)

@retry(wait=wait_exponential(multiplier=1, min=10, max=100))
def check_server(server_url):
    txt2img_url = server_url + "/sdapi/v1/txt2img"
    data = {
        'prompt': 'A photo of sks dog in a bucket',
        'sampler_index': 'DPM++ SDE',
        'seed': 1234,
        'steps': 20,
        'width': 512,
        'height': 512,
        'cfg_scale': 8
    }
    response = requests.post(txt2img_url, data=json.dumps(data),
                             headers={"Content-Type": "application/json"})
    response.raise_for_status()

def start_stable_diffusion(server_url="http://0.0.0.0:7860", log_file="./webui.log"):
    try:
        with open(log_file, "a") as f:
            process = subprocess.Popen(
                ["python", "/tmp/third-package/launch.py","--port" ,"7860", 
                 "--opt-sdp-attention", "--api", "--listen"],
                stdout=f, stderr=subprocess.STDOUT, preexec_fn=os.setpgrp)
        if process.returncode is not None:
            raise RuntimeError("Failed to start stable diffusion process.")
        time.sleep(100)
        check_server(server_url)
        logger.info("stable diffusion server started.")
        return server_url
    except Exception as error:
        logger.error(f"stable diffusion server failed, {error}")
        raise RuntimeError("Failed to start stable diffusion server or server not responding.")
        
def txt2image():
    server_url = "http://0.0.0.0:7860" 
    try:
        txt2img_url = server_url + "/sdapi/v1/txt2img"
        
        data = {
         'prompt': prompt,
         'sampler_index': 'DPM++ SDE',
         'seed': 1234,
         'steps': 40,
         'width': 512,
         'height': 512,
         'cfg_scale': 8
        }
  
        response = requests.post(txt2img_url, data=json.dumps(data), headers={"Content-Type": "application/json"})
        if response.status_code == 200:
            logger.info("stable diffusion server inference successed.")
            return response
    except Exception as error:
        logger.error(f"stable diffusion server inference failed, {error}")
        return None

start_stable_diffusion()
print("start webui success!")
r=txt2image().json()
print("test inference success!")
count=0
for i in r['images']:
    image = Image.open(io.BytesIO(base64.b64decode(i.split(",",1)[0])))
    count=count+1
    # 保存图像文件
    img.save("/tmp/output/images/"+str(count)+".png")


Overwriting ./sd_xl_finetune_and_inference/start_sd_webui.py


In [43]:
%%writefile ./sd_xl_finetune_and_inference/train_and_inference.sh

chmod -R 777 /opt/ml/code/*
##############sdxl dreambooth finetune#################### 
/opt/ml/code/train.sh


##############webui startup & inference##################
# clone webui code
mkdir -p /tmp/third-package
chmod -R 777 /tmp/third-package
git clone https://github.com/AUTOMATIC1111/stable-diffusion-webui /tmp/third-package/
pip install -r /tmp/third-package/requirements.txt

# copy fine tuned model
aws s3 cp s3://${bucket}/models/sd/sd_xl_base_1.0.safetensors /tmp/third-package/models/Stable-diffusion/
aws s3 cp s3://${bucket}/models/sd/sd_xl_base_1.0_0.9vae.safetensors /tmp/third-package/models/VAE/
cp -r /tmp/ouput/ /tmp/third-package/models/Lora/

# start inference and get output 
cd /opt/ml/code/  && python start_sd_webui.py 
/opt/ml/code/s5cmd sync /tmp/output/images/ s3://sagemaker-us-west-2-687912291502/stable-diffusion/dreambooth/output/

Overwriting ./sd_xl_finetune_and_inference/train_and_inference.sh


   ### 创建训练及推理一体的任务

In [44]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sd_xl_finetuning_and_inference"

In [None]:
import time
from sagemaker.estimator import Estimator
from sagemaker.pytorch.estimator import PyTorch

environment = {
    'PYTORCH_CUDA_ALLOC_CONF':'max_split_size_mb:32',
    'bucket':bucket
}

## The image uri which is build and pushed above
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account_id, region_name, repo_name)
base_job_name = 'sd-xl-dreambooth-finetuning-inference'
instance_type = 'ml.g5.2xlarge'
inputs = {
    'images': f"s3://{bucket}/dreambooth-xl/images/"
}

estimator = PyTorch(role=role,
                      entry_point='train_and_inference.sh',
                      source_dir='./sd_xl_finetune_and_inference/',
                      base_job_name=base_job_name,
                      instance_count=1,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      keep_alive_period_in_seconds=3600, #warmpool，为下一次训练保持机器&镜像（滚动续期，最大1hour）；需要开quota。
                      disable_profiler=True,
                      debugger_hook_config=False,
                      max_run=24*60*60*2)

estimator.fit(inputs)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using provided s3_resource


INFO:sagemaker:Creating training-job with name: sd-xl-dreambooth-finetuning-inference-2023-11-07-09-16-13-680


2023-11-07 09:16:20 Starting - Starting the training job...
2023-11-07 09:16:36 Starting - Preparing the instances for training......
2023-11-07 09:17:43 Downloading - Downloading input data...
2023-11-07 09:18:09 Training - Downloading the training image.................................
2023-11-07 09:23:30 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-11-07 09:24:07,781 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-11-07 09:24:07,795 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-11-07 09:24:07,803 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-11-07 09:24:07,805 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-11-

### 使用ssh helper 调试 trainning job

In [None]:
!pip install sagemaker-ssh-helper

In [None]:
%%writefile ./sd_xl_finetune_and_inference/setup_ssm.sh
SAGEMAKER_ROLE_ARN=arn:aws:iam::687912291502:role/service-role/AmazonSageMaker-ExecutionRole-20211013T113123
ACCOUNT_ID=687912291502
REGION=us-west-2

pip install 'sagemaker-ssh-helper[cdk]'
cdk bootstrap aws://"$ACCOUNT_ID"/"$REGION"
APP="python -m sagemaker_ssh_helper.cdk.iam_ssm_app"
AWS_REGION="$REGION" cdk -a "$APP" deploy SSH-IAM-SSM-Stack \
  -c sagemaker_role="$SAGEMAKER_ROLE_ARN" \
  -c user_role="$USER_ROLE_ARN"
APP="python -m sagemaker_ssh_helper.cdk.advanced_tier_app"
AWS_REGION="$REGION" cdk -a "$APP" deploy SSM-Advanced-Tier-Stack

In [12]:
%%writefile ./sd_xl_finetune_and_inference/train_and_inference.py
import sagemaker_ssh_helper
sagemaker_ssh_helper.setup_and_start_ssh()

import time
import os
import json
import socket
start_time = time.time()

if __name__ == "__main__":
    hosts = json.loads(os.environ['SM_HOSTS'])
    current_host = os.environ['SM_CURRENT_HOST']

    while True:
       current_time = time.time()
       if current_time - start_time >= 1200:
           break
    os.system("chmod +x ./s5cmd")
    os.system("/bin/bash train_and_inference.sh")

Overwriting ./sd_xl_finetune_and_inference/train_and_inference.py


In [13]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sd_xl_finetuning_and_inference"

In [20]:
import time
from sagemaker.estimator import Estimator
from sagemaker.pytorch.estimator import PyTorch
from sagemaker_ssh_helper.wrapper import SSHEstimatorWrapper

environment = {
    'PYTORCH_CUDA_ALLOC_CONF':'max_split_size_mb:32'
}

## The image uri which is build and pushed above
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account_id, region_name, repo_name)
base_job_name = 'sd-xl-dreambooth-finetuning-high'
instance_type = 'ml.g5.4xlarge'
inputs = {
    'images': f"s3://{bucket}/dreambooth-xl/images/"
}

estimator = PyTorch(role=role,
                      entry_point='train_and_inference.py',
                      source_dir='./sd_xl_finetune_and_inference/',
                      dependencies=[SSHEstimatorWrapper.dependency_dir()],
                      base_job_name=base_job_name,
                      instance_count=1,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      keep_alive_period_in_seconds=3600, #warmpool，为下一次训练保持机器&镜像（滚动续期，最大1hour）；需要开quota。
                      disable_profiler=True,
                      debugger_hook_config=False,
                      max_run=24*60*60*2)

ssh_wrapper = SSHEstimatorWrapper.create(estimator, connection_wait_time_seconds=600)  # <--NEW--
estimator.fit(inputs,wait=False)
print(f"To connect over SSH run: sm-local-ssh-training connect {ssh_wrapper.training_job_name()}")
instance_ids = ssh_wrapper.get_instance_ids(timeout_in_sec=900)  # <--NEW-- 
print(f"To connect over SSM run: aws ssm start-session --target {instance_ids[0]}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:sagemaker-ssh-helper:Turning on SSH to training job for estimator <class 'sagemaker.pytorch.estimator.PyTorch'>
INFO:sagemaker-ssh-helper:Passing 'AID**************D337' as a value of the SSHOwner tag of an SSM managed instance


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: sd-xl-dreambooth-finetuning-high-2023-11-06-07-26-05-518
INFO:sagemaker-ssh-helper:Resolving training instance IDs through SSM tags
INFO:sagemaker-ssh-helper:Remote training logs are at https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FTrainingJobs$3FlogStreamNameFilter$3Dsd-xl-dreambooth-finetuning-high-2023-11-06-07-26-05-518$252F
INFO:sagemaker-ssh-helper:Estimator metadata is at https://us-west-2.console.aws.amazon.com/sagemaker/home?region=us-west-2#/jobs/sd-xl-dreambooth-finetuning-high-2023-11-06-07-26-05-518
INFO:sagemaker-ssh-helper:SSMManager:Querying SSM instance IDs for training job sd-xl-dreambooth-finetuning-high-2023-11-06-07-26-05-518, expected instance count = 1
INFO:sagemaker-ssh-helper:SSMManager:Using AWS Region: us-west-2


To connect over SSH run: sm-local-ssh-training connect sd-xl-dreambooth-finetuning-high-2023-11-06-07-26-05-518


INFO:sagemaker-ssh-helper:SSMManager:No instance IDs found. Retrying. Is SSM Agent running on the remote? Check the remote logs. Seconds left before time out: 900
INFO:sagemaker-ssh-helper:SSMManager:No instance IDs found. Retrying. Is SSM Agent running on the remote? Check the remote logs. Seconds left before time out: 890
INFO:sagemaker-ssh-helper:SSMManager:No instance IDs found. Retrying. Is SSM Agent running on the remote? Check the remote logs. Seconds left before time out: 880
INFO:sagemaker-ssh-helper:SSMManager:No instance IDs found. Retrying. Is SSM Agent running on the remote? Check the remote logs. Seconds left before time out: 870
INFO:sagemaker-ssh-helper:SSMManager:No instance IDs found. Retrying. Is SSM Agent running on the remote? Check the remote logs. Seconds left before time out: 860
INFO:sagemaker-ssh-helper:SSMManager:No instance IDs found. Retrying. Is SSM Agent running on the remote? Check the remote logs. Seconds left before time out: 850
INFO:sagemaker-ssh-hel

To connect over SSM run: aws ssm start-session --target mi-041536d96547c133e


In [None]:
print("Model artifact saved at:\n", dreambooth_s3uri)