#### 1. 导入 boto3, sagemaker python SDK

In [12]:
import sagemaker
import boto3
from sagemaker.pytorch import PyTorch
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
account_id = boto3.client('sts').get_caller_identity().get('Account')
region_name = boto3.session.Session().region_name

images_s3uri = 's3://{0}/dreambooth-xl/images/'.format(bucket)
models_s3uri = 's3://{0}/stable-diffusion/models/'.format(bucket)
dreambooth_s3uri = 's3://{0}/stable-diffusion/dreambooth/'.format(bucket)

#### 2. 构建 xl fine-tuning 以及webui推理的docker 镜像

In [105]:
!aws s3 cp sd_xl_base_1.0_0.9vae.safetensors s3://$bucket/models/sd/

upload: ./sd_xl_base_1.0_0.9vae.safetensors to s3://sagemaker-us-west-2-687912291502/models/sd/sd_xl_base_1.0_0.9vae.safetensors


In [30]:
!mkdir -p sd_xl_finetune_and_inference
!cd sd_xl_finetune_and_inference && git clone https://github.com/huggingface/diffusers
#!rm -rf sd_xl_finetune_and_inference

Cloning into 'diffusers'...
remote: Enumerating objects: 40712, done.[K
remote: Counting objects: 100% (521/521), done.[K
remote: Compressing objects: 100% (281/281), done.[K
remote: Total 40712 (delta 309), reused 360 (delta 193), pack-reused 40191[K
Receiving objects: 100% (40712/40712), 27.09 MiB | 27.60 MiB/s, done.
Resolving deltas: 100% (30104/30104), done.


In [2]:
%%writefile Dockerfile_train_and_inference
## You should change below region code to the region you used, here sample is use us-west-2

From 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04

################ stable diffusion webui ##########################
ENV DEBIAN_FRONTEND noninteractive
ENV PATH="/opt/ml/code:${PATH}"
ENV PYTHONPATH="/opt/ml/code"
ENV COMMANDLINE_ARGS="--skip-torch-cuda-test"

# webui dependency packages
RUN apt-get update && \
    apt-get install --assume-yes apt-utils vim wget git libgl1-mesa-glx -y && \
    rm -rf /var/lib/apt/lists/* && \
    pip install \
        opencv-python-headless \
        sagemaker-training \
        boto3==1.26.64 \
        uvicorn \
        sagemaker \
        diffusers==0.14.0 \
        accelerate==0.17.0 \
        controlnet_aux \
        wheel bitsandbytes \
        GPUtil \
        nvidia-ml-py \
        pynvml \
        clip-interrogator==0.6.0 \
        spacy \
        retrying \
        piexif \
        supervision==0.6.0 \
        roboflow \
        sagemaker-ssh-helper \
        chardet



############ dreambooth fine tune #################################
RUN pip install wandb
RUN pip install xformers==0.0.18
RUN pip install bitsandbytes


ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE

Overwriting Dockerfile_train_and_inference


* build & push docker镜像

In [3]:
## You should change below region code to the region you used, here sample is use us-west-2
!aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [4]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sd_xl_finetuning_and_inference"

In [5]:
%%script env repo_name=$repo_name bash

#!/usr/bin/env bash

# This script shows how to build the Docker image and push it to ECR to be ready for use
# by SageMaker.

# The argument to this script is the image name. This will be used as the image on the local
# machine and combined with the account and region to form the repository name for ECR.
# The name of our algorithm
algorithm_name=${repo_name}

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} -f ./Dockerfile_train_and_inference ./
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded
Sending build context to Docker daemon  7.139GB
Step 1/15 : From 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04
 ---> 1f37d018af76
Step 2/15 : ENV DEBIAN_FRONTEND noninteractive
 ---> Using cache
 ---> 7552e679a5da
Step 3/15 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 3cb22f09900e
Step 4/15 : ENV PYTHONPATH="/opt/ml/code"
 ---> Using cache
 ---> 359dca05dd30
Step 5/15 : ENV COMMANDLINE_ARGS="--skip-torch-cuda-test"
 ---> Using cache
 ---> 0ce3d0d8d484
Step 6/15 : RUN apt-get update &&     apt-get install --assume-yes apt-utils vim wget git libgl1-mesa-glx -y &&     rm -rf /var/lib/apt/lists/* &&     pip install         opencv-python-headless         sagemaker-training         boto3==1.26.64         uvicorn         sagemaker         diffusers==0.14.0         accelerate==0.17.0         controlnet_aux         wheel bitsandbytes         GPUtil         nvidia-ml-py         pynvml   

* 准备训练图像

In [76]:
from huggingface_hub import snapshot_download

local_dir = "./dog"
snapshot_download(
    "diffusers/dog-example",
    local_dir=local_dir, repo_type="dataset",
    ignore_patterns=".gitattributes",
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

'/home/ec2-user/SageMaker/sd_xl/dog'

In [77]:
!mkdir sd_xl_finetune_and_inference && ./cp s5cmd ./sd_xl_finetune_and_inference/
!chmod -R 777 ./sd_xl_finetune_and_inference
!./sd_xl_finetune_and_inference/s5cmd sync ./dog/ $images_s3uri

mkdir: cannot create directory ‘sd_xl_finetune_and_inference’: File exists
cp dog/alvan-nee-bQaAJCbNq3g-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-bQaAJCbNq3g-unsplash.jpeg
cp dog/alvan-nee-9M0tSjb-cpA-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-9M0tSjb-cpA-unsplash.jpeg
cp dog/alvan-nee-eoqnr8ikwFE-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-eoqnr8ikwFE-unsplash.jpeg
cp dog/alvan-nee-Id1DBHv4fbg-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-Id1DBHv4fbg-unsplash.jpeg
cp dog/alvan-nee-brFsZ7qszSY-unsplash.jpeg s3://sagemaker-us-west-2-687912291502/dreambooth-xl/images/alvan-nee-brFsZ7qszSY-unsplash.jpeg


#### 3. 模型微调

   * image_uri: ecr仓库中的 docker 镜像地址
   * instance_type: 用于训练任务的实例大小 , 建议使用 ml.g4dn.xlarge, ml.g5.xlarge
   * class_prompt: 提示词类别
   * instance_prompt: 用于你的图片的关键词
   * model_name: 预训练的模型名称
   

In [15]:
!./s5cmd sync s3://sagemaker-us-west-2-687912291502/models/sd/* /tmp/third-package/models/Stable-diffusion/

In [6]:
%%writefile ./sd_xl_finetune_and_inference/train.sh


mkdir -p /tmp/dog
ls -lt ./
chmod 777 ./s5cmd


cd diffusers && pip install -e .
cd examples/dreambooth/ && pip install -r requirements_sdxl.txt

cp -r /opt/ml/input/data/images/* /tmp/dog/

export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export INSTANCE_DIR="/tmp/dog/"
export OUTPUT_DIR="/tmp/ouput"
#export OUTPUT_DIR="/opt/ml/model/"
export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
export dreambooth_s3uri="s3://sagemaker-us-west-2-687912291502/stable-diffusion/dreambooth/"

accelerate launch /opt/ml/code/diffusers/examples/dreambooth/train_dreambooth_lora_sdxl.py \
  --gradient_checkpointing \
  --use_8bit_adam \
  --pretrained_model_name_or_path=$MODEL_NAME  \
  --instance_data_dir=$INSTANCE_DIR \
  --pretrained_vae_model_name_or_path=$VAE_PATH \
  --output_dir=$OUTPUT_DIR \
  --mixed_precision="fp16" \
  --instance_prompt="a photo of sks dog" \
  --resolution=1024 \
  --train_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --learning_rate=1e-5 \
  --report_to="wandb" \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --max_train_steps=500 \
  --validation_prompt="A photo of sks dog in a bucket" \
  --validation_epochs=25 \
  --seed="0" \
  --enable_xformers_memory_efficient_attention

/opt/ml/code/s5cmd sync /tmp/ouput/ $dreambooth_s3uri/output/$(date +%Y-%m-%d-%H-%M-%S)/

####在shell中拷贝webui需要的模型文件############
aws s3 cp s3://${bucket}/models/sd/sd_xl_base_1.0.safetensors /tmp/third-package/models/Stable-diffusion/
aws s3 cp s3://${bucket}/models/sd/sd_xl_base_1.0_0.9vae.safetensors /tmp/third-package/models/VAE/
cp -R /tmp/ouput/*  /tmp/third-package/models/Stable-diffusion/

Overwriting ./sd_xl_finetune_and_inference/train.sh


In [8]:
%%writefile ./sd_xl_finetune_and_inference/start_sd_webui.py
import subprocess
import os
import time
import requests
import json
import logging
from tenacity import retry, wait_exponential

logging.basicConfig(level=logging.INFO, filename='./webui.log', filemode='a')
logger = logging.getLogger(__name__)

@retry(wait=wait_exponential(multiplier=1, min=10, max=100), stop=stop_after_attempt(5))
def check_server(server_url):
    txt2img_url = server_url + "/sdapi/v1/txt2img"
    data = {
        'prompt': 'A photo of sks dog in a bucket',
        'sampler_index': 'DPM++ SDE',
        'seed': 1234,
        'steps': 20,
        'width': 512,
        'height': 512,
        'cfg_scale': 8
    }
    response = requests.post(txt2img_url, data=json.dumps(data),
                             headers={"Content-Type": "application/json"})
    response.raise_for_status()

def start_stable_diffusion(server_url="http://0.0.0.0:7860", log_file="./webui.log"):
    try:
        with open(log_file, "a") as f:
            process = subprocess.Popen(
                ["python", "/tmp/third-package/launch.py","--port" ,"8080", 
                 "--xformers", "--api", "--listen"],
                stdout=f, stderr=subprocess.STDOUT, preexec_fn=os.setpgrp)
        if process.returncode is not None:
            raise RuntimeError("Failed to start stable diffusion process.")
        time.sleep(100)
        check_server(server_url)
        logger.info("stable diffusion server started.")
        return server_url
    except Exception as error:
        logger.error(f"stable diffusion server failed, {error}")
        raise RuntimeError("Failed to start stable diffusion server or server not responding.")
        
def txt2image():
    server_url = "http://0.0.0.0:7860" 
    max_retries = 5
    retry_count = 0
    while retry_count < max_retries:
      try:
        txt2img_url = server_url + "/sdapi/v1/txt2img"
        
        data = {
         'prompt': 'A photo of sks dog in a bucket',
         'sampler_index': 'DPM++ SDE',
         'seed': 1234,
         'steps': 20,
         'width': 512,
         'height': 512,
         'cfg_scale': 8
        }
  
      response = requests.post(txt2img_url, data=json.dumps(data), headers={"Content-Type": "application/json"})
      if response.status_code == 200:
        log_and_raise("info", "stable diffusion server started.")
        return server_url
      except Exception as error:
        log_and_raise("error", f"stable diffusion server failed, {error}")

start_stable_diffusion()
txt2image()

Overwriting ./sd_xl_finetune_and_inference/start_sd_webui.py


In [9]:
%%writefile ./sd_xl_finetune_and_inference/train_and_inference.sh

sudo chmod -R 777 /opt/ml/code/*
##############sdxl dreambooth finetune#################### 
/opt/ml/code/train.sh 
/opt/ml/code/s5cmd sync /tmp/ouput/ $dreambooth_s3uri/output/$(date +%Y-%m-%d-%H-%M-%S)/

##############webui startup & inference##################
# clone webui code并copy到docker内
mkdir -p /tmp/third-package
chmod 755 /tmp/third-package

git clone https://github.com/AUTOMATIC1111/stable-diffusion-webui /tmp/third-package/
cd /opt/ml/code/  && python start_sd_webui.py 

Overwriting ./sd_xl_finetune_and_inference/train_and_inference.sh


   ### 创建训练及推理一体的任务

In [10]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sd_xl_finetuning_and_inference"

In [None]:
import time
from sagemaker.estimator import Estimator
from sagemaker.pytorch.estimator import PyTorch

environment = {
    'PYTORCH_CUDA_ALLOC_CONF':'max_split_size_mb:32',
    'bucket':bucket
}

## The image uri which is build and pushed above
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account_id, region_name, repo_name)
base_job_name = 'sd-xl-dreambooth-finetuning-high'
instance_type = 'ml.g5.2xlarge'
inputs = {
    'images': f"s3://{bucket}/dreambooth-xl/images/"
}

estimator = PyTorch(role=role,
                      entry_point='train_and_inference.sh',
                      source_dir='./sd_xl_finetune_and_inference/',
                      base_job_name=base_job_name,
                      instance_count=1,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      keep_alive_period_in_seconds=3600, #warmpool，为下一次训练保持机器&镜像（滚动续期，最大1hour）；需要开quota。
                      disable_profiler=True,
                      debugger_hook_config=False,
                      max_run=24*60*60*2)

estimator.fit(inputs)

### 使用ssh helper 调试 trainning job

In [10]:
%%writefile ./sd_xl_finetune_and_inference/setup_ssm.sh
SAGEMAKER_ROLE_ARN=arn:aws:iam::687912291502:role/service-role/AmazonSageMaker-ExecutionRole-20211013T113123
ACCOUNT_ID=687912291502
REGION=us-west-2

pip install 'sagemaker-ssh-helper[cdk]'
cdk bootstrap aws://"$ACCOUNT_ID"/"$REGION"
APP="python -m sagemaker_ssh_helper.cdk.iam_ssm_app"
AWS_REGION="$REGION" cdk -a "$APP" deploy SSH-IAM-SSM-Stack \
  -c sagemaker_role="$SAGEMAKER_ROLE_ARN" \
  -c user_role="$USER_ROLE_ARN"
APP="python -m sagemaker_ssh_helper.cdk.advanced_tier_app"
AWS_REGION="$REGION" cdk -a "$APP" deploy SSM-Advanced-Tier-Stack

Writing ./sd_xl_finetune_and_inference/setup_ssm.sh


In [13]:
%%writefile ./sd_xl_finetune_and_inference/train_and_inference.py
import sagemaker_ssh_helper
sagemaker_ssh_helper.setup_and_start_ssh()

import time
import os
import json
import socket
start_time = time.time()

if __name__ == "__main__":
    hosts = json.loads(os.environ['SM_HOSTS'])
    current_host = os.environ['SM_CURRENT_HOST']

    while True:
       current_time = time.time()
       if current_time - start_time >= 1200:
           break
    os.system("chmod +x ./s5cmd")
    os.system("/bin/bash train_and_inference.sh")

Overwriting ./sd_xl_finetune_and_inference/train_and_inference.py


In [8]:
## define repo name, should contain *sagemaker* in the name
repo_name = "sd_xl_finetuning_and_inference"

In [None]:
import time
from sagemaker.estimator import Estimator
from sagemaker.pytorch.estimator import PyTorch
from sagemaker_ssh_helper.wrapper import SSHEstimatorWrapper

environment = {
    'PYTORCH_CUDA_ALLOC_CONF':'max_split_size_mb:32'
}

## The image uri which is build and pushed above
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account_id, region_name, repo_name)
base_job_name = 'sd-xl-dreambooth-finetuning-high'
instance_type = 'ml.g5.4xlarge'
inputs = {
    'images': f"s3://{bucket}/dreambooth-xl/images/"
}

estimator = PyTorch(role=role,
                      entry_point='train_and_inference.py',
                      source_dir='./sd_xl_finetune_and_inference/',
                      dependencies=[SSHEstimatorWrapper.dependency_dir()],
                      base_job_name=base_job_name,
                      instance_count=1,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      keep_alive_period_in_seconds=3600, #warmpool，为下一次训练保持机器&镜像（滚动续期，最大1hour）；需要开quota。
                      disable_profiler=True,
                      debugger_hook_config=False,
                      max_run=24*60*60*2)

ssh_wrapper = SSHEstimatorWrapper.create(estimator, connection_wait_time_seconds=600)  # <--NEW--
estimator.fit(inputs,wait=False)
print(f"To connect over SSH run: sm-local-ssh-training connect {ssh_wrapper.training_job_name()}")
instance_ids = ssh_wrapper.get_instance_ids(timeout_in_sec=900)  # <--NEW-- 
print(f"To connect over SSM run: aws ssm start-session --target {instance_ids[0]}")

In [66]:
print("Model artifact saved at:\n", dreambooth_s3uri)

Model artifact saved at:
 s3://sagemaker-us-west-2-687912291502/stable-diffusion/dreambooth/
