From ab6462d095fe2305fd5a8e6ab9ac794662880a99 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 11:53:13 -0600
Subject: [PATCH 1/8] feat(api): enable optimizations for SD pipelines based on env vars (#155)

---
 .../chain/upscale_stable_diffusion.py |  3 ++
 api/onnx_web/diffusion/load.py        | 29 +++++++++++++++++++
 api/onnx_web/utils.py                 |  3 ++
 3 files changed, 35 insertions(+)

diff --git a/api/onnx_web/chain/upscale_stable_diffusion.py b/api/onnx_web/chain/upscale_stable_diffusion.py
index c2032bfbc..ffdb00363 100644
--- a/api/onnx_web/chain/upscale_stable_diffusion.py
+++ b/api/onnx_web/chain/upscale_stable_diffusion.py
@@ -5,6 +5,7 @@
 from diffusers import StableDiffusionUpscalePipeline
 from PIL import Image
 
+from ..diffusion.load import optimize_pipeline
 from ..diffusion.pipeline_onnx_stable_diffusion_upscale import (
     OnnxStableDiffusionUpscalePipeline,
 )
@@ -52,6 +53,8 @@ def load_stable_diffusion(
     if not server.show_progress:
         pipe.set_progress_bar_config(disable=True)
 
+    optimize_pipeline(server, pipe)
+
     server.cache.set("diffusion", cache_key, pipe)
     run_gc([device])
 
diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index dceb509f2..53cd6af06 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -17,6 +17,7 @@
     KDPM2DiscreteScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
+    StableDiffusionPipeline,
 )
 
 try:
@@ -87,6 +88,32 @@ def get_tile_latents(
     return full_latents[:, :, y:yt, x:xt]
 
 
+def optimize_pipeline(
+    server: ServerContext,
+    pipe: StableDiffusionPipeline,
+) -> None:
+    if "attention-slicing" in server.optimizations:
+        logger.debug("enabling attention slicing on SD pipeline")
+        pipe.enable_attention_slicing()
+
+    if "vae-slicing" in server.optimizations:
+        logger.debug("enabling VAE slicing on SD pipeline")
+        pipe.enable_vae_slicing()
+
+    if "sequential-cpu-offload" in server.optimizations:
+        logger.debug("enabling sequential CPU offload on SD pipeline")
+        pipe.enable_sequential_cpu_offload()
+    elif "model-cpu-offload" in server.optimizations:
+        # TODO: check for accelerate
+        logger.debug("enabling model CPU offload on SD pipeline")
+        pipe.enable_model_cpu_offload()
+
+    if "memory-efficient-attention" in server.optimizations:
+        # TODO: check for xformers
+        logger.debug("enabling memory efficient attention for SD pipeline")
+        pipe.enable_xformers_memory_efficient_attention()
+
+
 def load_pipeline(
     server: ServerContext,
     pipeline: DiffusionPipeline,
@@ -151,6 +178,8 @@
     if not server.show_progress:
         pipe.set_progress_bar_config(disable=True)
 
+    optimize_pipeline(server, pipe)
+
     if device is not None and hasattr(pipe, "to"):
         pipe = pipe.to(device.torch_str())
 
diff --git a/api/onnx_web/utils.py b/api/onnx_web/utils.py
index 598606390..20d9d32c3 100644
--- a/api/onnx_web/utils.py
+++ b/api/onnx_web/utils.py
@@ -28,6 +28,7 @@ def __init__(
         cache: ModelCache = None,
         cache_path: str = None,
         show_progress: bool = True,
+        optimizations: List[str] = [],
     ) -> None:
         self.bundle_path = bundle_path
         self.model_path = model_path
@@ -42,6 +43,7 @@ def __init__(
         self.cache = cache or ModelCache(num_workers)
         self.cache_path = cache_path or path.join(model_path, ".cache")
         self.show_progress = show_progress
+        self.optimizations = optimizations
 
     @classmethod
     def from_environ(cls):
@@ -64,6 +66,7 @@ def from_environ(cls):
             image_format=environ.get("ONNX_WEB_IMAGE_FORMAT", "png"),
             cache=ModelCache(limit=cache_limit),
             show_progress=get_boolean(environ, "ONNX_WEB_SHOW_PROGRESS", True),
+            optimizations=environ.get("ONNX_WEB_OPTIMIZATIONS", "").split(","),
         )
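Patch 1 wires the optimization list from the environment into `ServerContext` and applies it whenever a pipeline is loaded. A standalone sketch of how the comma-delimited variable becomes the list that `optimize_pipeline` checks; `parse_optimizations` is a hypothetical name for illustration, not a helper in the patch:

```python
from os import environ
from typing import List


def parse_optimizations(value: str) -> List[str]:
    # "a,b".split(",") yields ["a", "b"]; note that "".split(",") yields
    # [""], so an unset variable produces a one-element list containing
    # the empty string, which simply never matches any optimization name
    return value.split(",")


optimizations = parse_optimizations(environ.get("ONNX_WEB_OPTIMIZATIONS", ""))
if "attention-slicing" in optimizations:
    print("attention slicing requested")
```

Note that the default `optimizations: List[str] = []` in `ServerContext.__init__` is a mutable default argument; it is harmless here because the list is only read, never mutated.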
From 118695d68cd7cc37df1de66ffe66c0f83dc6c819 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 11:57:18 -0600
Subject: [PATCH 2/8] fix(api): add error handling for optimizations

---
 api/onnx_web/diffusion/load.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 53cd6af06..38afb0abb 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -94,24 +94,41 @@ def optimize_pipeline(
 ) -> None:
     if "attention-slicing" in server.optimizations:
         logger.debug("enabling attention slicing on SD pipeline")
-        pipe.enable_attention_slicing()
+        try:
+            pipe.enable_attention_slicing()
+        except Exception as e:
+            logger.warning("error enabling attention slicing: %s", e)
 
     if "vae-slicing" in server.optimizations:
         logger.debug("enabling VAE slicing on SD pipeline")
-        pipe.enable_vae_slicing()
+        try:
+            pipe.enable_vae_slicing()
+        except Exception as e:
+            logger.warning("error enabling VAE slicing: %s", e)
 
     if "sequential-cpu-offload" in server.optimizations:
         logger.debug("enabling sequential CPU offload on SD pipeline")
-        pipe.enable_sequential_cpu_offload()
+        try:
+            pipe.enable_sequential_cpu_offload()
+        except Exception as e:
+            logger.warning("error enabling sequential CPU offload: %s", e)
+
     elif "model-cpu-offload" in server.optimizations:
         # TODO: check for accelerate
         logger.debug("enabling model CPU offload on SD pipeline")
-        pipe.enable_model_cpu_offload()
+        try:
+            pipe.enable_model_cpu_offload()
+        except Exception as e:
+            logger.warning("error enabling model CPU offload: %s", e)
+
 
     if "memory-efficient-attention" in server.optimizations:
         # TODO: check for xformers
         logger.debug("enabling memory efficient attention for SD pipeline")
-        pipe.enable_xformers_memory_efficient_attention()
+        try:
+            pipe.enable_xformers_memory_efficient_attention()
+        except Exception as e:
+            logger.warning("error enabling memory efficient attention: %s", e)
 
 
 def load_pipeline(
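Patch 2 wraps each `enable_*` call in its own try/except, since the ONNX pipelines do not implement every diffusers optimization and a failed call should be logged rather than abort pipeline loading. The repeated blocks could be condensed with a small helper; a hypothetical sketch, not part of the patch:

```python
import logging
from typing import Callable

logger = logging.getLogger(__name__)


def try_enable(name: str, enable: Callable[[], None]) -> None:
    # log and skip optimizations the current pipeline does not support,
    # instead of letting the exception unwind out of optimize_pipeline
    try:
        enable()
    except Exception as e:
        logger.warning("error enabling %s: %s", name, e)


# usage, assuming `pipe` is a loaded diffusers pipeline:
# try_enable("attention slicing", pipe.enable_attention_slicing)
```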
From f534fbb92cf7b83c0d3de614fb24de35d99a3041 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 11:59:39 -0600
Subject: [PATCH 3/8] fix(api): restore separate upscale and correction stages

---
 api/onnx_web/diffusion/load.py | 10 +++++-----
 api/onnx_web/server/upscale.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 38afb0abb..1d8a97d30 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -97,21 +97,21 @@ def optimize_pipeline(
         try:
             pipe.enable_attention_slicing()
         except Exception as e:
-            logger.warning("error enabling attention slicing: %s", e)
+            logger.warning("error while enabling attention slicing: %s", e)
 
     if "vae-slicing" in server.optimizations:
         logger.debug("enabling VAE slicing on SD pipeline")
         try:
             pipe.enable_vae_slicing()
         except Exception as e:
-            logger.warning("error enabling VAE slicing: %s", e)
+            logger.warning("error while enabling VAE slicing: %s", e)
 
     if "sequential-cpu-offload" in server.optimizations:
         logger.debug("enabling sequential CPU offload on SD pipeline")
         try:
             pipe.enable_sequential_cpu_offload()
         except Exception as e:
-            logger.warning("error enabling sequential CPU offload: %s", e)
+            logger.warning("error while enabling sequential CPU offload: %s", e)
 
     elif "model-cpu-offload" in server.optimizations:
         # TODO: check for accelerate
         logger.debug("enabling model CPU offload on SD pipeline")
@@ -119,7 +119,7 @@ def optimize_pipeline(
         try:
             pipe.enable_model_cpu_offload()
         except Exception as e:
-            logger.warning("error enabling model CPU offload: %s", e)
+            logger.warning("error while enabling model CPU offload: %s", e)
 
 
     if "memory-efficient-attention" in server.optimizations:
@@ -128,7 +128,7 @@ def optimize_pipeline(
         try:
             pipe.enable_xformers_memory_efficient_attention()
         except Exception as e:
-            logger.warning("error enabling memory efficient attention: %s", e)
+            logger.warning("error while enabling memory efficient attention: %s", e)
 
 
 def load_pipeline(
diff --git a/api/onnx_web/server/upscale.py b/api/onnx_web/server/upscale.py
index 128e7d3c4..725ae7dc4 100644
--- a/api/onnx_web/server/upscale.py
+++ b/api/onnx_web/server/upscale.py
@@ -34,6 +34,7 @@ def run_upscale_correction(
 
     chain = ChainPipeline()
 
+    upscale_stage = None
     if upscale.scale > 1:
         if "esrgan" in upscale.upscale_model:
             esrgan_params = StageParams(
@@ -42,23 +43,22 @@ def run_upscale_correction(
             )
             upscale_stage = (upscale_resrgan, esrgan_params, None)
         elif "stable-diffusion" in upscale.upscale_model:
             mini_tile = min(SizeChart.mini, stage.tile_size)
-            sd_stage = StageParams(tile_size=mini_tile, outscale=upscale.outscale)
-            upscale_stage = (upscale_stable_diffusion, sd_stage, None)
+            sd_params = StageParams(tile_size=mini_tile, outscale=upscale.outscale)
+            upscale_stage = (upscale_stable_diffusion, sd_params, None)
         else:
             logger.warn("unknown upscaling model: %s", upscale.upscale_model)
-            upscale_stage = None
 
+    correct_stage = None
     if upscale.faces:
-        face_stage = StageParams(
+        face_params = StageParams(
             tile_size=stage.tile_size, outscale=upscale.face_outscale
         )
         if "codeformer" in upscale.correction_model:
-            correct_stage = (correct_codeformer, face_stage, None)
+            correct_stage = (correct_codeformer, face_params, None)
         elif "gfpgan" in upscale.correction_model:
-            correct_stage = (correct_gfpgan, face_stage, None)
+            correct_stage = (correct_gfpgan, face_params, None)
         else:
             logger.warn("unknown correction model: %s", upscale.correction_model)
-            correct_stage = None
 
     if upscale.upscale_order == "correction-both":
         chain.append(correct_stage)

From 0d2211ff25b5f3a24edebe39883c30524bac95c0 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 14:14:13 -0600
Subject: [PATCH 4/8] apply lint

---
 api/onnx_web/diffusion/load.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 1d8a97d30..4506ec380 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -121,7 +121,6 @@ def optimize_pipeline(
         except Exception as e:
             logger.warning("error while enabling model CPU offload: %s", e)
 
-
     if "memory-efficient-attention" in server.optimizations:
         # TODO: check for xformers
         logger.debug("enabling memory efficient attention for SD pipeline")
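Patches 3 and 4 hoist the `upscale_stage = None` and `correct_stage = None` initializers above their conditionals. Without the hoisting, skipping the `if upscale.scale > 1` branch entirely would leave `upscale_stage` unbound, and the ordering logic below it would hit an unbound local. A minimal sketch of the pattern, with simple stand-ins for the upscale parameters:

```python
scale, faces = 1, True  # stand-ins for upscale.scale and upscale.faces

# bind both stage names before the conditionals, so they exist even
# when neither branch assigns them
upscale_stage = None
if scale > 1:
    upscale_stage = ("upscale", {})

correct_stage = None
if faces:
    correct_stage = ("correct", {})

for stage in (correct_stage, upscale_stage):
    if stage is not None:
        print("appending stage:", stage[0])
```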
From 5b4c370a1b6b3e809be34c14ab43978918973b23 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 15:44:39 -0600
Subject: [PATCH 5/8] feat(api): enable ONNX optimizations through env

---
 api/onnx_web/params.py | 32 ++++++++++++++++++++++++++++----
 api/onnx_web/serve.py  | 17 +++++++++++++++--
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index 40eb2257b..fdd7ab9ac 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -1,7 +1,10 @@
 from enum import IntEnum
-from typing import Any, Dict, Literal, Optional, Tuple, Union
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
-from onnxruntime import SessionOptions
+from onnxruntime import GraphOptimizationLevel, SessionOptions
+
+logger = getLogger(__name__)
 
 
 class SizeChart(IntEnum):
@@ -75,11 +78,16 @@ def tojson(self) -> Dict[str, int]:
 
 class DeviceParams:
     def __init__(
-        self, device: str, provider: str, options: Optional[dict] = None
+        self,
+        device: str,
+        provider: str,
+        options: Optional[dict] = None,
+        optimizations: Optional[List[str]] = None,
     ) -> None:
         self.device = device
         self.provider = provider
         self.options = options
+        self.optimizations = optimizations
 
     def __str__(self) -> str:
         return "%s - %s (%s)" % (self.device, self.provider, self.options)
@@ -91,7 +99,23 @@ def ort_provider(self) -> Tuple[str, Any]:
         return (self.provider, self.options)
 
     def sess_options(self) -> SessionOptions:
-        return SessionOptions()
+        sess = SessionOptions()
+
+        if "onnx-low-memory" in self.optimizations:
+            logger.debug("enabling ONNX low-memory optimizations")
+            sess.enable_cpu_mem_arena = False
+            sess.enable_mem_pattern = False
+            sess.enable_mem_reuse = False
+
+        if "onnx-optimization-disable" in self.optimizations:
+            sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
+        elif "onnx-optimization-basic" in self.optimizations:
+            sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
+        elif "onnx-optimization-all" in self.optimizations:
+            sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        if "onnx-deterministic-compute" in self.optimizations:
+            sess.use_deterministic_compute = True
 
     def torch_str(self) -> str:
         if self.device.startswith("cuda"):
diff --git a/api/onnx_web/serve.py b/api/onnx_web/serve.py
index 4818c64cb..503be4f0c 100644
--- a/api/onnx_web/serve.py
+++ b/api/onnx_web/serve.py
@@ -349,16 +349,29 @@ def load_platforms(context: ServerContext) -> None:
                     {
                         "device_id": i,
                     },
+                    context.optimizations,
                 )
             )
         else:
            available_platforms.append(
-                DeviceParams(potential, platform_providers[potential])
+                DeviceParams(
+                    potential,
+                    platform_providers[potential],
+                    None,
+                    context.optimizations,
+                )
             )
 
     if context.any_platform:
         # the platform should be ignored when the job is scheduled, but set to CPU just in case
-        available_platforms.append(DeviceParams("any", platform_providers["cpu"]))
+        available_platforms.append(
+            DeviceParams(
+                "any",
+                platform_providers["cpu"],
+                None,
+                context.optimizations,
+            )
+        )
 
     # make sure CPU is last on the list
     def any_first_cpu_last(a: DeviceParams, b: DeviceParams):
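Patch 5 threads the same optimization list into `DeviceParams.sess_options`, mapping the `onnx-*` flags onto ONNX Runtime `SessionOptions` fields. A standalone sketch of the same settings, assuming a plain list of names instead of a `DeviceParams` instance:

```python
from onnxruntime import GraphOptimizationLevel, SessionOptions

optimizations = ["onnx-low-memory", "onnx-optimization-basic"]

sess = SessionOptions()
if "onnx-low-memory" in optimizations:
    sess.enable_cpu_mem_arena = False  # no shared allocation arena
    sess.enable_mem_pattern = False    # no pre-planned allocations
    sess.enable_mem_reuse = False      # no buffer reuse between nodes

if "onnx-optimization-disable" in optimizations:
    sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
elif "onnx-optimization-basic" in optimizations:
    sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
elif "onnx-optimization-all" in optimizations:
    sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

print(sess.graph_optimization_level)
```

Note that `sess_options` as written in this patch builds the `SessionOptions` but never returns it; patch 6 below fixes that.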
From 881b290116cbf5875a21717d9301641aef539328 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 15:45:28 -0600
Subject: [PATCH 6/8] return session options properly

---
 api/onnx_web/params.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index fdd7ab9ac..59e911ff5 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -117,6 +117,8 @@ def sess_options(self) -> SessionOptions:
         if "onnx-deterministic-compute" in self.optimizations:
             sess.use_deterministic_compute = True
 
+        return sess
+
     def torch_str(self) -> str:
         if self.device.startswith("cuda"):
             return self.device

From e0a62ccbb5a606838c914d9a225c0a8ac306daa0 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 15:47:31 -0600
Subject: [PATCH 7/8] better ONNX optimization logging

---
 api/onnx_web/params.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index 59e911ff5..623dd02b9 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -108,13 +108,17 @@ def sess_options(self) -> SessionOptions:
             sess.enable_mem_reuse = False
 
         if "onnx-optimization-disable" in self.optimizations:
+            logger.debug("disabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
         elif "onnx-optimization-basic" in self.optimizations:
+            logger.debug("enabling basic ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
         elif "onnx-optimization-all" in self.optimizations:
+            logger.debug("enabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
 
         if "onnx-deterministic-compute" in self.optimizations:
+            logger.debug("enabling ONNX deterministic compute")
             sess.use_deterministic_compute = True
 
         return sess
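Patches 6 and 7 add the missing `return sess` and per-flag debug logging. The return matters because `InferenceSession` accepts `sess_options=None` silently and falls back to defaults, so the dropped options produced no error. A sketch of how the options object is consumed, with a hypothetical model path:

```python
from onnxruntime import InferenceSession, SessionOptions

opts = SessionOptions()
opts.use_deterministic_compute = True

# sess_options=None (the result of the missing return) would be accepted
# here without complaint, quietly discarding every configured option
session = InferenceSession(
    "model.onnx",  # hypothetical path to an exported ONNX model
    sess_options=opts,
    providers=["CPUExecutionProvider"],
)
```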
From bfdb071c2dfd9e1fe52342c02fb2d2381e028129 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 16:06:05 -0600
Subject: [PATCH 8/8] chore(docs): explain model optimizations

---
 api/onnx_web/diffusion/load.py | 10 +++++-----
 api/onnx_web/params.py         |  6 +++---
 docs/server-admin.md           | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 4506ec380..8516fe449 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -92,28 +92,28 @@ def optimize_pipeline(
     server: ServerContext,
     pipe: StableDiffusionPipeline,
 ) -> None:
-    if "attention-slicing" in server.optimizations:
+    if "diffusers-attention-slicing" in server.optimizations:
         logger.debug("enabling attention slicing on SD pipeline")
         try:
             pipe.enable_attention_slicing()
         except Exception as e:
             logger.warning("error while enabling attention slicing: %s", e)
 
-    if "vae-slicing" in server.optimizations:
+    if "diffusers-vae-slicing" in server.optimizations:
         logger.debug("enabling VAE slicing on SD pipeline")
         try:
             pipe.enable_vae_slicing()
         except Exception as e:
             logger.warning("error while enabling VAE slicing: %s", e)
 
-    if "sequential-cpu-offload" in server.optimizations:
+    if "diffusers-cpu-offload-sequential" in server.optimizations:
         logger.debug("enabling sequential CPU offload on SD pipeline")
         try:
             pipe.enable_sequential_cpu_offload()
         except Exception as e:
             logger.warning("error while enabling sequential CPU offload: %s", e)
 
-    elif "model-cpu-offload" in server.optimizations:
+    elif "diffusers-cpu-offload-model" in server.optimizations:
         # TODO: check for accelerate
         logger.debug("enabling model CPU offload on SD pipeline")
         try:
@@ -121,7 +121,7 @@ def optimize_pipeline(
         except Exception as e:
             logger.warning("error while enabling model CPU offload: %s", e)
 
-    if "memory-efficient-attention" in server.optimizations:
+    if "diffusers-memory-efficient-attention" in server.optimizations:
         # TODO: check for xformers
         logger.debug("enabling memory efficient attention for SD pipeline")
         try:
diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index 623dd02b9..c86a0a8b0 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -107,13 +107,13 @@ def sess_options(self) -> SessionOptions:
             sess.enable_mem_pattern = False
             sess.enable_mem_reuse = False
 
-        if "onnx-optimization-disable" in self.optimizations:
+        if "onnx-graph-disable" in self.optimizations:
             logger.debug("disabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
-        elif "onnx-optimization-basic" in self.optimizations:
+        elif "onnx-graph-basic" in self.optimizations:
             logger.debug("enabling basic ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
-        elif "onnx-optimization-all" in self.optimizations:
+        elif "onnx-graph-all" in self.optimizations:
             logger.debug("enabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
 
diff --git a/docs/server-admin.md b/docs/server-admin.md
index 23e4e07b1..3372ee96d 100644
--- a/docs/server-admin.md
+++ b/docs/server-admin.md
@@ -11,6 +11,7 @@ Please see [the user guide](user-guide.md) for descriptions of the client and ea
 - [Configuration](#configuration)
   - [Debug Mode](#debug-mode)
   - [Environment Variables](#environment-variables)
+  - [Pipeline Optimizations](#pipeline-optimizations)
   - [Server Parameters](#server-parameters)
 - [Containers](#containers)
   - [CPU](#cpu)
@@ -73,6 +74,39 @@ Others:
 - `ONNX_WEB_SHOW_PROGRESS`
   - show progress bars in the logs
   - disabling this can reduce noise in server logs, especially when logging to a file
+- `ONNX_WEB_OPTIMIZATIONS`
+  - comma-delimited list of optimizations to enable
+
+### Pipeline Optimizations
+
+- `diffusers-*`
+  - `diffusers-attention-slicing`
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-attention-for-additional-memory-savings
+  - `diffusers-cpu-offload-*`
+    - `diffusers-cpu-offload-sequential`
+      - not available for ONNX pipelines (most of them)
+      - https://huggingface.co/docs/diffusers/optimization/fp16#offloading-to-cpu-with-accelerate-for-memory-savings
+    - `diffusers-cpu-offload-model`
+      - not available for ONNX pipelines (most of them)
+      - https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings
+  - `diffusers-memory-efficient-attention`
+    - requires [the `xformers` library](https://huggingface.co/docs/diffusers/optimization/xformers)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention
+  - `diffusers-vae-slicing`
+    - not available for ONNX pipelines (most of them)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-vae-decode-for-larger-batches
+- `onnx-*`
+  - `onnx-low-memory`
+    - disable ONNX features that allocate more memory than is strictly required or keep memory after use
+  - `onnx-graph-*`
+    - `onnx-graph-disable`
+      - disable all ONNX graph optimizations
+    - `onnx-graph-basic`
+      - enable basic ONNX graph optimizations
+    - `onnx-graph-all`
+      - enable all ONNX graph optimizations
+  - `onnx-deterministic-compute`
+    - enable ONNX deterministic compute
 
 ### Server Parameters
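With patch 8, the documented optimization names can be combined freely; unrecognized names are never matched, so typos fail quietly rather than loudly. A hypothetical configuration mirroring the docs, set before the server reads its environment:

```python
from os import environ

# one flag from each documented group; a hypothetical combination,
# not a recommended default
environ["ONNX_WEB_OPTIMIZATIONS"] = ",".join([
    "diffusers-attention-slicing",
    "onnx-low-memory",
    "onnx-graph-basic",
])
```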