diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 4506ec380..8516fe449 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -92,28 +92,28 @@ def optimize_pipeline(
     server: ServerContext,
     pipe: StableDiffusionPipeline,
 ) -> None:
-    if "attention-slicing" in server.optimizations:
+    if "diffusers-attention-slicing" in server.optimizations:
         logger.debug("enabling attention slicing on SD pipeline")
         try:
             pipe.enable_attention_slicing()
         except Exception as e:
             logger.warning("error while enabling attention slicing: %s", e)
 
-    if "vae-slicing" in server.optimizations:
+    if "diffusers-vae-slicing" in server.optimizations:
         logger.debug("enabling VAE slicing on SD pipeline")
         try:
             pipe.enable_vae_slicing()
         except Exception as e:
             logger.warning("error while enabling VAE slicing: %s", e)
 
-    if "sequential-cpu-offload" in server.optimizations:
+    if "diffusers-cpu-offload-sequential" in server.optimizations:
         logger.debug("enabling sequential CPU offload on SD pipeline")
         try:
             pipe.enable_sequential_cpu_offload()
         except Exception as e:
             logger.warning("error while enabling sequential CPU offload: %s", e)
 
-    elif "model-cpu-offload" in server.optimizations:
+    elif "diffusers-cpu-offload-model" in server.optimizations:
         # TODO: check for accelerate
         logger.debug("enabling model CPU offload on SD pipeline")
         try:
@@ -121,7 +121,7 @@ def optimize_pipeline(
         except Exception as e:
             logger.warning("error while enabling model CPU offload: %s", e)
 
-    if "memory-efficient-attention" in server.optimizations:
+    if "diffusers-memory-efficient-attention" in server.optimizations:
         # TODO: check for xformers
         logger.debug("enabling memory efficient attention for SD pipeline")
         try:
diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index 623dd02b9..c86a0a8b0 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -107,13 +107,13 @@ def sess_options(self) -> SessionOptions:
             sess.enable_mem_pattern = False
             sess.enable_mem_reuse = False
 
-        if "onnx-optimization-disable" in self.optimizations:
+        if "onnx-graph-disable" in self.optimizations:
             logger.debug("disabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
-        elif "onnx-optimization-basic" in self.optimizations:
+        elif "onnx-graph-basic" in self.optimizations:
             logger.debug("enabling basic ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
-        elif "onnx-optimization-all" in self.optimizations:
+        elif "onnx-graph-all" in self.optimizations:
             logger.debug("enabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
 
diff --git a/docs/server-admin.md b/docs/server-admin.md
index 23e4e07b1..3372ee96d 100644
--- a/docs/server-admin.md
+++ b/docs/server-admin.md
@@ -11,6 +11,7 @@ Please see [the user guide](user-guide.md) for descriptions of the client and ea
 - [Configuration](#configuration)
   - [Debug Mode](#debug-mode)
   - [Environment Variables](#environment-variables)
+  - [Pipeline Optimizations](#pipeline-optimizations)
   - [Server Parameters](#server-parameters)
 - [Containers](#containers)
   - [CPU](#cpu)
@@ -73,6 +74,39 @@ Others:
 - `ONNX_WEB_SHOW_PROGRESS`
   - show progress bars in the logs
   - disabling this can reduce noise in server logs, especially when logging to a file
+- `ONNX_WEB_OPTIMIZATIONS`
+  - comma-delimited list of optimizations to enable
+
+### Pipeline Optimizations
+
+- `diffusers-*`
+  - `diffusers-attention-slicing`
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-attention-for-additional-memory-savings
+  - `diffusers-cpu-offload-*`
+    - `diffusers-cpu-offload-sequential`
+      - not available for ONNX pipelines (most of them)
+      - https://huggingface.co/docs/diffusers/optimization/fp16#offloading-to-cpu-with-accelerate-for-memory-savings
+    - `diffusers-cpu-offload-model`
+      - not available for ONNX pipelines (most of them)
+      - https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings
+  - `diffusers-memory-efficient-attention`
+    - requires [the `xformers` library](https://huggingface.co/docs/diffusers/optimization/xformers)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention
+  - `diffusers-vae-slicing`
+    - not available for ONNX pipelines (most of them)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-vae-decode-for-larger-batches
+- `onnx-*`
+  - `onnx-low-memory`
+    - disable ONNX features that allocate more memory than is strictly required or keep memory after use
+  - `onnx-graph-*`
+    - `onnx-graph-disable`
+      - disable all ONNX graph optimizations
+    - `onnx-graph-basic`
+      - enable basic ONNX graph optimizations
+    - `onnx-graph-all`
+      - enable all ONNX graph optimizations
+  - `onnx-deterministic-compute`
+    - enable ONNX deterministic compute
 
 ### Server Parameters
 