From ab6462d095fe2305fd5a8e6ab9ac794662880a99 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 11:53:13 -0600
Subject: [PATCH 1/8] feat(api): enable optimizations for SD pipelines based on env vars (#155)

---
 .../chain/upscale_stable_diffusion.py |  3 ++
 api/onnx_web/diffusion/load.py        | 29 +++++++++++++++++++
 api/onnx_web/utils.py                 |  3 ++
 3 files changed, 35 insertions(+)

diff --git a/api/onnx_web/chain/upscale_stable_diffusion.py b/api/onnx_web/chain/upscale_stable_diffusion.py
index c2032bfbc..ffdb00363 100644
--- a/api/onnx_web/chain/upscale_stable_diffusion.py
+++ b/api/onnx_web/chain/upscale_stable_diffusion.py
@@ -5,6 +5,7 @@
 from diffusers import StableDiffusionUpscalePipeline
 from PIL import Image
 
+from ..diffusion.load import optimize_pipeline
 from ..diffusion.pipeline_onnx_stable_diffusion_upscale import (
     OnnxStableDiffusionUpscalePipeline,
 )
@@ -52,6 +53,8 @@ def load_stable_diffusion(
     if not server.show_progress:
         pipe.set_progress_bar_config(disable=True)
 
+    optimize_pipeline(server, pipe)
+
     server.cache.set("diffusion", cache_key, pipe)
     run_gc([device])
 
diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index dceb509f2..53cd6af06 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -17,6 +17,7 @@
     KDPM2DiscreteScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
+    StableDiffusionPipeline,
 )
 
 try:
@@ -87,6 +88,32 @@ def get_tile_latents(
     return full_latents[:, :, y:yt, x:xt]
 
 
+def optimize_pipeline(
+    server: ServerContext,
+    pipe: StableDiffusionPipeline,
+) -> None:
+    if "attention-slicing" in server.optimizations:
+        logger.debug("enabling attention slicing on SD pipeline")
+        pipe.enable_attention_slicing()
+
+    if "vae-slicing" in server.optimizations:
+        logger.debug("enabling VAE slicing on SD pipeline")
+        pipe.enable_vae_slicing()
+
+    if "sequential-cpu-offload" in server.optimizations:
+        logger.debug("enabling sequential CPU offload on SD pipeline")
+        pipe.enable_sequential_cpu_offload()
+    elif "model-cpu-offload" in server.optimizations:
+        # TODO: check for accelerate
+        logger.debug("enabling model CPU offload on SD pipeline")
+        pipe.enable_model_cpu_offload()
+
+    if "memory-efficient-attention" in server.optimizations:
+        # TODO: check for xformers
+        logger.debug("enabling memory efficient attention for SD pipeline")
+        pipe.enable_xformers_memory_efficient_attention()
+
+
 def load_pipeline(
     server: ServerContext,
     pipeline: DiffusionPipeline,
@@ -151,6 +178,8 @@
     if not server.show_progress:
         pipe.set_progress_bar_config(disable=True)
 
+    optimize_pipeline(server, pipe)
+
     if device is not None and hasattr(pipe, "to"):
         pipe = pipe.to(device.torch_str())
 
diff --git a/api/onnx_web/utils.py b/api/onnx_web/utils.py
index 598606390..20d9d32c3 100644
--- a/api/onnx_web/utils.py
+++ b/api/onnx_web/utils.py
@@ -28,6 +28,7 @@ def __init__(
         cache: ModelCache = None,
         cache_path: str = None,
         show_progress: bool = True,
+        optimizations: List[str] = [],
     ) -> None:
         self.bundle_path = bundle_path
         self.model_path = model_path
@@ -42,6 +43,7 @@ def __init__(
         self.cache = cache or ModelCache(num_workers)
         self.cache_path = cache_path or path.join(model_path, ".cache")
         self.show_progress = show_progress
+        self.optimizations = optimizations
 
     @classmethod
     def from_environ(cls):
@@ -64,6 +66,7 @@ def from_environ(cls):
             image_format=environ.get("ONNX_WEB_IMAGE_FORMAT", "png"),
             cache=ModelCache(limit=cache_limit),
             show_progress=get_boolean(environ, "ONNX_WEB_SHOW_PROGRESS", True),
+            optimizations=environ.get("ONNX_WEB_OPTIMIZATIONS", "").split(","),
         )
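Patch 1 wires the optimization list from the environment into `ServerContext` and applies it whenever a pipeline is loaded. A standalone sketch of how the comma-delimited variable becomes the list that `optimize_pipeline` checks; `parse_optimizations` is a hypothetical name for illustration, not a helper in the patch:

```python
from os import environ
from typing import List


def parse_optimizations(value: str) -> List[str]:
    # "a,b".split(",") yields ["a", "b"]; note that "".split(",") yields
    # [""], so an unset variable produces a one-element list containing
    # the empty string, which simply never matches any optimization name
    return value.split(",")


optimizations = parse_optimizations(environ.get("ONNX_WEB_OPTIMIZATIONS", ""))
if "attention-slicing" in optimizations:
    print("attention slicing requested")
```

Note that the default `optimizations: List[str] = []` in `ServerContext.__init__` is a mutable default argument; it is harmless here because the list is only read, never mutated.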
From 118695d68cd7cc37df1de66ffe66c0f83dc6c819 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 11:57:18 -0600
Subject: [PATCH 2/8] fix(api): add error handling for optimizations

---
 api/onnx_web/diffusion/load.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 53cd6af06..38afb0abb 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -94,24 +94,41 @@ def optimize_pipeline(
 ) -> None:
     if "attention-slicing" in server.optimizations:
         logger.debug("enabling attention slicing on SD pipeline")
-        pipe.enable_attention_slicing()
+        try:
+            pipe.enable_attention_slicing()
+        except Exception as e:
+            logger.warning("error enabling attention slicing: %s", e)
 
     if "vae-slicing" in server.optimizations:
         logger.debug("enabling VAE slicing on SD pipeline")
-        pipe.enable_vae_slicing()
+        try:
+            pipe.enable_vae_slicing()
+        except Exception as e:
+            logger.warning("error enabling VAE slicing: %s", e)
 
     if "sequential-cpu-offload" in server.optimizations:
         logger.debug("enabling sequential CPU offload on SD pipeline")
-        pipe.enable_sequential_cpu_offload()
+        try:
+            pipe.enable_sequential_cpu_offload()
+        except Exception as e:
+            logger.warning("error enabling sequential CPU offload: %s", e)
+
     elif "model-cpu-offload" in server.optimizations:
         # TODO: check for accelerate
         logger.debug("enabling model CPU offload on SD pipeline")
-        pipe.enable_model_cpu_offload()
+        try:
+            pipe.enable_model_cpu_offload()
+        except Exception as e:
+            logger.warning("error enabling model CPU offload: %s", e)
+
 
     if "memory-efficient-attention" in server.optimizations:
         # TODO: check for xformers
         logger.debug("enabling memory efficient attention for SD pipeline")
-        pipe.enable_xformers_memory_efficient_attention()
+        try:
+            pipe.enable_xformers_memory_efficient_attention()
+        except Exception as e:
+            logger.warning("error enabling memory efficient attention: %s", e)
 
 
 def load_pipeline(
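Patch 2 wraps each `enable_*` call in its own try/except, since the ONNX pipelines do not implement every diffusers optimization and a failed call should be logged rather than abort pipeline loading. The repeated blocks could be condensed with a small helper; a hypothetical sketch, not part of the patch:

```python
import logging
from typing import Callable

logger = logging.getLogger(__name__)


def try_enable(name: str, enable: Callable[[], None]) -> None:
    # log and skip optimizations the current pipeline does not support,
    # instead of letting the exception unwind out of optimize_pipeline
    try:
        enable()
    except Exception as e:
        logger.warning("error enabling %s: %s", name, e)


# usage, assuming `pipe` is a loaded diffusers pipeline:
# try_enable("attention slicing", pipe.enable_attention_slicing)
```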
From f534fbb92cf7b83c0d3de614fb24de35d99a3041 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 11:59:39 -0600
Subject: [PATCH 3/8] fix(api): restore separate upscale and correction stages

---
 api/onnx_web/diffusion/load.py | 10 +++++-----
 api/onnx_web/server/upscale.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 38afb0abb..1d8a97d30 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -97,21 +97,21 @@ def optimize_pipeline(
         try:
             pipe.enable_attention_slicing()
         except Exception as e:
-            logger.warning("error enabling attention slicing: %s", e)
+            logger.warning("error while enabling attention slicing: %s", e)
 
     if "vae-slicing" in server.optimizations:
         logger.debug("enabling VAE slicing on SD pipeline")
         try:
             pipe.enable_vae_slicing()
         except Exception as e:
-            logger.warning("error enabling VAE slicing: %s", e)
+            logger.warning("error while enabling VAE slicing: %s", e)
 
     if "sequential-cpu-offload" in server.optimizations:
         logger.debug("enabling sequential CPU offload on SD pipeline")
         try:
             pipe.enable_sequential_cpu_offload()
         except Exception as e:
-            logger.warning("error enabling sequential CPU offload: %s", e)
+            logger.warning("error while enabling sequential CPU offload: %s", e)
 
     elif "model-cpu-offload" in server.optimizations:
         # TODO: check for accelerate
         logger.debug("enabling model CPU offload on SD pipeline")
@@ -119,7 +119,7 @@ def optimize_pipeline(
         try:
             pipe.enable_model_cpu_offload()
         except Exception as e:
-            logger.warning("error enabling model CPU offload: %s", e)
+            logger.warning("error while enabling model CPU offload: %s", e)
 
 
     if "memory-efficient-attention" in server.optimizations:
@@ -128,7 +128,7 @@ def optimize_pipeline(
         try:
             pipe.enable_xformers_memory_efficient_attention()
         except Exception as e:
-            logger.warning("error enabling memory efficient attention: %s", e)
+            logger.warning("error while enabling memory efficient attention: %s", e)
 
 
 def load_pipeline(
diff --git a/api/onnx_web/server/upscale.py b/api/onnx_web/server/upscale.py
index 128e7d3c4..725ae7dc4 100644
--- a/api/onnx_web/server/upscale.py
+++ b/api/onnx_web/server/upscale.py
@@ -34,6 +34,7 @@ def run_upscale_correction(
 
     chain = ChainPipeline()
 
+    upscale_stage = None
     if upscale.scale > 1:
         if "esrgan" in upscale.upscale_model:
             esrgan_params = StageParams(
@@ -42,23 +43,22 @@ def run_upscale_correction(
             )
             upscale_stage = (upscale_resrgan, esrgan_params, None)
         elif "stable-diffusion" in upscale.upscale_model:
             mini_tile = min(SizeChart.mini, stage.tile_size)
-            sd_stage = StageParams(tile_size=mini_tile, outscale=upscale.outscale)
-            upscale_stage = (upscale_stable_diffusion, sd_stage, None)
+            sd_params = StageParams(tile_size=mini_tile, outscale=upscale.outscale)
+            upscale_stage = (upscale_stable_diffusion, sd_params, None)
         else:
             logger.warn("unknown upscaling model: %s", upscale.upscale_model)
-            upscale_stage = None
 
+    correct_stage = None
     if upscale.faces:
-        face_stage = StageParams(
+        face_params = StageParams(
             tile_size=stage.tile_size, outscale=upscale.face_outscale
         )
         if "codeformer" in upscale.correction_model:
-            correct_stage = (correct_codeformer, face_stage, None)
+            correct_stage = (correct_codeformer, face_params, None)
         elif "gfpgan" in upscale.correction_model:
-            correct_stage = (correct_gfpgan, face_stage, None)
+            correct_stage = (correct_gfpgan, face_params, None)
         else:
             logger.warn("unknown correction model: %s", upscale.correction_model)
-            correct_stage = None
 
     if upscale.upscale_order == "correction-both":
         chain.append(correct_stage)

From 0d2211ff25b5f3a24edebe39883c30524bac95c0 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 14:14:13 -0600
Subject: [PATCH 4/8] apply lint

---
 api/onnx_web/diffusion/load.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 1d8a97d30..4506ec380 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -121,7 +121,6 @@ def optimize_pipeline(
         except Exception as e:
             logger.warning("error while enabling model CPU offload: %s", e)
 
-
     if "memory-efficient-attention" in server.optimizations:
         # TODO: check for xformers
         logger.debug("enabling memory efficient attention for SD pipeline")
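Patches 3 and 4 hoist the `upscale_stage = None` and `correct_stage = None` initializers above their conditionals. Without the hoisting, skipping the `if upscale.scale > 1` branch entirely would leave `upscale_stage` unbound, and the ordering logic below it would hit an unbound local. A minimal sketch of the pattern, with simple stand-ins for the upscale parameters:

```python
scale, faces = 1, True  # stand-ins for upscale.scale and upscale.faces

# bind both stage names before the conditionals, so they exist even
# when neither branch assigns them
upscale_stage = None
if scale > 1:
    upscale_stage = ("upscale", {})

correct_stage = None
if faces:
    correct_stage = ("correct", {})

for stage in (correct_stage, upscale_stage):
    if stage is not None:
        print("appending stage:", stage[0])
```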
From 5b4c370a1b6b3e809be34c14ab43978918973b23 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 15:44:39 -0600
Subject: [PATCH 5/8] feat(api): enable ONNX optimizations through env

---
 api/onnx_web/params.py | 32 ++++++++++++++++++++++++++++----
 api/onnx_web/serve.py  | 17 +++++++++++++++--
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index 40eb2257b..fdd7ab9ac 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -1,7 +1,10 @@
 from enum import IntEnum
-from typing import Any, Dict, Literal, Optional, Tuple, Union
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
-from onnxruntime import SessionOptions
+from onnxruntime import GraphOptimizationLevel, SessionOptions
+
+logger = getLogger(__name__)
 
 
 class SizeChart(IntEnum):
@@ -75,11 +78,16 @@ def tojson(self) -> Dict[str, int]:
 
 class DeviceParams:
     def __init__(
-        self, device: str, provider: str, options: Optional[dict] = None
+        self,
+        device: str,
+        provider: str,
+        options: Optional[dict] = None,
+        optimizations: Optional[List[str]] = None,
     ) -> None:
         self.device = device
         self.provider = provider
         self.options = options
+        self.optimizations = optimizations
 
     def __str__(self) -> str:
         return "%s - %s (%s)" % (self.device, self.provider, self.options)
@@ -91,7 +99,23 @@ def ort_provider(self) -> Tuple[str, Any]:
         return (self.provider, self.options)
 
     def sess_options(self) -> SessionOptions:
-        return SessionOptions()
+        sess = SessionOptions()
+
+        if "onnx-low-memory" in self.optimizations:
+            logger.debug("enabling ONNX low-memory optimizations")
+            sess.enable_cpu_mem_arena = False
+            sess.enable_mem_pattern = False
+            sess.enable_mem_reuse = False
+
+        if "onnx-optimization-disable" in self.optimizations:
+            sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
+        elif "onnx-optimization-basic" in self.optimizations:
+            sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
+        elif "onnx-optimization-all" in self.optimizations:
+            sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        if "onnx-deterministic-compute" in self.optimizations:
+            sess.use_deterministic_compute = True
 
     def torch_str(self) -> str:
         if self.device.startswith("cuda"):
diff --git a/api/onnx_web/serve.py b/api/onnx_web/serve.py
index 4818c64cb..503be4f0c 100644
--- a/api/onnx_web/serve.py
+++ b/api/onnx_web/serve.py
@@ -349,16 +349,29 @@ def load_platforms(context: ServerContext) -> None:
                     {
                         "device_id": i,
                     },
+                    context.optimizations,
                 )
             )
         else:
            available_platforms.append(
-                DeviceParams(potential, platform_providers[potential])
+                DeviceParams(
+                    potential,
+                    platform_providers[potential],
+                    None,
+                    context.optimizations,
+                )
             )
 
     if context.any_platform:
         # the platform should be ignored when the job is scheduled, but set to CPU just in case
-        available_platforms.append(DeviceParams("any", platform_providers["cpu"]))
+        available_platforms.append(
+            DeviceParams(
+                "any",
+                platform_providers["cpu"],
+                None,
+                context.optimizations,
+            )
+        )
 
     # make sure CPU is last on the list
     def any_first_cpu_last(a: DeviceParams, b: DeviceParams):
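Patch 5 threads the same optimization list into `DeviceParams.sess_options`, mapping the `onnx-*` flags onto ONNX Runtime `SessionOptions` fields. A standalone sketch of the same settings, assuming a plain list of names instead of a `DeviceParams` instance:

```python
from onnxruntime import GraphOptimizationLevel, SessionOptions

optimizations = ["onnx-low-memory", "onnx-optimization-basic"]

sess = SessionOptions()
if "onnx-low-memory" in optimizations:
    sess.enable_cpu_mem_arena = False  # no shared allocation arena
    sess.enable_mem_pattern = False    # no pre-planned allocations
    sess.enable_mem_reuse = False      # no buffer reuse between nodes

if "onnx-optimization-disable" in optimizations:
    sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
elif "onnx-optimization-basic" in optimizations:
    sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
elif "onnx-optimization-all" in optimizations:
    sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

print(sess.graph_optimization_level)
```

Note that `sess_options` as written in this patch builds the `SessionOptions` but never returns it; patch 6 below fixes that.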
From 881b290116cbf5875a21717d9301641aef539328 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 15:45:28 -0600
Subject: [PATCH 6/8] return session options properly

---
 api/onnx_web/params.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index fdd7ab9ac..59e911ff5 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -117,6 +117,8 @@ def sess_options(self) -> SessionOptions:
         if "onnx-deterministic-compute" in self.optimizations:
             sess.use_deterministic_compute = True
 
+        return sess
+
     def torch_str(self) -> str:
         if self.device.startswith("cuda"):
             return self.device

From e0a62ccbb5a606838c914d9a225c0a8ac306daa0 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 15:47:31 -0600
Subject: [PATCH 7/8] better ONNX optimization logging

---
 api/onnx_web/params.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index 59e911ff5..623dd02b9 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -108,13 +108,17 @@ def sess_options(self) -> SessionOptions:
             sess.enable_mem_reuse = False
 
         if "onnx-optimization-disable" in self.optimizations:
+            logger.debug("disabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
         elif "onnx-optimization-basic" in self.optimizations:
+            logger.debug("enabling basic ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
         elif "onnx-optimization-all" in self.optimizations:
+            logger.debug("enabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
 
         if "onnx-deterministic-compute" in self.optimizations:
+            logger.debug("enabling ONNX deterministic compute")
             sess.use_deterministic_compute = True
 
         return sess
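Patches 6 and 7 add the missing `return sess` and per-flag debug logging. The return matters because `InferenceSession` accepts `sess_options=None` silently and falls back to defaults, so the dropped options produced no error. A sketch of how the options object is consumed, with a hypothetical model path:

```python
from onnxruntime import InferenceSession, SessionOptions

opts = SessionOptions()
opts.use_deterministic_compute = True

# sess_options=None (the result of the missing return) would be accepted
# here without complaint, quietly discarding every configured option
session = InferenceSession(
    "model.onnx",  # hypothetical path to an exported ONNX model
    sess_options=opts,
    providers=["CPUExecutionProvider"],
)
```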
From bfdb071c2dfd9e1fe52342c02fb2d2381e028129 Mon Sep 17 00:00:00 2001
From: Sean Sube
Date: Sat, 18 Feb 2023 16:06:05 -0600
Subject: [PATCH 8/8] chore(docs): explain model optimizations

---
 api/onnx_web/diffusion/load.py | 10 +++++-----
 api/onnx_web/params.py         |  6 +++---
 docs/server-admin.md           | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/api/onnx_web/diffusion/load.py b/api/onnx_web/diffusion/load.py
index 4506ec380..8516fe449 100644
--- a/api/onnx_web/diffusion/load.py
+++ b/api/onnx_web/diffusion/load.py
@@ -92,28 +92,28 @@ def optimize_pipeline(
     server: ServerContext,
     pipe: StableDiffusionPipeline,
 ) -> None:
-    if "attention-slicing" in server.optimizations:
+    if "diffusers-attention-slicing" in server.optimizations:
         logger.debug("enabling attention slicing on SD pipeline")
         try:
             pipe.enable_attention_slicing()
         except Exception as e:
             logger.warning("error while enabling attention slicing: %s", e)
 
-    if "vae-slicing" in server.optimizations:
+    if "diffusers-vae-slicing" in server.optimizations:
         logger.debug("enabling VAE slicing on SD pipeline")
         try:
             pipe.enable_vae_slicing()
         except Exception as e:
             logger.warning("error while enabling VAE slicing: %s", e)
 
-    if "sequential-cpu-offload" in server.optimizations:
+    if "diffusers-cpu-offload-sequential" in server.optimizations:
         logger.debug("enabling sequential CPU offload on SD pipeline")
         try:
             pipe.enable_sequential_cpu_offload()
         except Exception as e:
             logger.warning("error while enabling sequential CPU offload: %s", e)
 
-    elif "model-cpu-offload" in server.optimizations:
+    elif "diffusers-cpu-offload-model" in server.optimizations:
         # TODO: check for accelerate
         logger.debug("enabling model CPU offload on SD pipeline")
         try:
@@ -121,7 +121,7 @@ def optimize_pipeline(
         except Exception as e:
             logger.warning("error while enabling model CPU offload: %s", e)
 
-    if "memory-efficient-attention" in server.optimizations:
+    if "diffusers-memory-efficient-attention" in server.optimizations:
         # TODO: check for xformers
         logger.debug("enabling memory efficient attention for SD pipeline")
         try:
diff --git a/api/onnx_web/params.py b/api/onnx_web/params.py
index 623dd02b9..c86a0a8b0 100644
--- a/api/onnx_web/params.py
+++ b/api/onnx_web/params.py
@@ -107,13 +107,13 @@ def sess_options(self) -> SessionOptions:
             sess.enable_mem_pattern = False
             sess.enable_mem_reuse = False
 
-        if "onnx-optimization-disable" in self.optimizations:
+        if "onnx-graph-disable" in self.optimizations:
             logger.debug("disabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
-        elif "onnx-optimization-basic" in self.optimizations:
+        elif "onnx-graph-basic" in self.optimizations:
             logger.debug("enabling basic ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
-        elif "onnx-optimization-all" in self.optimizations:
+        elif "onnx-graph-all" in self.optimizations:
             logger.debug("enabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
 
diff --git a/docs/server-admin.md b/docs/server-admin.md
index 23e4e07b1..3372ee96d 100644
--- a/docs/server-admin.md
+++ b/docs/server-admin.md
@@ -11,6 +11,7 @@ Please see [the user guide](user-guide.md) for descriptions of the client and ea
 - [Configuration](#configuration)
   - [Debug Mode](#debug-mode)
   - [Environment Variables](#environment-variables)
+  - [Pipeline Optimizations](#pipeline-optimizations)
   - [Server Parameters](#server-parameters)
 - [Containers](#containers)
   - [CPU](#cpu)
@@ -73,6 +74,39 @@ Others:
 - `ONNX_WEB_SHOW_PROGRESS`
   - show progress bars in the logs
   - disabling this can reduce noise in server logs, especially when logging to a file
+- `ONNX_WEB_OPTIMIZATIONS`
+  - comma-delimited list of optimizations to enable
+
+### Pipeline Optimizations
+
+- `diffusers-*`
+  - `diffusers-attention-slicing`
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-attention-for-additional-memory-savings
+  - `diffusers-cpu-offload-*`
+    - `diffusers-cpu-offload-sequential`
+      - not available for ONNX pipelines (most of them)
+      - https://huggingface.co/docs/diffusers/optimization/fp16#offloading-to-cpu-with-accelerate-for-memory-savings
+    - `diffusers-cpu-offload-model`
+      - not available for ONNX pipelines (most of them)
+      - https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings
+  - `diffusers-memory-efficient-attention`
+    - requires [the `xformers` library](https://huggingface.co/docs/diffusers/optimization/xformers)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention
+  - `diffusers-vae-slicing`
+    - not available for ONNX pipelines (most of them)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-vae-decode-for-larger-batches
+- `onnx-*`
+  - `onnx-low-memory`
+    - disable ONNX features that allocate more memory than is strictly required or keep memory after use
+  - `onnx-graph-*`
+    - `onnx-graph-disable`
+      - disable all ONNX graph optimizations
+    - `onnx-graph-basic`
+      - enable basic ONNX graph optimizations
+    - `onnx-graph-all`
+      - enable all ONNX graph optimizations
+  - `onnx-deterministic-compute`
+    - enable ONNX deterministic compute
 
 ### Server Parameters
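With patch 8, the documented optimization names can be combined freely; unrecognized names are never matched, so typos fail quietly rather than loudly. A hypothetical configuration mirroring the docs, set before the server reads its environment:

```python
from os import environ

# one flag from each documented group; a hypothetical combination,
# not a recommended default
environ["ONNX_WEB_OPTIMIZATIONS"] = ",".join([
    "diffusers-attention-slicing",
    "onnx-low-memory",
    "onnx-graph-basic",
])
```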