chore(docs): explain model optimizations
ssube committed Feb 18, 2023
1 parent e0a62cc commit bfdb071
Showing 3 changed files with 42 additions and 8 deletions.
10 changes: 5 additions & 5 deletions api/onnx_web/diffusion/load.py
@@ -92,36 +92,36 @@ def optimize_pipeline(
     server: ServerContext,
     pipe: StableDiffusionPipeline,
 ) -> None:
-    if "attention-slicing" in server.optimizations:
+    if "diffusers-attention-slicing" in server.optimizations:
         logger.debug("enabling attention slicing on SD pipeline")
         try:
             pipe.enable_attention_slicing()
         except Exception as e:
             logger.warning("error while enabling attention slicing: %s", e)

-    if "vae-slicing" in server.optimizations:
+    if "diffusers-vae-slicing" in server.optimizations:
         logger.debug("enabling VAE slicing on SD pipeline")
         try:
             pipe.enable_vae_slicing()
         except Exception as e:
             logger.warning("error while enabling VAE slicing: %s", e)

-    if "sequential-cpu-offload" in server.optimizations:
+    if "diffusers-cpu-offload-sequential" in server.optimizations:
         logger.debug("enabling sequential CPU offload on SD pipeline")
         try:
             pipe.enable_sequential_cpu_offload()
         except Exception as e:
             logger.warning("error while enabling sequential CPU offload: %s", e)

-    elif "model-cpu-offload" in server.optimizations:
+    elif "diffusers-cpu-offload-model" in server.optimizations:
         # TODO: check for accelerate
         logger.debug("enabling model CPU offload on SD pipeline")
         try:
             pipe.enable_model_cpu_offload()
         except Exception as e:
             logger.warning("error while enabling model CPU offload: %s", e)

-    if "memory-efficient-attention" in server.optimizations:
+    if "diffusers-memory-efficient-attention" in server.optimizations:
         # TODO: check for xformers
         logger.debug("enabling memory efficient attention for SD pipeline")
         try:
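The guarded-optimization pattern in `optimize_pipeline` — check a flag, call the pipeline method, log and continue on failure — can be sketched standalone. This is a minimal sketch, not the project's code: `FakePipeline` is a hypothetical stand-in for the diffusers `StableDiffusionPipeline`, and only two of the flags are shown.

```python
import logging

logger = logging.getLogger(__name__)


class ServerContext:
    """Minimal stand-in for the server config; holds the optimization flags."""

    def __init__(self, optimizations):
        self.optimizations = optimizations


def optimize_pipeline(server, pipe):
    """Enable each diffusers optimization only when its flag is present,
    logging a warning and continuing if the pipeline does not support it."""
    if "diffusers-attention-slicing" in server.optimizations:
        logger.debug("enabling attention slicing on SD pipeline")
        try:
            pipe.enable_attention_slicing()
        except Exception as e:
            logger.warning("error while enabling attention slicing: %s", e)

    if "diffusers-vae-slicing" in server.optimizations:
        logger.debug("enabling VAE slicing on SD pipeline")
        try:
            pipe.enable_vae_slicing()
        except Exception as e:
            logger.warning("error while enabling VAE slicing: %s", e)


class FakePipeline:
    """Hypothetical pipeline: records enabled optimizations,
    and does not support VAE slicing."""

    def __init__(self):
        self.enabled = []

    def enable_attention_slicing(self):
        self.enabled.append("attention-slicing")

    def enable_vae_slicing(self):
        raise NotImplementedError("not available on this pipeline")


server = ServerContext(["diffusers-attention-slicing", "diffusers-vae-slicing"])
pipe = FakePipeline()
optimize_pipeline(server, pipe)
print(pipe.enabled)  # unsupported VAE slicing was logged and skipped
```

The point of the try/except wrapping is that a single unsupported optimization degrades gracefully instead of failing the whole pipeline setup.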
6 changes: 3 additions & 3 deletions api/onnx_web/params.py
@@ -107,13 +107,13 @@ def sess_options(self) -> SessionOptions:
         sess.enable_mem_pattern = False
         sess.enable_mem_reuse = False

-        if "onnx-optimization-disable" in self.optimizations:
+        if "onnx-graph-disable" in self.optimizations:
             logger.debug("disabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
-        elif "onnx-optimization-basic" in self.optimizations:
+        elif "onnx-graph-basic" in self.optimizations:
             logger.debug("enabling basic ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
-        elif "onnx-optimization-all" in self.optimizations:
+        elif "onnx-graph-all" in self.optimizations:
             logger.debug("enabling all ONNX graph optimizations")
             sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
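The precedence in `sess_options` — disable beats basic, basic beats all, because the first matching flag in the elif chain wins — can be sketched without onnxruntime installed. This helper is illustrative only and returns the `GraphOptimizationLevel` member names as strings; the real code assigns the enum members from `onnxruntime` onto a `SessionOptions` object.

```python
def graph_optimization_level(optimizations):
    """Pick the ONNX graph optimization level name from the flag list.

    Mirrors the elif chain in sess_options: the first matching flag wins,
    so "onnx-graph-disable" takes priority over "onnx-graph-basic",
    which takes priority over "onnx-graph-all".
    """
    if "onnx-graph-disable" in optimizations:
        return "ORT_DISABLE_ALL"
    elif "onnx-graph-basic" in optimizations:
        return "ORT_ENABLE_BASIC"
    elif "onnx-graph-all" in optimizations:
        return "ORT_ENABLE_ALL"
    return None  # no flag set: keep onnxruntime's default level


# conflicting flags resolve to the more conservative level
print(graph_optimization_level(["onnx-graph-basic", "onnx-graph-all"]))  # ORT_ENABLE_BASIC
```

Because the chain is exclusive, listing several `onnx-graph-*` flags together is harmless: only the most restrictive one present takes effect.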
34 changes: 34 additions & 0 deletions docs/server-admin.md
@@ -11,6 +11,7 @@ Please see [the user guide](user-guide.md) for descriptions of the client and ea
 - [Configuration](#configuration)
   - [Debug Mode](#debug-mode)
   - [Environment Variables](#environment-variables)
+  - [Pipeline Optimizations](#pipeline-optimizations)
   - [Server Parameters](#server-parameters)
 - [Containers](#containers)
   - [CPU](#cpu)
@@ -73,6 +74,39 @@ Others:
 - `ONNX_WEB_SHOW_PROGRESS`
   - show progress bars in the logs
   - disabling this can reduce noise in server logs, especially when logging to a file
+- `ONNX_WEB_OPTIMIZATIONS`
+  - comma-delimited list of optimizations to enable
+
+### Pipeline Optimizations
+
+- `diffusers-*`
+  - `diffusers-attention-slicing`
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-attention-for-additional-memory-savings
+  - `diffusers-cpu-offload-*`
+    - `diffusers-cpu-offload-sequential`
+      - not available for ONNX pipelines (most of them)
+      - https://huggingface.co/docs/diffusers/optimization/fp16#offloading-to-cpu-with-accelerate-for-memory-savings
+    - `diffusers-cpu-offload-model`
+      - not available for ONNX pipelines (most of them)
+      - https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings
+  - `diffusers-memory-efficient-attention`
+    - requires [the `xformers` library](https://huggingface.co/docs/diffusers/optimization/xformers)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention
+  - `diffusers-vae-slicing`
+    - not available for ONNX pipelines (most of them)
+    - https://huggingface.co/docs/diffusers/optimization/fp16#sliced-vae-decode-for-larger-batches
+- `onnx-*`
+  - `onnx-low-memory`
+    - disable ONNX features that allocate more memory than is strictly required or keep memory after use
+  - `onnx-graph-*`
+    - `onnx-graph-disable`
+      - disable all ONNX graph optimizations
+    - `onnx-graph-basic`
+      - enable basic ONNX graph optimizations
+    - `onnx-graph-all`
+      - enable all ONNX graph optimizations
+  - `onnx-deterministic-compute`
+    - enable ONNX deterministic compute

 ### Server Parameters
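Putting the renamed flags together, `ONNX_WEB_OPTIMIZATIONS` takes a single comma-delimited value. A minimal sketch — the particular combination of flags here is illustrative, not a recommendation:

```shell
# comma-delimited: attention slicing for the diffusers pipelines,
# low-memory ONNX behavior, and basic graph optimizations
export ONNX_WEB_OPTIMIZATIONS="diffusers-attention-slicing,onnx-low-memory,onnx-graph-basic"
echo "$ONNX_WEB_OPTIMIZATIONS"
```

Flags the current pipeline does not support are logged and skipped rather than treated as errors, so a shared value can be reused across ONNX and non-ONNX pipelines.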
