In [1]:
import os
import torch
import torch.nn as nn
from diffusers.schedulers import PNDMScheduler
from pathlib import Path
from diffusers import DiffusionPipeline
import onnx
import onnxruntime
from onnxruntime.quantization.quantize import quantize_dynamic
from onnxruntime.quantization import QuantType

from util_onnx import onnx_export
import utils

import gc

gc.collect()

0

In [2]:
device = 'cuda'
dtype = torch.float32
save_path = '../onnx_models_cuda'
os.makedirs(save_path, exist_ok = True)

pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)

Couldn't connect to the Hub: 401 Client Error. (Request ID: Root=1-66e5d547-51132e2764dc4e5825c24f19;f25a31fc-80d3-493e-b392-fb6b1b9b499e)

Repository Not Found for url: https://huggingface.co/api/models/runwayml/stable-diffusion-v1-5.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Invalid username or password..
Will try to load from local cache.


---
##### 0. INPUT 준비

In [3]:
noise_pred  = torch.randn([1,4,64,64]).to(device=device, dtype=dtype)
latents     = torch.randn([1,4,64,64]).to(device=device, dtype=dtype)
noise_pred.shape, latents.shape

(torch.Size([1, 4, 64, 64]), torch.Size([1, 4, 64, 64]))

In [5]:
pipeline.scheduler.set_timesteps(50, device=device)
alphas_cumprod = pipeline.scheduler.alphas_cumprod
final_alpha_cumprod = pipeline.scheduler.final_alpha_cumprod
f"alphas_cumprod : {alphas_cumprod}, final_alpha_cumprod : {final_alpha_cumprod}"

'alphas_cumprod : tensor([0.9991, 0.9983, 0.9974, 0.9966, 0.9957, 0.9948, 0.9940, 0.9931, 0.9922,\n        0.9913, 0.9904, 0.9895, 0.9886, 0.9877, 0.9868, 0.9859, 0.9850, 0.9841,\n        0.9832, 0.9822, 0.9813, 0.9804, 0.9794, 0.9785, 0.9776, 0.9766, 0.9757,\n        0.9747, 0.9737, 0.9728, 0.9718, 0.9708, 0.9698, 0.9689, 0.9679, 0.9669,\n        0.9659, 0.9649, 0.9639, 0.9629, 0.9619, 0.9609, 0.9599, 0.9588, 0.9578,\n        0.9568, 0.9557, 0.9547, 0.9537, 0.9526, 0.9516, 0.9505, 0.9495, 0.9484,\n        0.9473, 0.9463, 0.9452, 0.9441, 0.9430, 0.9420, 0.9409, 0.9398, 0.9387,\n        0.9376, 0.9365, 0.9354, 0.9343, 0.9332, 0.9320, 0.9309, 0.9298, 0.9287,\n        0.9275, 0.9264, 0.9252, 0.9241, 0.9229, 0.9218, 0.9206, 0.9195, 0.9183,\n        0.9171, 0.9160, 0.9148, 0.9136, 0.9124, 0.9112, 0.9100, 0.9089, 0.9077,\n        0.9065, 0.9052, 0.9040, 0.9028, 0.9016, 0.9004, 0.8992, 0.8979, 0.8967,\n        0.8955, 0.8942, 0.8930, 0.8917, 0.8905, 0.8892, 0.8880, 0.8867, 0.8854,\n        0.

--- 
##### 1. Define model

`t` = 981

In [15]:
class Scheduler_step981(nn.Module):
    def __init__(
        self, 
        alphas_cumprod, 
        final_alpha_cumprod
    ):
        super().__init__()
        self.alphas_cumprod = alphas_cumprod
        self.final_alpha_cumprod = final_alpha_cumprod

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
        # corresponds to denominator of e_θ(x_t, t) in formula (9)
        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
        ) ** (0.5)
        # full formula (9)
        prev_sample = (
            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
        )
        return prev_sample

    def forward(self, model_output, sample):
        timestep, prev_timestep = 981, 961
        # ets = [model_output]
        # cur_output = sample
        prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
        return prev_sample#, cur_sample

onnx_export(
    Scheduler_step981(alphas_cumprod, final_alpha_cumprod),
    model_args=(
        noise_pred, latents
    ),
    output_path = Path(f'{save_path}/scheduler_2/sche981_origin.onnx'),
    ordered_input_names=["model_output", "sample"],
    output_names=["prev_sample"],  # has to be different from "sample" for correct tracing
    dynamic_axes={ 
        "model_output": {0: "batch"},
        "sample": {0: "batch"}
    },
    opset=12,
    use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
)

! python -m onnxruntime.quantization.preprocess --input '../onnx_models_cuda/scheduler_2/sche981_origin.onnx' --output '../onnx_models_cuda/scheduler_2/sche981_infer.onnx'

# # model quantization
quantize_dynamic(
    model_input     =   f'{save_path}/scheduler_2/sche981_infer.onnx', 
    model_output    =   f'{save_path}/scheduler_2/sche981_quant.onnx', 
    per_channel     =   False,
    reduce_range    =   False,
    weight_type     =   QuantType.QUInt8,
)

ONNX export Start🚗
ONNX export Finish🍷


`t` = 961

In [29]:
class Scheduler_step961_1(nn.Module):
    def __init__(
        self, 
        alphas_cumprod, 
        final_alpha_cumprod
    ):
        super().__init__()
        self.alphas_cumprod = alphas_cumprod.to(device = device)
        self.final_alpha_cumprod = final_alpha_cumprod.to(device = device)

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
        # corresponds to denominator of e_θ(x_t, t) in formula (9)
        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
        ) ** (0.5)
        # full formula (9)
        prev_sample = (
            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
        )
        return prev_sample    

    def forward(self, model_output, cur_sample, ets0):
        timestep, prev_timestep = 981, 961
        model_output = (model_output + ets0) / 2
        sample = cur_sample
        # cur_sample = None
        prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
        return prev_sample
    
onnx_export(
    Scheduler_step961_1(alphas_cumprod, final_alpha_cumprod),
    model_args=(
        noise_pred, latents, latents
    ),
    output_path = Path(f'{save_path}/scheduler_2/sche9611_origin.onnx'),
    ordered_input_names=["model_output", "cur_sample", "ets0"],
    output_names=["prev_sample"],  # has to be different from "sample" for correct tracing
    dynamic_axes={ 
        "model_output"  : {0: "batch"},
        # "sample"        : {0: "batch"},
        "cur_sample"    : {0: "batch"},
        "ets0"           : {0: "batch"}
    },
    opset=12,
    use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
)

! python -m onnxruntime.quantization.preprocess --input '../onnx_models_cuda/scheduler_2/sche9611_origin.onnx' --output '../onnx_models_cuda/scheduler_2/sche9611_infer.onnx'

quantize_dynamic(
    model_input     =   f'{save_path}/scheduler_2/sche9611_infer.onnx', 
    model_output    =   f'{save_path}/scheduler_2/sche9611_quant.onnx', 
    per_channel     =   False,
    reduce_range    =   False,
    weight_type     =   QuantType.QUInt8,
)

ONNX export Start🚗


TypeError: Scheduler_step961_1.forward() takes 4 positional arguments but 5 were given

In [19]:
class Scheduler_step961_2(nn.Module):
    def __init__(
        self, 
        alphas_cumprod, 
        final_alpha_cumprod
    ):
        super().__init__()
        self.alphas_cumprod = alphas_cumprod.to(device = device)
        self.final_alpha_cumprod = final_alpha_cumprod.to(device = device)

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
        # corresponds to denominator of e_θ(x_t, t) in formula (9)
        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
        ) ** (0.5)
        # full formula (9)
        prev_sample = (
            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
        )
        return prev_sample    
    
    def forward(self, model_output, sample, ets0):
        timestep, prev_timestep = 961, 941
        ets = [ets0, model_output]
        model_output = (3 * ets[-1] - ets[-2]) / 2
        prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
        return prev_sample
    
onnx_export(
    Scheduler_step961_2(alphas_cumprod, final_alpha_cumprod),
    model_args=(
        noise_pred, latents, latents
    ),
    output_path = Path(f'{save_path}/scheduler_2/sche9612_origin.onnx'),
    ordered_input_names=["model_output", "sample", "ets0"],
    output_names=["prev_sample"],  # has to be different from "sample" for correct tracing
    dynamic_axes={ 
        "model_output"  : {0: "batch"},
        "sample"        : {0: "batch"},
        "ets0"    : {0: "batch"},
    },
    opset=12,
    use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
)

! python -m onnxruntime.quantization.preprocess --input '../onnx_models_cuda/scheduler_2/sche9612_origin.onnx' --output '../onnx_models_cuda/scheduler_2/sche9612_infer.onnx'

# # model quantization
quantize_dynamic(
    model_input     =   f'{save_path}/scheduler_2/sche9612_infer.onnx', 
    model_output    =   f'{save_path}/scheduler_2/sche9612_quant.onnx', 
    per_channel     =   False,
    reduce_range    =   False,
    weight_type     =   QuantType.QUInt8,
)

ONNX export Start🚗
ONNX export Finish🍷


`t` = 941

In [23]:
class Scheduler_step941(nn.Module):
    def __init__(
        self, 
        alphas_cumprod, 
        final_alpha_cumprod
    ):
        super().__init__()
        self.alphas_cumprod = alphas_cumprod.to(device = device)
        self.final_alpha_cumprod = final_alpha_cumprod.to(device = device)
    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
        # corresponds to denominator of e_θ(x_t, t) in formula (9)
        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
        ) ** (0.5)
        # full formula (9)
        prev_sample = (
            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
        )
        return prev_sample 
    def forward(self, model_output, sample, ets0, ets1):
        timestep, prev_timestep = 941, 921
        ets = [ets0, ets1, model_output]
        model_output = (23 * ets[-1] - 16 * ets[-2] + 5 * ets[-3]) / 12
        prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
        return prev_sample

onnx_export(
    Scheduler_step941(alphas_cumprod, final_alpha_cumprod),
    model_args=(
        noise_pred, latents, latents, latents
    ),
    output_path = Path(f'{save_path}/scheduler_2/sche941_origin.onnx'),
    ordered_input_names=["model_output", "sample", "ets0", "ets1"],
    output_names=["prev_sample"],  # has to be different from "sample" for correct tracing
    dynamic_axes={ 
        "model_output"  : {0: "batch"},
        "sample"        : {0: "batch"},
        "ets0"          : {0: "batch"},
        "ets1"          : {0: "batch"},
    },
    opset=12,
    use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
)

! python -m onnxruntime.quantization.preprocess --input '../onnx_models_cuda/scheduler_2/sche941_origin.onnx' --output '../onnx_models_cuda/scheduler_2/sche941_infer.onnx'

# # model quantization
quantize_dynamic(
    model_input     =   f'{save_path}/scheduler_2/sche941_infer.onnx', 
    model_output    =   f'{save_path}/scheduler_2/sche941_quant.onnx', 
    per_channel     =   False,
    reduce_range    =   False,
    weight_type     =   QuantType.QUInt8,
)

ONNX export Start🚗
ONNX export Finish🍷


1832.08s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [25]:
class Scheduler_stepNN(nn.Module):
    def __init__(
        self, 
        alphas_cumprod, 
        final_alpha_cumprod
    ):
        super().__init__()
        self.alphas_cumprod = alphas_cumprod.to(device = device)
        self.final_alpha_cumprod = final_alpha_cumprod.to(device = device)

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
        # corresponds to denominator of e_θ(x_t, t) in formula (9)
        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
        ) ** (0.5)
        # full formula (9)
        prev_sample = (
            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
        )
        return prev_sample
    
    def forward(self, model_output, timestep, sample, ets0, ets1, ets2):
        timestep, prev_timestep = 941, 921
        ets = [ets0, ets1, ets2, model_output]
        model_output = (1 / 24) * (55 * ets[-1] - 59 * ets[-2] + 37 * ets[-3] - 9 * ets[-4])
        prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
        return prev_sample

onnx_export(
    Scheduler_stepNN(alphas_cumprod, final_alpha_cumprod),
    model_args=(
        noise_pred,
        torch.tensor([921]).to(device = device),
        latents, latents, latents, latents
    ),
    output_path = Path(f'{save_path}/scheduler_2/scheNN_origin.onnx'),
    ordered_input_names=["model_output", "timestep", "sample", "ets0", "ets1", "ets2"],
    output_names=["prev_sample"],  # has to be different from "sample" for correct tracing
    dynamic_axes={ 
        "model_output"  : {0: "batch"},
        "sample"        : {0: "batch"},
        "ets0"          : {0: "batch"},
        "ets1"          : {0: "batch"},
        "ets2"          : {0: "batch"},
    },
    opset=12,
    use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
)

! python -m onnxruntime.quantization.preprocess --input '../onnx_models_cuda/scheduler_2/scheNN_origin.onnx' --output '../onnx_models_cuda/scheduler_2/scheNN_infer.onnx'

# # model quantization
quantize_dynamic(
    model_input     =   f'{save_path}/scheduler_2/scheNN_infer.onnx', 
    model_output    =   f'{save_path}/scheduler_2/scheNN_quant.onnx', 
    per_channel     =   False,
    reduce_range    =   False,
    weight_type     =   QuantType.QUInt8,
)

ONNX export Start🚗
ONNX export Finish🍷


2056.70s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


-----
#### 💛 Conversion

In [26]:
!du -sh ../onnx_models_cuda/scheduler_2/**_origin.onnx

2083.73s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


4.0K	../onnx_models_cuda/scheduler_2/sche941_origin.onnx
4.0K	../onnx_models_cuda/scheduler_2/sche9611_origin.onnx
4.0K	../onnx_models_cuda/scheduler_2/sche9612_origin.onnx
4.0K	../onnx_models_cuda/scheduler_2/sche981_origin.onnx
4.0K	../onnx_models_cuda/scheduler_2/scheNN_origin.onnx


In [27]:
!du -sh ../onnx_models_cuda/scheduler_2/**_quant.onnx

2089.02s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


4.0K	../onnx_models_cuda/scheduler_2/sche941_quant.onnx
4.0K	../onnx_models_cuda/scheduler_2/sche9611_quant.onnx
4.0K	../onnx_models_cuda/scheduler_2/sche9612_quant.onnx
4.0K	../onnx_models_cuda/scheduler_2/sche981_quant.onnx
4.0K	../onnx_models_cuda/scheduler_2/scheNN_quant.onnx


---
#### 💚 ONNX-Runtime Test

In [23]:
import onnxruntime as ort

# Load the ONNX model
onnx_model_path = f'{save_path}/unet/upre_quant.onnx'
session_pre = ort.InferenceSession(onnx_model_path, providers=['AzureExecutionProvider'])
session_down = ort.InferenceSession(onnx_model_path, providers=['AzureExecutionProvider'])

In [25]:
# test running
ort_inputs  = {}
latents = session_pre.run(None, {
    'input': torch.randn([1, 4, 64, 64]).to(dtype = dtype).numpy()
})[0]