In [6]:
import torch
import torch.nn as nn
from diffusers.schedulers import PNDMScheduler
from pathlib import Path
from diffusers import DiffusionPipeline
from util import onnx_export
from onnxruntime.quantization.quantize import quantize_dynamic
from onnxruntime.quantization import QuantType
import os 
import gc
gc.collect()

2082

In [2]:
# Tracing precision and target device; the dummy export inputs in the cells
# below are created with these two values.
dtype = torch.float32
device = "cpu"

# Load the Stable Diffusion v1.5 pipeline; the cells visible in this notebook
# only read `pipeline.scheduler` from it.
pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=dtype,
)

---
## Scheduler

In [3]:
def onnx_conver_NN_quant(model, KEY):
    """Export a five-input scheduler-step module to ONNX, then quantize it dynamically.

    Args:
        model: module handed to ``onnx_export``; it is traced with five random
            ``[1, 4, 64, 64]`` tensors, one per name in ``input_names``.
        KEY: value interpolated into both file names (``step-{KEY}.onnx``).

    Side effects:
        Writes the float model to ``../onnx-models/Schedulers/`` and the
        quantized model to ``../onnx-models/Schedulers-quant/``. Returns ``None``.
    """
    input_names = ["noise_pred", "latents", "ets1", "ets2", "ets3"]
    float_model = f'../onnx-models/Schedulers/step-{KEY}.onnx'
    quant_model = f'../onnx-models/Schedulers-quant/step-{KEY}.onnx'

    # Create both target directories up front so this helper does not rely on
    # another cell (or on onnx_export) having made them.
    for target in (float_model, quant_model):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    onnx_export(
        model,
        model_args=tuple(
            torch.randn([1, 4, 64, 64]).to(device=device, dtype=dtype)
            for _ in input_names
        ),
        output_path=Path(float_model),
        ordered_input_names=input_names,
        output_names=["prev_cur_latents"],  # has to be different from "sample" for correct tracing
        # The previous key "ets" matched none of the declared input names, so
        # no axis was actually dynamic. Mark the batch axis on every real input.
        dynamic_axes={name: {0: "batch"} for name in input_names},
        opset=14,
        # NOTE(review): flag inherited from a UNet export; kept so the on-disk
        # layout of the exported files does not change.
        use_external_data_format=True,
    )
    quantize_dynamic(
        model_input=float_model,
        model_output=quant_model,
        per_channel=False,
        reduce_range=False,
        weight_type=QuantType.QUInt8,
    )

`t` = 981

In [9]:
class Scheduler_step981(nn.Module):
    """Scheduler step fixed to the transition from timestep 981 to 961.

    ``forward`` returns the updated sample together with the incoming latents,
    which are passed through untouched as a second output.
    """

    def __init__(self, scheduler, device):
        super().__init__()
        self.scheduler = scheduler
        # Configure the wrapped scheduler for 50 inference steps on `device`.
        self.scheduler.set_timesteps(50, device=device)
        # Keep direct references to the two alpha tables read by the update rule.
        self.alphas_cumprod = scheduler.alphas_cumprod
        self.final_alpha_cumprod = scheduler.final_alpha_cumprod

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        """Apply the "formula (9)" update from `timestep` to `prev_timestep`.

        A negative `prev_timestep` selects `final_alpha_cumprod` instead of a
        table lookup.
        """
        a_cur = self.alphas_cumprod[timestep]
        if prev_timestep >= 0:
            a_prev = self.alphas_cumprod[prev_timestep]
        else:
            a_prev = self.final_alpha_cumprod
        b_cur = 1 - a_cur
        b_prev = 1 - a_prev

        scale = (a_prev / a_cur) ** (0.5)
        # Denominator of e_θ(x_t, t) in formula (9).
        denom = a_cur * b_prev ** (0.5) + (a_cur * b_cur * a_prev) ** (0.5)
        # Full formula (9), split into its two terms.
        scaled_sample = scale * sample
        correction = (a_prev - a_cur) * model_output / denom
        return scaled_sample - correction

    def forward(self, noise_pred, latents):
        """Return ``(prev_sample, latents)`` for the 981 -> 961 step."""
        updated = self._get_prev_sample(latents, 981, 961, noise_pred)
        return updated, latents

# Output directories for the float exports and their quantized counterparts.
os.makedirs('../onnx-models/Schedulers_t/', exist_ok=True)
os.makedirs('../onnx-models/Schedulers-quant/', exist_ok=True)

# Export the first scheduler step (t = 981), then quantize the exported file.
KEY = 981
onnx_export(
    Scheduler_step981(pipeline.scheduler, device),
    model_args=(
        torch.randn([1, 4, 64, 64]).to(device=device, dtype=dtype),  # noise_pred
        torch.randn([1, 4, 64, 64]).to(device=device, dtype=dtype),  # latents
    ),
    output_path=Path(f'../onnx-models/Schedulers_t/step-{KEY}.onnx'),
    ordered_input_names=["noise_pred", "latents"],
    output_names=["prev_cur_latents", "cur_sample"],  # has to be different from "sample" for correct tracing
    # Both inputs take part in the same element-wise update, so both get a
    # dynamic batch axis. Previously only "noise_pred" did, which pinned
    # "latents" to batch size 1.
    dynamic_axes={
        "noise_pred": {0: "batch"},
        "latents": {0: "batch"},
    },
    opset=14,
    # NOTE(review): flag inherited from a UNet export; kept so the on-disk
    # layout of the exported files does not change.
    use_external_data_format=True,
)
quantize_dynamic(
    model_input=f'../onnx-models/Schedulers_t/step-{KEY}.onnx',
    model_output=f'../onnx-models/Schedulers-quant/step-{KEY}.onnx',
    per_channel=False,
    reduce_range=False,
    weight_type=QuantType.QUInt8,
)



ONNX export Start🚗
ONNX export Finish🍷


`t` = 961

In [10]:
class Scheduler_step961_1(nn.Module):
    """First of the two t=961 steps.

    Averages the new noise prediction with the stored one (`ets`) and applies
    the 981 -> 961 update to `cur_sample`.
    """

    def __init__(self, scheduler, device):
        super().__init__()
        self.scheduler = scheduler
        # Configure the wrapped scheduler for 50 inference steps on `device`.
        self.scheduler.set_timesteps(50, device=device)
        # Keep direct references to the two alpha tables read by the update rule.
        self.alphas_cumprod = scheduler.alphas_cumprod
        self.final_alpha_cumprod = scheduler.final_alpha_cumprod

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        """Apply the "formula (9)" update from `timestep` to `prev_timestep`.

        A negative `prev_timestep` selects `final_alpha_cumprod` instead of a
        table lookup.
        """
        a_cur = self.alphas_cumprod[timestep]
        if prev_timestep >= 0:
            a_prev = self.alphas_cumprod[prev_timestep]
        else:
            a_prev = self.final_alpha_cumprod
        b_cur = 1 - a_cur
        b_prev = 1 - a_prev

        scale = (a_prev / a_cur) ** (0.5)
        # Denominator of e_θ(x_t, t) in formula (9).
        denom = a_cur * b_prev ** (0.5) + (a_cur * b_cur * a_prev) ** (0.5)
        # Full formula (9), split into its two terms.
        scaled_sample = scale * sample
        correction = (a_prev - a_cur) * model_output / denom
        return scaled_sample - correction

    def forward(self, noise_pred, cur_sample, ets):
        """Return the updated sample from the mean of `noise_pred` and `ets`."""
        averaged = (noise_pred + ets) / 2
        return self._get_prev_sample(cur_sample, 981, 961, averaged)


class Scheduler_step961_2(nn.Module):
    """Second of the two t=961 steps.

    Combines the new noise prediction with one stored prediction (`ets`) as
    ``(3 * noise_pred - ets) / 2`` and applies the 961 -> 941 update.
    """

    def __init__(self, scheduler, device):
        super().__init__()
        self.scheduler = scheduler
        # Configure the wrapped scheduler for 50 inference steps on `device`.
        self.scheduler.set_timesteps(50, device=device)
        # Keep direct references to the two alpha tables read by the update rule.
        self.alphas_cumprod = scheduler.alphas_cumprod
        self.final_alpha_cumprod = scheduler.final_alpha_cumprod

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        """Apply the "formula (9)" update from `timestep` to `prev_timestep`.

        A negative `prev_timestep` selects `final_alpha_cumprod` instead of a
        table lookup.
        """
        a_cur = self.alphas_cumprod[timestep]
        a_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod

        b_cur = 1 - a_cur
        b_prev = 1 - a_prev

        scale = (a_prev / a_cur) ** (0.5)
        # Denominator of e_θ(x_t, t) in formula (9).
        denom = a_cur * b_prev ** (0.5) + (a_cur * b_cur * a_prev) ** (0.5)
        # Full formula (9), split into its two terms.
        scaled_sample = scale * sample
        correction = (a_prev - a_cur) * model_output / denom
        updated = scaled_sample - correction

        # a_cur // a_cur evaluates to 1 for a non-zero alpha, so the values are unchanged.
        # NOTE(review): presumably a tracing/export workaround — confirm before removing.
        unit = a_cur // a_cur
        return updated * unit

    def forward(self, noise_pred, sample, ets):
        """Return the updated sample for the 961 -> 941 step."""
        combined = (3 * noise_pred - ets) / 2
        return self._get_prev_sample(sample, 961, 941, combined)

# --------------------------------------
def onnx_conver_N_quant(model, key):
    """Export a three-input scheduler-step module to ONNX and quantize the result.

    Args:
        model: module handed to ``onnx_export``; traced with three random
            ``[1, 4, 64, 64]`` tensors (noise_pred, sample, ets).
        key: value interpolated into both file names (``step-{key}.onnx``).

    The float model goes to ``../onnx-models/Schedulers_t/`` and the quantized
    one to ``../onnx-models/Schedulers-quant/``. Returns ``None``.
    """
    names = ["noise_pred", "sample", "ets"]
    # One random dummy tensor per graph input, drawn in input order.
    dummy_inputs = tuple(
        torch.randn([1,4,64,64]).to(device=device, dtype=dtype) for _ in names
    )
    float_model = f'../onnx-models/Schedulers_t/step-{key}.onnx'

    onnx_export(
        model,
        model_args=dummy_inputs,
        output_path=Path(float_model),
        ordered_input_names=names,
        output_names=["prev_cur_latents"],  # has to be different from "sample" for correct tracing
        # Batch axis is dynamic on every input.
        dynamic_axes={name: {0: "batch"} for name in names},
        opset=14,
        use_external_data_format=True,
    )

    quant_model = f'../onnx-models/Schedulers-quant/step-{key}.onnx'
    quantize_dynamic(
        model_input=float_model,
        model_output=quant_model,
        per_channel=False,
        reduce_range=False,
        weight_type=QuantType.QUInt8,
    )

# Export both t=961 steps. Each key must be paired with its own module: the
# '961_2' file was previously exported from Scheduler_step961_1, which left
# Scheduler_step961_2 unused and wrote the wrong update rule under that name.
onnx_conver_N_quant(Scheduler_step961_1(pipeline.scheduler, device), '961_1')
onnx_conver_N_quant(Scheduler_step961_2(pipeline.scheduler, device), '961_2')



ONNX export Start🚗
ONNX export Finish🍷
ONNX export Start🚗
ONNX export Finish🍷


`t` = 941

In [3]:
class Scheduler_step941(nn.Module):
    """Scheduler step fixed to the transition from timestep 941 to 921.

    Combines the new noise prediction with two stored predictions as
    ``(23 * noise_pred - 16 * ets2 + 5 * ets3) / 12`` before the update.
    """

    def __init__(self, scheduler, device):
        super().__init__()
        self.scheduler = scheduler
        # Configure the wrapped scheduler for 50 inference steps on `device`.
        self.scheduler.set_timesteps(50, device=device)
        # Keep direct references to the two alpha tables read by the update rule.
        self.alphas_cumprod = scheduler.alphas_cumprod
        self.final_alpha_cumprod = scheduler.final_alpha_cumprod

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        """Apply the "formula (9)" update from `timestep` to `prev_timestep`.

        A negative `prev_timestep` selects `final_alpha_cumprod` instead of a
        table lookup.
        """
        a_cur = self.alphas_cumprod[timestep]
        a_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod

        b_cur = 1 - a_cur
        b_prev = 1 - a_prev

        scale = (a_prev / a_cur) ** (0.5)
        # Denominator of e_θ(x_t, t) in formula (9).
        denom = a_cur * b_prev ** (0.5) + (a_cur * b_cur * a_prev) ** (0.5)
        # Full formula (9), split into its two terms.
        scaled_sample = scale * sample
        correction = (a_prev - a_cur) * model_output / denom
        updated = scaled_sample - correction

        # a_cur // a_cur evaluates to 1 for a non-zero alpha, so the values are unchanged.
        # NOTE(review): presumably a tracing/export workaround — confirm before removing.
        unit = a_cur // a_cur
        return updated * unit

    def forward(self, noise_pred, sample, ets2, ets3):
        """Return the updated sample for the 941 -> 921 step."""
        combined = (23 * noise_pred - 16 * ets2 + 5 * ets3) / 12
        return self._get_prev_sample(sample, 941, 921, combined)

# Export the t = 941 scheduler step, then quantize the exported file.
step941_inputs = ["noise_pred", "sample", "ets1", "ets2"]
onnx_export(
    Scheduler_step941(pipeline.scheduler, device),
    model_args=(
        torch.randn([1, 4, 64, 64]).to(device=device, dtype=dtype),  # noise_pred
        torch.randn([1, 4, 64, 64]).to(device=device, dtype=dtype),  # sample
        torch.randn([1, 4, 64, 64]).to(device=device, dtype=dtype),  # ets1
        torch.randn([1, 4, 64, 64]).to(device=device, dtype=dtype),  # ets2
    ),
    output_path=Path('../onnx-models/Schedulers_t/step-941.onnx'),
    ordered_input_names=step941_inputs,
    output_names=["prev_cur_latents"],  # has to be different from "sample" for correct tracing
    # All four inputs take part in the same element-wise update, so all get a
    # dynamic batch axis. Previously only "noise_pred" did, which pinned the
    # other three to batch size 1.
    dynamic_axes={name: {0: "batch"} for name in step941_inputs},
    opset=14,
    # NOTE(review): flag inherited from a UNet export; kept so the on-disk
    # layout of the exported files does not change.
    use_external_data_format=True,
)
quantize_dynamic(
    model_input='../onnx-models/Schedulers_t/step-941.onnx',
    model_output='../onnx-models/Schedulers-quant/step-941.onnx',
    per_channel=False,
    reduce_range=False,
    weight_type=QuantType.QUInt8,
)



ONNX export Start🚗
ONNX export Finish🍷


In [8]:
import onnxruntime
# Open the quantized t=941 model and print the name of each graph input,
# one per line, to check what the exported file exposes.
ort_de = onnxruntime.InferenceSession(f'../onnx-models/Schedulers-quant/step-{941}.onnx')

for input_name in (node.name for node in ort_de.get_inputs()):
    print(input_name)

noise_pred
sample
ets1
ets2


`t` = N; 921 ~ 1

In [59]:
class Scheduler_stepNN(nn.Module):
    """Generic scheduler step that combines the new prediction with three stored ones.

    Args:
        scheduler: object providing ``set_timesteps``, ``alphas_cumprod`` and
            ``final_alpha_cumprod``.
        device: forwarded to ``scheduler.set_timesteps``.
        timestep: timestep this module is fixed to. When ``None`` (the
            default) the notebook-global ``KEY`` is read each time ``forward``
            runs, which is the original behaviour. Passing it explicitly
            removes the dependency on that mutable global.
        step: distance to the previous timestep; defaults to the 20 that was
            previously hard-coded.
    """

    def __init__(self, scheduler, device, timestep=None, step=20):
        super().__init__()
        self.scheduler = scheduler
        # Configure the wrapped scheduler for 50 inference steps on `device`.
        self.scheduler.set_timesteps(50, device=device)
        self.alphas_cumprod = scheduler.alphas_cumprod
        self.final_alpha_cumprod = scheduler.final_alpha_cumprod
        self.timestep = timestep
        self.step = step

    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
        """Apply the "formula (9)" update from `timestep` to `prev_timestep`.

        A negative `prev_timestep` selects `final_alpha_cumprod` instead of a
        table lookup.
        """
        alpha_prod_t = self.alphas_cumprod[timestep]

        if prev_timestep >= 0:
            alpha_prod_t_prev = self.alphas_cumprod[prev_timestep]
        else:
            alpha_prod_t_prev = self.final_alpha_cumprod

        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
        # corresponds to denominator of e_θ(x_t, t) in formula (9)
        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
        ) ** (0.5)
        # full formula (9)
        prev_sample = (
            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
        )

        # alpha // alpha evaluates to 1 for a non-zero alpha, so the values are unchanged.
        # NOTE(review): presumably a tracing/export workaround — confirm before removing.
        return prev_sample * (alpha_prod_t // alpha_prod_t)

    def forward(self, noise_pred, sample, ets2, ets3, ets4):
        """Return the updated sample for the ``timestep -> timestep - step`` transition."""
        model_output = (1 / 24) * (55 * noise_pred - 59 * ets2 + 37 * ets3 - 9 * ets4)
        # Fall back to the global KEY only when no timestep was given at construction.
        timestep = KEY if self.timestep is None else self.timestep
        prev_sample = self._get_prev_sample(sample, timestep, timestep - self.step, model_output)
        return prev_sample
 
# Timestep to export. Scheduler_stepNN.forward reads this global while the
# module is being traced, so it has to be assigned before the export call.
KEY = 881

# Build the step module first, then export and quantize it under this key.
step_module = Scheduler_stepNN(pipeline.scheduler, device)
onnx_conver_NN_quant(step_module, KEY)



ONNX export Start🚗
ONNX export Finish🍷


리스트형으로 stack을 쌓는게 앱에서는 안되기 때문에,
- ets가 없는 경우,
- ets에 이미 1개 저장 (t=961_2)
- ets에 이미 2개 저장 (t=941)
- ets에 이미 3개 저장 (t=921 ~ 1)
이 경우에 입력 값을 각각 받도록 하는게 낫다고 생각함