## Gradio Space + Dynamic Quantization with ONNX Runtime


In [None]:
!pip3 install gradio onnx onnxruntime


In [None]:
import os
import onnx
import gradio as gr

from onnxruntime.quantization import quantize_dynamic, QuantType


def customize_quantize_onnx_model(
    onnx_model_path,
    weight_type,
    optimize_model,
    save_directory,
    calibration_directory=None,
):
    """Dynamically quantize an ONNX model and report the resulting file sizes.

    Args:
        onnx_model_path: Either a filesystem path (``str`` / ``os.PathLike``)
            or an upload object that exposes the path as ``.name``.
        weight_type: A ``QuantType`` member, or the name of one given as a
            string (for example ``"QInt8"``, as sent by the UI dropdown).
        optimize_model: Forwarded to ``quantize_dynamic`` only when the
            installed onnxruntime accepts an ``optimize_model`` keyword.
        save_directory: Directory that receives the quantized model. It is
            created when missing; an empty value means the current working
            directory.
        calibration_directory: Accepted for interface compatibility only.
            Dynamic quantization takes no calibration data, so a non-empty
            value is reported and then ignored.

    Returns:
        A 3-tuple of human-readable strings: the full-precision size in MB,
        the quantized size in MB, and the path of the quantized model.

    Raises:
        ValueError: If no model was supplied, or ``weight_type`` is a string
            that does not name a ``QuantType`` member of the installed
            onnxruntime.
        FileNotFoundError: If the model path does not point to a file.
    """
    import inspect

    if onnx_model_path is None:
        raise ValueError("No ONNX model was provided.")

    # Check for real paths first: pathlib.Path also has a ``.name`` attribute,
    # but it holds only the final path component, not the full path.
    if isinstance(onnx_model_path, (str, os.PathLike)):
        model_path = os.fspath(onnx_model_path)
    else:
        model_path = onnx_model_path.name

    if not os.path.isfile(model_path):
        raise FileNotFoundError(f"ONNX model not found: {model_path}")

    # The UI sends the weight type as text; quantize_dynamic needs the enum.
    if isinstance(weight_type, str):
        resolved_type = QuantType.__members__.get(weight_type)
        if resolved_type is None:
            supported = ", ".join(QuantType.__members__)
            raise ValueError(
                f"Unsupported weight type {weight_type!r}; "
                f"this onnxruntime supports: {supported}"
            )
        weight_type = resolved_type

    save_directory = save_directory or ""
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)

    model_stem = os.path.splitext(os.path.basename(model_path))[0]
    quantized_model_path = os.path.join(
        save_directory, f"{model_stem}-quantized.onnx"
    )

    quantize_kwargs = {"weight_type": weight_type}
    # NOTE(review): newer onnxruntime releases appear to have dropped the
    # ``optimize_model`` keyword; only forward it when the installed version
    # still declares it, so the call works either way.
    if "optimize_model" in inspect.signature(quantize_dynamic).parameters:
        quantize_kwargs["optimize_model"] = optimize_model

    if calibration_directory:
        # quantize_dynamic has no calibration parameter; passing one raised
        # TypeError. Calibration data belongs to static quantization.
        print(
            "Calibration data is not used by dynamic quantization; "
            f"ignoring: {calibration_directory}"
        )

    print("Quantizing...")
    quantize_dynamic(model_path, quantized_model_path, **quantize_kwargs)
    print(f"Quantization completed. Quantized model saved to: {quantized_model_path}")

    bytes_per_mb = 1024 * 1024
    full_precision_size = os.path.getsize(model_path) / bytes_per_mb
    quantized_size = os.path.getsize(quantized_model_path) / bytes_per_mb

    return (
        f"ONNX full precision model size (MB): {full_precision_size}",
        f"ONNX quantized model size (MB): {quantized_size}",
        f"ONNX quantized model saved to: {quantized_model_path}",
    )


# Web UI for the quantization callback. The five inputs map positionally onto
# the parameters of customize_quantize_onnx_model, and the three text outputs
# map onto the three strings it returns (two sizes and the saved path).
#
# Components are taken from the top-level ``gr`` namespace because the
# ``gr.inputs`` namespace is deprecated. The ``interpretation`` argument is
# dropped: it targets model-prediction explanations, which do not apply to a
# file-conversion tool.
#
# NOTE(review): the description below advertises static quantization, but the
# callback only ever calls quantize_dynamic — confirm the intended wording.
iface = gr.Interface(
    fn=customize_quantize_onnx_model,
    inputs=[
        gr.File(label="Upload ONNX Model"),
        gr.Dropdown(
            ["QInt8", "QUInt8", "QInt16", "QUInt16"], label="Weight Type"
        ),
        gr.Checkbox(label="Optimize Model"),
        gr.Textbox(label="Save Directory", lines=1),
        gr.Textbox(label="Calibration Data Directory (optional)", lines=1),
    ],
    outputs=["text", "text", "text"],
    title="ORT Dynamics Quantization Space",
    description="Quantize ONNX models to lower precision using ONNX Runtime + Gradio. This space supports dynamic quantization, which quantizes the model weights at runtime, as well as static quantization, which requires calibration data for precise quantization. Select the weight type (INT8 or INT16), choose whether to optimize the model, and specify the directory to save the quantized model. For static quantization, calibration data will be saved in a separate directory within the specified save directory.",
)

iface.launch()