In [1]:
# Cell 1: Check Model Structure
import onnx
import onnxruntime as ort

# Load and check the model
model = onnx.load("vit_fairface_best_quantized_nc.onnx")
print("Model inputs:", [input.name for input in model.graph.input])
print("Model outputs:", [output.name for output in model.graph.output])

# Try to create a session to verify
session = ort.InferenceSession(
    "vit_fairface_best_quantized_nc.onnx",
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
)
print("\nAvailable providers:", session.get_providers())
print("Current provider:", session.get_provider_options())

Model inputs: ['pixel_values']
Model outputs: ['logits']





Available providers: ['CPUExecutionProvider']
Current provider: {'CPUExecutionProvider': {}}


VIT INT8 SmoothQuant Applied to base model

In [25]:
import modal
import torch
import torchvision.transforms as transforms
from PIL import Image
import os
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.data import DataLoader
import logging
import time

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define Modal stub
stub = modal.App("vit-quantization")

# Define the image for the Modal container
image = modal.Image.debian_slim().pip_install(
    "torch",
    "torchvision",
    "neural-compressor",
    "onnxruntime-gpu",
    "pillow",
    "onnx"
)

# Create volumes to store our data
fairface_volume = modal.Volume.from_name("fairface-data", create_if_missing=True)
output_volume = modal.Volume.from_name("vit-quantization-volume", create_if_missing=True)

@stub.function(image=image, gpu="T4", volumes={"/fairface": fairface_volume, "/output": output_volume})
def quantize_model(model_path: str, output_path: str):
    logger.info("Starting quantization process...")
    start_time = time.time()

    # Check if model exists in the volume
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")
    else:
        logger.info(f"Found model file at {model_path}, size: {os.path.getsize(model_path)} bytes")

    def create_dataloader(data_dir, batch_size=1):
        logger.info(f"Creating dataloader from directory: {data_dir}")
        if not os.path.exists(data_dir):
            raise FileNotFoundError(f"Calibration directory not found at {data_dir}")
            
        transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                              std=[0.229, 0.224, 0.225])
        ])
        
        class Dataset:
            def __init__(self, data_dir, transform):
                self.data_dir = data_dir
                self.transform = transform
                self.image_files = [f for f in os.listdir(data_dir) 
                                  if f.endswith(('.jpg', '.jpeg', '.png', '.webp'))]
                logger.info(f"Found {len(self.image_files)} images for calibration")
                if len(self.image_files) == 0:
                    raise ValueError(f"No calibration images found in {data_dir}")
            
            def __len__(self):
                return len(self.image_files)
            
            def __getitem__(self, idx):
                image_path = os.path.join(self.data_dir, self.image_files[idx])
                image = Image.open(image_path).convert('RGB')
                image = self.transform(image)
                return image, 0  # Dummy label
        
        dataset = Dataset(data_dir, transform)
        return DataLoader(framework='pytorch', dataset=dataset, batch_size=batch_size)

    # Create dataloader for calibration
    calibration_dir = "/fairface/calibration_images"
    logger.info("Creating calibration dataloader...")
    dataloader = create_dataloader(calibration_dir)
    
    # Configure quantization with simpler settings
    logger.info("Configuring quantization parameters...")
    conf = PostTrainingQuantConfig(
        device='gpu',
        backend='onnxrt_cuda_ep',
        approach='static',
        calibration_sampling_size=[500],
        op_type_dict={
            'Conv': {
                'weight': {'dtype': ['int8']},
                'activation': {'dtype': ['uint8']}
            },
            'MatMul': {
                'weight': {'dtype': ['int8']},
                'activation': {'dtype': ['uint8']}
            },
            'Gemm': {
                'weight': {'dtype': ['int8']},
                'activation': {'dtype': ['uint8']}
            }
        },
        recipes={
            'smooth_quant': True,
            'smooth_quant_args': {
                'alpha': 0.3,
                'folding': True
            },
            'optypes_to_exclude_output_quant': [
                'Softmax',
                'LayerNormalization',
                'Attention',
                'Div',
                'Mul',
                'Add'
            ],
            'add_qdq_pair_to_weight': True,
            'first_conv_or_matmul_quantization': False,
            'last_conv_or_matmul_quantization': False,
            'pre_post_process_quantization': False
        }
    )
    
    # Perform quantization
    logger.info("Starting model quantization...")
    q_model = quantization.fit(
        model=model_path,
        conf=conf,
        calib_dataloader=dataloader
    )
    
    if q_model is None:
        raise RuntimeError("Quantization failed - no model was returned")
    
    # Save the quantized model
    logger.info(f"Saving quantized model to: {output_path}")
    q_model.save(output_path)
    
    end_time = time.time()
    logger.info(f"Quantization completed in {end_time - start_time:.2f} seconds")
    return output_path

# Run the quantization
with stub.run():
    logger.info("Starting quantization process...")
    try:
        # Run quantization using the model from the volume
        remote_output_path = quantize_model.remote(
            "/output/model.onnx",  # Path to the model in the volume
            "/output/quantized_model.onnx"
        )
        
        # Download the result
        logger.info("Downloading quantized model...")
        with open("vit_fairface_best_quantized_nc.onnx", 'wb') as f:
            f.write(output_volume.read_file("/output/quantized_model.onnx"))
        
        logger.info("Quantization successful! Model saved to: vit_fairface_best_quantized_nc.onnx")
    except Exception as e:
        logger.error(f"Error during process: {str(e)}")
        raise

INFO:__main__:Starting quantization process...
INFO:__main__:Downloading quantized model...
ERROR:__main__:Error during process: a bytes-like object is required, not 'generator'


TypeError: a bytes-like object is required, not 'generator'

Ignore above error , The code still works!

Basic INT8 Quantization of Swin Model

In [3]:
import modal
import os
from PIL import Image
import numpy as np
from torchvision import transforms
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType

stub = modal.App("swin-onnx-int8-quant")

image = modal.Image.debian_slim().pip_install(
    "torch",
    "torchvision",
    "onnxruntime-gpu",
    "onnx",
    "pillow"
)

fairface_volume = modal.Volume.from_name("fairface-data", create_if_missing=True)
output_volume = modal.Volume.from_name("vit-quantization-volume", create_if_missing=True)
swinv2_model_volume = modal.Volume.from_name("swinv2-models", create_if_missing=True)

class ImageFolderDataReader(CalibrationDataReader):
    def __init__(self, image_dir, input_name="input"):
        self.image_paths = [
            os.path.join(image_dir, f)
            for f in os.listdir(image_dir)
            if f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp'))
        ]
        self.transform = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.idx = 0
        self.input_name = input_name

    def get_next(self):
        if self.idx >= len(self.image_paths):
            return None
        image = Image.open(self.image_paths[self.idx]).convert('RGB')
        input_tensor = self.transform(image).unsqueeze(0).numpy()
        self.idx += 1
        return {self.input_name: input_tensor}

    def rewind(self):
        self.idx = 0

@stub.function(
    image=image,
    gpu="T4",
    timeout=1800,
    volumes={
        "/fairface": fairface_volume,
        "/output": output_volume,
        "/models": swinv2_model_volume,
    }
)
def quantize_swin_onnx(
    onnx_model_path="/models/swin_fairface_best.onnx",
    quantized_model_path="/output/swin_fairface_best_int8.onnx",
    calibration_dir="/fairface/calibration_images"
):
    import onnx
    # Get the correct input name for your model
    m = onnx.load(onnx_model_path)
    input_name = m.graph.input[0].name

    data_reader = ImageFolderDataReader(calibration_dir, input_name=input_name)
    print(f"Quantizing {onnx_model_path} to {quantized_model_path} using {len(data_reader.image_paths)} calibration images...")

    quantize_static(
        model_input=onnx_model_path,
        model_output=quantized_model_path,
        calibration_data_reader=data_reader,
        weight_type=QuantType.QInt8,
        activation_type=QuantType.QUInt8
     
    )
    print(f"Quantized model saved to {quantized_model_path}")

# Run the quantization
with stub.run():
    quantize_swin_onnx.remote()