## Benchmark ONNX

In [3]:
import numpy as np
import torch
import transformers
import os
from pathlib import Path
import time
import numpy as np
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer, ORTQuantizer
from optimum.onnxruntime.configuration import OptimizationConfig, AutoQuantizationConfig
from transformers import AutoTokenizer, Pipeline
import torch
import torch.nn.functional as F
import pdb
from tqdm import tqdm
import utils

In [4]:
# Model identifier handed to `from_pretrained` when exporting to ONNX.
model_name = "sentence-transformers/all-MiniLM-L12-v2"

# Directory where the exported, optimized and quantized ONNX files are saved.
cache_dir = "onnx_cache"

# Forwarded as `num_docs` to `utils.run_speed_test` in the benchmark cell.
NUM_DOCS = 100

## Create ONNX Model

In [5]:

def process_and_optimize_model(model_name, cache_dir):
    """Export a model to ONNX and build optimized and quantized variants of it.

    The exported model and its tokenizer are saved to ``cache_dir``; the
    optimized and the dynamically quantized ONNX files are then written to
    the same directory and loaded back from it.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both
            the ONNX export and the tokenizer.
        cache_dir: Directory (str or path-like) that receives all saved
            artifacts. Created, including parents, if it does not exist.

    Returns:
        A tuple ``(optimized_model, quantized_model, tokenizer)`` where the
        two models are loaded from ``model_optimized.onnx`` and
        ``model_optimized_quantized.onnx`` inside ``cache_dir``.
    """
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Load vanilla transformers and convert to onnx.
    # `export=True` replaces `from_transformers=True`, which is deprecated
    # and scheduled for removal in optimum 2.0.
    model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Save onnx checkpoint and tokenizer
    model.save_pretrained(cache_dir)
    tokenizer.save_pretrained(cache_dir)

    # Optimize the model.
    # NOTE(review): ONNX Runtime warns that a model serialized at this
    # optimization level may contain hardware-specific optimizations and
    # should only be used in the environment it was optimized in.
    optimizer = ORTOptimizer.from_pretrained(model)
    optimization_config = OptimizationConfig(optimization_level=99)

    optimizer.optimize(
        save_dir=cache_dir,
        optimization_config=optimization_config,
    )

    # Load optimized model
    optimized_model = ORTModelForFeatureExtraction.from_pretrained(cache_dir, file_name="model_optimized.onnx")

    # Quantize the optimized model dynamically (is_static=False).
    # NOTE(review): the avx512_vnni preset presumably targets CPUs with that
    # instruction set — confirm it matches the benchmark machine.
    dynamic_quantizer = ORTQuantizer.from_pretrained(optimized_model)
    dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

    # The return value of quantize() is not needed: the quantized file is
    # loaded back from cache_dir by name below.
    dynamic_quantizer.quantize(
        save_dir=cache_dir,
        quantization_config=dqconfig,
    )

    # Load quantized model
    quantized_model = ORTModelForFeatureExtraction.from_pretrained(cache_dir, file_name="model_optimized_quantized.onnx")

    return optimized_model, quantized_model, tokenizer

optimized_model, quantized_model, tokenizer = process_and_optimize_model(model_name, cache_dir)


The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead
Framework not specified. Using pt to export the model.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.

***** Exporting submodel 1/1: BertModel *****
Using framework PyTorch: 2.2.1
Overriding 1 configuration item(s)
	- use_cache -> False
Optimizing model...
2024-08-25 11:40:01.365701 [W:onnxruntime:, inference_session.cc:1732 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.
Configuration saved in onnx_cache/ort_config.json
Optimized model saved at: onnx_cache (external data format: False; saved all tensor to one file: True)
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise

### Benchmark

In [6]:
# Time both ONNX variants with the same tokenizer and document count.
# Each call's result is unpacked as (mean, median, std).
opt_mean, opt_median, opt_std = utils.run_speed_test(
    optimized_model,
    tokenizer,
    num_docs=NUM_DOCS,
)
quant_mean, quant_median, quant_std = utils.run_speed_test(
    quantized_model,
    tokenizer,
    num_docs=NUM_DOCS,
)

# Report the summary statistics for the optimized model...
print("Optimized Model:")
print(f"Mean: {opt_mean:.4f}s, Median: {opt_median:.4f}s, Std: {opt_std:.4f}s")

# ...and for the quantized model, separated by a blank line.
print("\nQuantized Model:")
print(f"Mean: {quant_mean:.4f}s, Median: {quant_median:.4f}s, Std: {quant_std:.4f}s")

  0%|          | 0/100 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 100/100 [00:21<00:00,  4.73it/s]
100%|██████████| 100/100 [00:24<00:00,  4.08it/s]

Optimized Model:
Mean: 0.2102s, Median: 0.2041s, Std: 0.0229s

Quantized Model:
Mean: 0.2439s, Median: 0.2258s, Std: 0.0631s



