# Porting to ONNX

This notebook demonstrates how to port models from Transformers/PyTorch package to ONNX. It is based on the [Optimum](https://github.com/huggingface/optimum) library.

## Installation

We use [poetry](https://python-poetry.org/docs/cli) to manage dependencies. To install the dependencies, run:

```bash
poetry install
```

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
from typing import List, Tuple, Any

import numpy as np
import time
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.
from optimum.onnxruntime import AutoOptimizationConfig, ORTModelForFeatureExtraction, ORTOptimizer, ORTModel
from optimum.pipelines import pipeline
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the tokenizer and PyTorch model from HuggingFace Transformers
model_id = "BAAI/bge-small-en-v1.5"

hf_model = AutoModel.from_pretrained(model_id)
hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
quantize = True
if quantize:
    repository_id = f"Qdrant/{model_id.split('/')[1]}-onnx-Q"
else:
    repository_id = f"Qdrant/{model_id.split('/')[1]}-onnx"

save_dir = f"local_cache/{repository_id}"

In [4]:
# The input texts can be in any language, not just English.
# Each input text should start with "query: " or "passage: ", even for non-English texts.
# For tasks other than retrieval, you can simply use the "query: " prefix.
multilingual_queries = [
    "query: how much protein should a female eat",
    "query: 南瓜的家常做法",
    "query: भारत का राष्ट्रीय खेल कौन-सा है?",  # Hindi text
    "query: భారత్ దేశంలో రాష్ట్రపతి ఎవరు?",  # Telugu text
    "query: இந்தியாவின் தேசிய கோப்பை எது?",  # Tamil text
    "query: ಭಾರತದಲ್ಲಿ ರಾಷ್ಟ್ರಪತಿ ಯಾರು?",  # Kannada text
    "query: ഇന്ത്യയുടെ രാഷ്ട്രീയ ഗാനം എന്താണ്?",  # Malayalam text
]

english_texts = [
    "India: Where the Taj Mahal meets spicy curry.",
    "Machine Learning: Turning data into knowledge, one algorithm at a time.",
    "Python: The language that makes programming a piece of cake.",
    "fastembed: Accelerating embeddings for lightning-fast similarity search.",
    "Qdrant: The ultimate tool for high-dimensional indexing and search.",
]

In [5]:
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def hf_embed(model_id: str, inputs: List[str]):
    # Tokenize the input texts
    batch_dict = hf_tokenizer(inputs, max_length=512, padding=True, truncation=True, return_tensors="pt")

    outputs = hf_model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])

    # normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings.detach().numpy()

In [6]:
hf_embed(inputs=english_texts, model_id=model_id)

array([[ 0.05485763,  0.08136623, -0.00395789, ...,  0.02512371,
        -0.03349504, -0.0593129 ],
       [ 0.01078518,  0.01582215,  0.04614557, ..., -0.01674951,
        -0.00244641, -0.06179965],
       [-0.06607923, -0.01235531, -0.00689854, ...,  0.10634594,
         0.12025263, -0.05135345],
       [-0.07568254,  0.00908228, -0.02221818, ...,  0.00177038,
        -0.0325426 ,  0.05233581],
       [-0.07008213,  0.02070545,  0.02720274, ..., -0.01158645,
        -0.01457597,  0.01262206]], dtype=float32)

## Load the model using ORTModelForFeatureExtraction

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)

# Remove all existing files in the save_dir using Path.unlink()
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
for p in save_dir.iterdir():
    p.unlink()

# Load the optimization configuration detailing the optimization we wish to apply
optimization_config = AutoOptimizationConfig.O4()
optimizer = ORTOptimizer.from_pretrained(model)

optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config, use_external_data_format=True)
model = ORTModelForFeatureExtraction.from_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.1.2
Overriding 1 configuration item(s)
	- use_cache -> False
The argument use_external_data_format in the ORTOptimizer.optimize() method is deprecated and will be removed in optimum 2.0.
Optimizing model...
There is no gpu for onnxruntime to do optimization.
Configuration saved in local_cache/Qdrant/bge-small-en-v1.5-onnx-Q/ort_config.json
Optimized model saved at: local_cache/Qdrant/bge-small-en-v1.5-onnx-Q (external data format: False; saved all tensor to one file: True)


('local_cache/Qdrant/bge-small-en-v1.5-onnx-Q/tokenizer_config.json',
 'local_cache/Qdrant/bge-small-en-v1.5-onnx-Q/special_tokens_map.json',
 'local_cache/Qdrant/bge-small-en-v1.5-onnx-Q/vocab.txt',
 'local_cache/Qdrant/bge-small-en-v1.5-onnx-Q/added_tokens.json',
 'local_cache/Qdrant/bge-small-en-v1.5-onnx-Q/tokenizer.json')

In [10]:
model.push_to_hub(save_directory=save_dir, repository_id=repository_id, use_auth_token=True)

model_optimized.onnx: 100%|██████████| 66.5M/66.5M [00:09<00:00, 7.37MB/s]


## Trying out the model from Huggingface Hub

In [14]:
onnx_model = ORTModelForFeatureExtraction.from_pretrained(repository_id)
onnx_tokenizer = AutoTokenizer.from_pretrained(repository_id)

In [15]:
onnx_quant_embed = pipeline("feature-extraction", model=onnx_model, accelerator="ort", tokenizer=onnx_tokenizer,return_tensors=True)
embeddings = onnx_quant_embed(inputs=english_texts)
F.normalize(embeddings[4])[:,0], english_texts[4], len(embeddings), len(english_texts)

(tensor([[-0.2167,  0.0514,  0.0928,  0.1594,  0.2467,  0.3481, -0.0795,  0.1916,
           0.2227, -0.1297,  0.2020, -0.1873,  0.2221,  0.3651,  0.2194, -0.0692,
           0.1239,  0.2137,  0.0195, -0.2582,  0.2084, -0.1736, -0.0366, -0.2664,
          -0.2339,  0.2233, -0.0657, -0.2686, -0.2866, -0.2278,  0.0309,  0.0677,
           0.2661,  0.1537, -0.0069, -0.3319, -0.3038,  0.2219,  0.3027, -0.2240,
          -0.0523,  0.1749, -0.2705,  0.1487, -0.3244, -0.2069, -0.2114, -0.1821,
          -0.1516,  0.2255, -0.2053, -0.2625, -0.0964,  0.3533,  0.2315,  0.1583,
           0.2405, -0.1198, -0.2908,  0.0707,  0.1949,  0.2105, -0.1731,  0.2771,
           0.2203, -0.1494,  0.0959, -0.1590, -0.1761, -0.0311,  0.3467,  0.2385,
           0.0964,  0.1245,  0.0470, -0.1691, -0.1228, -0.2064, -0.1982, -0.2398,
           0.0165,  0.0306, -0.1663, -0.0887, -0.1120, -0.2306,  0.1256, -0.2352,
           0.1686, -0.4168, -0.1018, -0.1619, -0.1757, -0.3001, -0.2155, -0.2885,
           0.186