In [1]:
from transformers import AutoTokenizer
from optimum.onnxruntime import (
    AutoOptimizationConfig,
    ORTModelForFeatureExtraction,
    ORTOptimizer,
    ORTQuantizer
)
from optimum.onnxruntime.configuration import OptimizationConfig, AutoQuantizationConfig

In [2]:
# Load the tokenizer and export the model to the ONNX format
model_id = "BAAI/bge-base-zh"
save_dir = "BAAI/bge-base-zh_optimized"

## Optimize

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)

# Load the optimization configuration detailing the optimization we wish to apply
optimization_config = AutoOptimizationConfig.O3()
optimizer = ORTOptimizer.from_pretrained(model)

optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config)

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/439k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/940 [00:00<?, ?B/s]

Framework not specified. Using pt to export the model.


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.2.0
Overriding 1 configuration item(s)
	- use_cache -> False
Optimizing model...
Configuration saved in BAAI/bge-base-zh_optimized/ort_config.json
Optimized model saved at: BAAI/bge-base-zh_optimized (external data format: False; saved all tensor to one file: True)


PosixPath('BAAI/bge-base-zh_optimized')

## Quantize

In [5]:
onnx_model = ORTModelForFeatureExtraction.from_pretrained(save_dir)

quantizer = ORTQuantizer.from_pretrained(onnx_model)

dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

model_quantized_path = quantizer.quantize(
    save_dir=save_dir+'_quantized',
    quantization_config=dqconfig,
)

You are using a model of type bert to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: BAAI/bge-base-zh_optimized_quantized (external data format: False)
Configuration saved in BAAI/bge-base-zh_optimized_quantized/ort_config.json


## Inference

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-base-zh_optimized_quantized', model_max_length=512)

model = ORTModelForFeatureExtraction.from_pretrained('BAAI/bge-base-zh_optimized_quantized')


def generate_embeddings(tokenizer, model, sentences):

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    # Perform pooling. In this case, mean pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings

You are using a model of type bert to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
The ONNX file model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [8]:
print("Sentence embeddings:")
sentence_embeddings = generate_embeddings(tokenizer, model, sentences)
print(sentence_embeddings)
print(len(sentence_embeddings[0]))
print(len(sentence_embeddings[1]))

Sentence embeddings:
tensor([[ 0.2973,  0.0801,  0.6380,  ..., -0.7224, -0.6210, -0.3201],
        [-0.1563,  0.2827,  0.5223,  ..., -0.8310, -0.6913, -0.2396]])
768
768


In [None]:
!pip install -U sentence-transformers

In [9]:
from sentence_transformers import SentenceTransformer
unoptimized_model = SentenceTransformer(model_id)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
unopt_sent_emb = unoptimized_model.encode(sentences)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
cosine_similarity([sentence_embeddings[1]],[unopt_sent_emb[1]]).item()

0.745256781578064