In [3]:
from transformers import AutoTokenizer
from optimum.onnxruntime import (
    AutoOptimizationConfig,
    ORTModelForFeatureExtraction,
    ORTOptimizer,
    ORTQuantizer
)
from optimum.onnxruntime.configuration import OptimizationConfig, AutoQuantizationConfig

In [4]:
# Load the tokenizer and export the model to the ONNX format
model_id = "dangvantuan/sentence-camembert-large"
save_dir = "camembert_optimized"

## Optimize

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)

# Load the optimization configuration detailing the optimization we wish to apply
optimization_config = AutoOptimizationConfig.O3()
optimizer = ORTOptimizer.from_pretrained(model)

optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config)

tokenizer_config.json: 100%|████████████████████| 400/400 [00:00<00:00, 755kB/s]
config.json: 100%|█████████████████████████████| 683/683 [00:00<00:00, 3.56MB/s]
sentencepiece.bpe.model: 100%|███████████████| 809k/809k [00:00<00:00, 12.8MB/s]
special_tokens_map.json: 100%|██████████████████| 298/298 [00:00<00:00, 617kB/s]
Framework not specified. Using pt to export to ONNX.
model.safetensors: 100%|███████████████████| 1.35G/1.35G [00:21<00:00, 64.1MB/s]
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.1.2
Overriding 1 configuration item(s)
	- use_cache -> False
Optimizing model...
Configuration saved in camembert_optimized/ort_config.json
Optimized model saved at: camembert_optimized (external data format: False; saved all tensor to one file: True)


PosixPath('camembert_optimized')

## Quantize

In [7]:
onnx_model = ORTModelForFeatureExtraction.from_pretrained(save_dir)

quantizer = ORTQuantizer.from_pretrained(onnx_model)

dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

model_quantized_path = quantizer.quantize(
    save_dir=save_dir+'_quantized',
    quantization_config=dqconfig,
)

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: camembert_optimized_quantized (external data format: False)
Configuration saved in camembert_optimized_quantized/ort_config.json


## Inference

In [18]:
from optimum.pipelines import pipeline

tokenizer = AutoTokenizer.from_pretrained('camembert_optimized_quantized')

model = ORTModelForFeatureExtraction.from_pretrained('camembert_optimized_quantized')

embedding = pipeline("feature-extraction", model=model, tokenizer=tokenizer, accelerator="ort")

The ONNX file model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [19]:
token_embeddings = embedding('Encode this sentence')

In [20]:
len(token_embeddings)

1

In [21]:
len(token_embeddings[0])

6

In [22]:
len(token_embeddings[0][0])

1024

In [23]:
type(token_embeddings[0][0])

list

In [30]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('camembert_optimized_quantized')

model = ORTModelForFeatureExtraction.from_pretrained('camembert_optimized_quantized')


def generate_embeddings(tokenizer, model, sentences):

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    # Perform pooling. In this case, mean pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings

The ONNX file model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [31]:
print("Sentence embeddings:")
sentence_embeddings = generate_embeddings(tokenizer, model, sentences)
print(sentence_embeddings)
print(len(sentence_embeddings[0]))
print(len(sentence_embeddings[1]))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentence embeddings:
tensor([[-0.2923,  0.1488, -0.4010,  ...,  0.1985,  0.3377, -0.0103],
        [ 0.0818, -0.0262, -0.2458,  ...,  0.4726, -0.1112,  0.1105]])
1024
1024
