In [1]:
!pip install "optimum[onnxruntime]==1.5.0" transformers evaluate mkl-include mkl --upgrade


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
pip install -U sentence-transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from pathlib import Path


model_id="sentence-transformers/all-MiniLM-L12-v2"
onnx_path = Path("onnx")

# load vanilla transformers and convert to onnx
model = ORTModelForFeatureExtraction.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)




('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/vocab.txt',
 'onnx/added_tokens.json',
 'onnx/tokenizer.json')

In [4]:
from transformers import Pipeline
import torch.nn.functional as F
import torch

# copied from the model card
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class SentenceEmbeddingPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        # we don't have any hyperameters to sanitize
        preprocess_kwargs = {}
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs):
        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        return encoded_inputs

    def _forward(self, model_inputs):
        outputs = self.model(**model_inputs)
        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}

    def postprocess(self, model_outputs):
        # Perform pooling
        sentence_embeddings = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings


In [5]:
# init pipeline
vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

# run inference
pred = vanilla_emb("Could you assist me in finding my lost card?")

# print an excerpt from the sentence embedding
print(pred)
#     tensor([-0.0631,  0.0426,  0.0037,  0.0377,  0.0414])

print(len(pred[0]))


tensor([[-2.1770e-02,  8.1134e-02, -4.0811e-02,  2.9853e-02,  6.7192e-02,
          3.6648e-02,  4.7331e-03,  1.1305e-01, -1.9011e-03, -4.5925e-02,
          6.5390e-03, -5.6702e-02, -3.4476e-02, -7.9789e-02,  1.2206e-02,
         -5.2425e-02, -1.9497e-02, -4.0211e-03,  1.2840e-02,  5.8772e-02,
         -4.1159e-02,  2.3427e-03, -2.7493e-02,  8.3900e-03,  7.0364e-02,
          7.5814e-02, -1.4560e-02, -5.8473e-03, -3.3281e-02,  2.4247e-02,
          4.9999e-02,  1.7458e-02,  1.0750e-01,  2.8686e-04,  1.1806e-01,
         -3.7810e-02,  1.3165e-02,  5.4331e-02,  2.0283e-02,  3.4679e-02,
         -4.6977e-02, -4.0187e-02,  2.5653e-02, -2.2303e-02,  2.0018e-02,
         -3.7515e-02, -8.6277e-02, -9.5655e-03,  1.2010e-01, -5.5634e-02,
          1.3471e-02,  9.5147e-02, -4.1456e-02,  4.0031e-02,  7.5506e-02,
          2.4554e-02,  1.1460e-01, -5.9549e-02, -2.1779e-02, -6.2141e-02,
          1.2133e-01,  2.4002e-03, -2.1592e-02, -2.6819e-02, -5.4920e-02,
         -3.9290e-02, -3.1928e-02, -2.

In [6]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model)
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the model
optimizer.optimize(
    save_dir=onnx_path,
    optimization_config=optimization_config,
)


PosixPath('onnx')

In [8]:
from optimum.onnxruntime import ORTModelForFeatureExtraction

# load optimized model
model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_optimized.onnx")

# create optimized pipeline
optimized_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)
pred = optimized_emb("Could you assist me in finding my lost card?")
print(pred)
#  tensor([-0.0631,  0.0426,  0.0037,  0.0377,  0.0414])


tensor([[-2.1770e-02,  8.1134e-02, -4.0811e-02,  2.9853e-02,  6.7192e-02,
          3.6648e-02,  4.7332e-03,  1.1305e-01, -1.9011e-03, -4.5925e-02,
          6.5390e-03, -5.6702e-02, -3.4475e-02, -7.9789e-02,  1.2206e-02,
         -5.2425e-02, -1.9497e-02, -4.0211e-03,  1.2840e-02,  5.8772e-02,
         -4.1159e-02,  2.3426e-03, -2.7493e-02,  8.3899e-03,  7.0364e-02,
          7.5814e-02, -1.4560e-02, -5.8473e-03, -3.3281e-02,  2.4247e-02,
          4.9999e-02,  1.7458e-02,  1.0750e-01,  2.8685e-04,  1.1806e-01,
         -3.7810e-02,  1.3165e-02,  5.4331e-02,  2.0283e-02,  3.4679e-02,
         -4.6977e-02, -4.0187e-02,  2.5653e-02, -2.2303e-02,  2.0018e-02,
         -3.7515e-02, -8.6277e-02, -9.5655e-03,  1.2010e-01, -5.5634e-02,
          1.3471e-02,  9.5147e-02, -4.1456e-02,  4.0031e-02,  7.5506e-02,
          2.4554e-02,  1.1460e-01, -5.9550e-02, -2.1779e-02, -6.2141e-02,
          1.2133e-01,  2.4002e-03, -2.1592e-02, -2.6819e-02, -5.4920e-02,
         -3.9290e-02, -3.1928e-02, -2.

In [9]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# create ORTQuantizer and define quantization configuration
dynamic_quantizer = ORTQuantizer.from_pretrained(model)
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

# apply the quantization configuration to the model
model_quantized_path = dynamic_quantizer.quantize(
    save_dir=onnx_path,
    quantization_config=dqconfig,
)


In [10]:
import os

# get model file size
size = os.path.getsize(onnx_path / "model_optimized.onnx")/(1024*1024)
quantized_model = os.path.getsize(onnx_path / "model_optimized_quantized.onnx")/(1024*1024)

print(f"Model file size: {size:.2f} MB")
print(f"Quantized Model file size: {quantized_model:.2f} MB")
#  Model file size: 86.66 MB
#  Quantized Model file size: 63.47 MB


Model file size: 127.29 MB
Quantized Model file size: 81.36 MB


In [12]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

model = ORTModelForFeatureExtraction.from_pretrained(onnx_path,file_name="model_optimized_quantized.onnx")
tokenizer = AutoTokenizer.from_pretrained(onnx_path)

q8_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

pred = q8_emb("Could you assist me in finding my lost card?")
print(pred[0][:5])
# tensor([-0.0567,  0.0111, -0.0110,  0.0450,  0.0447])


tensor([-0.0120,  0.0775, -0.0461,  0.0266,  0.0737])


In [13]:
from datasets import load_dataset
from evaluate import load

eval_dataset = load_dataset("glue","stsb",split="validation")
metric = load('glue', 'stsb')

# creating a subset for faster evaluation
# COMMENT IN to run evaluation on a subset of the dataset
# eval_dataset = eval_dataset.select(range(200))


Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/stsb to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

In [14]:
def compute_sentence_similarity(sentence_1, sentence_2,pipeline):
    embedding_1 = pipeline(sentence_1)
    embedding_2 = pipeline(sentence_2)
    # compute cosine similarity between two sentences
    return torch.nn.functional.cosine_similarity(embedding_1, embedding_2, dim=1)


def evaluate_stsb(example):
  default = compute_sentence_similarity(example["sentence1"], example["sentence2"], vanilla_emb)
  quantized = compute_sentence_similarity(example["sentence1"], example["sentence2"], q8_emb)
  return {
      'reference': (example["label"] - 1) / (5 - 1), # rescale to [0,1]
      'default': float(default),
      'quantized': float(quantized),
      }

# run evaluation
result = eval_dataset.map(evaluate_stsb)

# compute metrics
default_acc = metric.compute(predictions=result["default"], references=result["reference"])
quantized = metric.compute(predictions=result["quantized"], references=result["reference"])

print(f"vanilla model: pearson={default_acc['pearson']}%")
print(f"quantized model: pearson={quantized['pearson']}%")
print(f"The quantized model achieves {round(quantized['pearson']/default_acc['pearson'],2)*100:.2f}% accuracy of the fp32 model")




  0%|          | 0/1500 [00:00<?, ?ex/s]

vanilla model: pearson=0.876909304740316%
quantized model: pearson=0.8725579453450062%
The quantized model achieves 100.00% accuracy of the fp32 model


In [15]:
from time import perf_counter
import numpy as np

payload="Hello, my name is Philipp and I live in Nuremberg, Germany. Currently I am working as a Technical Lead at Hugging Face to democratize artificial intelligence through open source and open science. In the past I designed and implemented cloud-native machine learning architectures for fin-tech and insurance companies. I found my passion for cloud concepts and machine learning 5 years ago. Since then I never stopped learning. Currently, I am focusing myself in the area NLP and how to leverage models like BERT, Roberta, T5, ViT, and GPT2 to generate business value. I cannot wait to see what is next for me"
print(f'Payload sequence length: {len(tokenizer(payload)["input_ids"])}')

def measure_latency(pipe):
    latencies = []
    # warm up
    for _ in range(10):
        _ = pipe(payload)
    # Timed run
    for _ in range(100):
        start_time = perf_counter()
        _ =  pipe(payload)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies,95)
    return f"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f};", time_p95_ms


vanilla_model=measure_latency(vanilla_emb)
quantized_model=measure_latency(q8_emb)

print(f"Vanilla model: {vanilla_model[0]}")
print(f"Quantized model: {quantized_model[0]}")
print(f"Improvement through quantization: {round(vanilla_model[1]/quantized_model[1],2)}x")


Payload sequence length: 128
Vanilla model: P95 latency (ms) - 596.793359249909; Average latency (ms) - 192.25 +\- 177.08;
Quantized model: P95 latency (ms) - 146.46594569976514; Average latency (ms) - 108.61 +\- 26.52;
Improvement through quantization: 4.07x
