Fix bug: Include mean pooling for quantized models
* Wrapping the model in a transformers pipeline that applies mean pooling prior to optimization allows us to generate embeddings of similar quality to the original model's
* The quantized model is still the same size, but the similarities it predicts now closely track those of the un-optimized model
prrao87 committed Apr 24, 2023
1 parent db183bd commit ce16bcd
Showing 1 changed file with 95 additions and 48 deletions.
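
The gist of the fix, before the diff: a plain feature-extraction forward pass returns one vector per token, so sentences of different lengths can't be compared directly. Masked mean pooling averages the token vectors (ignoring padding) into a single fixed-size sentence embedding. Below is a minimal standalone sketch of that operation, separate from the commit itself, with made-up tensor shapes for illustration:

import torch

# Hypothetical token embeddings: batch of 1, 4 tokens, hidden size 6;
# the last position is padding and must not count toward the average
token_embeddings = torch.randn(1, 4, 6)
attention_mask = torch.tensor([[1, 1, 1, 0]])

# Zero out padded positions, then average over the real tokens only
mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(sentence_embedding.shape)  # torch.Size([1, 6])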
143 changes: 95 additions & 48 deletions dbs/qdrant/onnx_model/onnx_optimizer.py
@@ -1,33 +1,67 @@
-import os
+"""
+This script is a modified version of the method shown in this blog post:
+https://www.philschmid.de/optimize-sentence-transformers
+
+It uses the ONNX Runtime to dynamically optimize and quantize a sentence transformers model for better CPU performance.
+
+Using the quantized version of `sentence-transformers/multi-qa-MiniLM-L6-cos-v1` allows us to:
+* Generate similar quality sentence embeddings as the original model, but with a roughly 1.8x speedup in vectorization time
+* Reduce the model size from 86 MB to around 63 MB, a roughly 26% reduction in file size
+"""
 from pathlib import Path

+import torch
+import torch.nn.functional as F
 from optimum.onnxruntime import ORTModelForCustomTasks, ORTOptimizer, ORTQuantizer
 from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig
-from optimum.pipelines import pipeline
-from transformers import AutoTokenizer
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoModel, AutoTokenizer, Pipeline


-def convert_to_onnx(model_id: str, output_dir: str) -> Path:
-    """
-    Download Hugging Face model checkpoint and tokenizer, and then
-    convert to ONNX format and save to disk
-    """
-    model = ORTModelForCustomTasks.from_pretrained(model_id, export=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    # Save to local directory
-    Path(output_dir).mkdir(parents=True, exist_ok=True)
-    onnx_path = Path(output_dir)
-    model.save_pretrained(onnx_path)
-    tokenizer.save_pretrained(onnx_path)
-    return onnx_path
+def mean_pooling(model_output, attention_mask):
+    # First element of model_output contains all token embeddings
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+        input_mask_expanded.sum(1), min=1e-9
+    )


+class SentenceEmbeddingPipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        # We don't have any hyperparameters to sanitize
+        preprocess_kwargs = {}
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, inputs):
+        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
+        return encoded_inputs
+
+    def _forward(self, model_inputs):
+        outputs = self.model(**model_inputs)
+        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}
+
+    def postprocess(self, model_outputs):
+        # Perform mean pooling over the token embeddings
+        sentence_embeddings = mean_pooling(
+            model_outputs["outputs"], model_outputs["attention_mask"]
+        )
+        # Normalize embeddings
+        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+        return sentence_embeddings


-def optimize_onnx_model(onnx_path: Path) -> None:
+def optimize_model(model_id: str, onnx_path: Path) -> None:
     """
     Optimize ONNX model for CPU performance
     """
+    model = ORTModelForCustomTasks.from_pretrained(model_id, export=True)
     # Create ORTOptimizer and define optimization configuration
-    optimizer = ORTOptimizer.from_pretrained(onnx_path)
+    optimizer = ORTOptimizer.from_pretrained(model)
+    # Save models to local disk
+    model.save_pretrained(onnx_path)
+    tokenizer.save_pretrained(onnx_path)
     # Set optimization_level = 99 -> enable all optimizations
     optimization_config = OptimizationConfig(optimization_level=99)
     # Apply the optimization configuration to the model
@@ -37,7 +71,7 @@ def optimize_onnx_model(onnx_path: Path) -> None:
     )


-def quantize_optimized_onnx_model(onnx_path: Path) -> None:
+def quantize_optimized_model(onnx_path: Path) -> None:
     """
     Quantize an already optimized ONNX model for even better CPU performance
     """
@@ -51,39 +85,52 @@ def quantize_optimized_onnx_model(onnx_path: Path) -> None:
     )


-def get_embedding_pipeline(onnx_path, model_filename: str) -> pipeline:
-    """
-    Create a sentence embedding pipeline using the optimized ONNX model
-    """
-    # Reload tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(onnx_path)
-    optimized_model = ORTModelForCustomTasks.from_pretrained(onnx_path, file_name=model_filename)
-    embedding_pipeline = pipeline("feature-extraction", model=optimized_model, tokenizer=tokenizer)
-    return embedding_pipeline
+def generate_similarities(source_sentence: str, sentences: list[str], pipeline: Pipeline) -> None:
+    # Embed the source sentence once, then compare it against each candidate
+    source_sentence_embedding = pipeline(source_sentence).tolist()[0]
+
+    for sentence in sentences:
+        sentence_embedding = pipeline(sentence).tolist()[0]
+        similarity = cosine_similarity([source_sentence_embedding], [sentence_embedding])[0]
+        print(f"Similarity between '{source_sentence}' and '{sentence}': {similarity}")


-def main(embedding_pipeline: pipeline, text: str) -> None:
+def main() -> None:
     """
-    Generate sentence embeddings for the given text using optimized ONNX model
+    Generate optimized and quantized ONNX models from a vanilla sentence transformer model
     """
-    embedding = embedding_pipeline(text)[0][0]
-    print(embedding[:10])
-    print(f"Generated embedding of length {len(embedding)} from '{model_id}'")
+    # Init vanilla sentence transformer pipeline
+    print("---\nLoading vanilla sentence transformer model\n---")
+    vanilla_pipeline = SentenceEmbeddingPipeline(model=vanilla_model, tokenizer=tokenizer)
+    # Print out pairwise similarities
+    generate_similarities(source_sentence, sentences, vanilla_pipeline)
+
+    # Save model to ONNX
+    Path("onnx").mkdir(exist_ok=True)
+    onnx_path = Path("onnx")
+
+    # First, dynamically optimize an existing sentence transformer model
+    optimize_model(model_id, onnx_path)
+    # Next, dynamically quantize the optimized model
+    quantize_optimized_model(onnx_path)
+
+    # Init quantized ONNX pipeline
+    print("---\nLoading quantized ONNX model\n---")
+    model_filename = "model_optimized_quantized.onnx"
+    quantized_model = ORTModelForCustomTasks.from_pretrained(onnx_path, file_name=model_filename)
+    quantized_pipeline = SentenceEmbeddingPipeline(model=quantized_model, tokenizer=tokenizer)
+    # Print out pairwise similarities
+    generate_similarities(source_sentence, sentences, quantized_pipeline)


 if __name__ == "__main__":
-    text = "This is a fabulous wine with a smooth and fruity finish."
-    model_id = os.environ.get(
-        "EMBEDDING_MODEL_CHECKPOINT",
-        "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
-    )
-    output_dir = "onnx"
-    onnx_path = convert_to_onnx(model_id, output_dir)
-    # First, optimize the ONNX model
-    optimize_onnx_model(onnx_path)
-    # Next, quantize the optimized ONNX model
-    quantize_optimized_onnx_model(onnx_path)
-    embedding_pipeline = get_embedding_pipeline(
-        onnx_path, model_filename="model_optimized_quantized.onnx"
-    )
-    main(embedding_pipeline, text)
+    # Example sentences we want sentence embeddings for
+    source_sentence = "I'm very happy"
+    sentences = ["I am so glad", "I'm so sad", "My dog is missing", "The universe is so vast!"]
+
+    model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+    # Load the tokenizer and vanilla model from the Hugging Face model hub
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    vanilla_model = AutoModel.from_pretrained(model_id)
+
+    main()

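Note: the two collapsed hunks above (@@ -37,7 +71,7 @@ and @@ -51,39 +85,52 @@) hide the bodies of the optimize and quantize calls, so their exact lines are not visible on this page. The blog post cited in the module docstring uses the pattern sketched below, and the file name it produces (model_optimized_quantized.onnx) matches the one main() loads, but treat the config choices here as assumptions rather than the commit's literal code:

from optimum.onnxruntime import ORTModelForCustomTasks, ORTOptimizer, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig

model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
model = ORTModelForCustomTasks.from_pretrained(model_id, export=True)

# Graph-level optimization: writes model_optimized.onnx into onnx/
optimizer = ORTOptimizer.from_pretrained(model)
optimizer.optimize(save_dir="onnx", optimization_config=OptimizationConfig(optimization_level=99))

# Dynamic int8 quantization of the optimized graph: writes model_optimized_quantized.onnx
quantizer = ORTQuantizer.from_pretrained("onnx", file_name="model_optimized.onnx")
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
quantizer.quantize(save_dir="onnx", quantization_config=dqconfig)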