From ce16bcda09e85f8a15ac14cbf7ba17029fde6e6f Mon Sep 17 00:00:00 2001
From: prrao87
Date: Mon, 24 Apr 2023 19:21:36 +0000
Subject: [PATCH] Fix bug: Include mean pooling for quantized models

* Using a transformers pipeline with mean pooling prior to optimization
  allows us to generate embeddings of similar quality to the original model
* The model is still the same size, but the similarities it predicts are now
  much closer to those of the un-optimized model
---
 dbs/qdrant/onnx_model/onnx_optimizer.py | 143 ++++++++++++++++--------
 1 file changed, 95 insertions(+), 48 deletions(-)

diff --git a/dbs/qdrant/onnx_model/onnx_optimizer.py b/dbs/qdrant/onnx_model/onnx_optimizer.py
index 27eb777..761533f 100644
--- a/dbs/qdrant/onnx_model/onnx_optimizer.py
+++ b/dbs/qdrant/onnx_model/onnx_optimizer.py
@@ -1,33 +1,67 @@
-import os
+"""
+This script is a modified version of the method shown in this blog post:
+https://www.philschmid.de/optimize-sentence-transformers
+
+It uses the ONNX Runtime to dynamically optimize and quantize a sentence transformers model for better CPU performance.
+
+Using the quantized version of `sentence-transformers/multi-qa-MiniLM-L6-cos-v1` allows us to:
+  * Generate sentence embeddings of similar quality to the original model, but with a roughly 1.8x speedup in vectorization time
+  * Reduce the model size from 86 MB to around 63 MB, a roughly 26% reduction in file size
+"""
 from pathlib import Path
 
+import torch
+import torch.nn.functional as F
 from optimum.onnxruntime import ORTModelForCustomTasks, ORTOptimizer, ORTQuantizer
 from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig
-from optimum.pipelines import pipeline
-from transformers import AutoTokenizer
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoModel, AutoTokenizer, Pipeline
 
 
-def convert_to_onnx(model_id: str, output_dir: str) -> Path:
-    """
-    Download Hugging Face model checkpoint and tokenizer, and then
-    convert to ONNX format and save to disk
-    """
-    model = ORTModelForCustomTasks.from_pretrained(model_id, export=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    # Save to local directory
-    Path(output_dir).mkdir(parents=True, exist_ok=True)
-    onnx_path = Path(output_dir)
-    model.save_pretrained(onnx_path)
-    tokenizer.save_pretrained(onnx_path)
-    return onnx_path
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[
+        0
+    ]  # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+        input_mask_expanded.sum(1), min=1e-9
+    )
+
+
+class SentenceEmbeddingPipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        # We don't have any hyperparameters to sanitize
+        preprocess_kwargs = {}
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, inputs):
+        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
+        return encoded_inputs
 
+    def _forward(self, model_inputs):
+        outputs = self.model(**model_inputs)
+        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}
 
-def optimize_onnx_model(onnx_path: Path) -> None:
+    def postprocess(self, model_outputs):
+        # Perform mean pooling
+        sentence_embeddings = mean_pooling(
+            model_outputs["outputs"], model_outputs["attention_mask"]
+        )
+        # Normalize embeddings
+        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+        return sentence_embeddings
+
+
+def optimize_model(model_id: str, onnx_path: Path) -> None:
     """
     Optimize ONNX model for CPU performance
     """
+    model = ORTModelForCustomTasks.from_pretrained(model_id, export=True)
     # Create ORTOptimizer and define optimization configuration
-    optimizer = ORTOptimizer.from_pretrained(onnx_path)
+    optimizer = ORTOptimizer.from_pretrained(model)
+    # Save models to local disk
+    model.save_pretrained(onnx_path)
+    tokenizer.save_pretrained(onnx_path)
     # Set optimization_level = 99 -> enable all optimizations
     optimization_config = OptimizationConfig(optimization_level=99)
     # Apply the optimization configuration to the model
@@ -37,7 +71,7 @@ def optimize_onnx_model(onnx_path: Path) -> None:
     )
 
 
-def quantize_optimized_onnx_model(onnx_path: Path) -> None:
+def quantize_optimized_model(onnx_path: Path) -> None:
     """
     Quantize an already optimized ONNX model for even better CPU performance
     """
@@ -51,39 +85,52 @@ def quantize_optimized_onnx_model(onnx_path: Path) -> None:
     )
 
 
-def get_embedding_pipeline(onnx_path, model_filename: str) -> pipeline:
-    """
-    Create a sentence embedding pipeline using the optimized ONNX model
-    """
-    # Reload tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(onnx_path)
-    optimized_model = ORTModelForCustomTasks.from_pretrained(onnx_path, file_name=model_filename)
-    embedding_pipeline = pipeline("feature-extraction", model=optimized_model, tokenizer=tokenizer)
-    return embedding_pipeline
+def generate_similarities(source_sentence: str, sentences: list[str], pipeline: Pipeline) -> None:
+    source_sentence_embedding = pipeline(source_sentence).tolist()[0]
+    for sentence in sentences:
+        sentence_embedding = pipeline(sentence).tolist()[0]
+        similarity = cosine_similarity([source_sentence_embedding], [sentence_embedding])[0]
+        print(f"Similarity between '{source_sentence}' and '{sentence}': {similarity}")
 
 
-def main(embedding_pipeline: pipeline, text: str) -> None:
+
+def main() -> None:
     """
-    Generate sentence embeddings for the given text using optimized ONNX model
+    Generate optimized and quantized ONNX models from a vanilla sentence transformer model
     """
-    embedding = embedding_pipeline(text)[0][0]
-    print(embedding[:10])
-    print(f"Generated embedding of length {len(embedding)} from '{model_id}'")
+    # Init vanilla sentence transformer pipeline
+    print("---\nLoading vanilla sentence transformer model\n---")
+    vanilla_pipeline = SentenceEmbeddingPipeline(model=vanilla_model, tokenizer=tokenizer)
+    # Print out pairwise similarities
+    generate_similarities(source_sentence, sentences, vanilla_pipeline)
+
+    # Save model to ONNX
+    Path("onnx").mkdir(exist_ok=True)
+    onnx_path = Path("onnx")
+
+    # First, dynamically optimize an existing sentence transformer model
+    optimize_model(model_id, onnx_path)
+    # Next, dynamically quantize the optimized model
+    quantize_optimized_model(onnx_path)
+
+    # Init quantized ONNX pipeline
+    print("---\nLoading quantized ONNX model\n---")
+    model_filename = "model_optimized_quantized.onnx"
+    quantized_model = ORTModelForCustomTasks.from_pretrained(onnx_path, file_name=model_filename)
+    quantized_pipeline = SentenceEmbeddingPipeline(model=quantized_model, tokenizer=tokenizer)
+    # Print out pairwise similarities
+    generate_similarities(source_sentence, sentences, quantized_pipeline)
 
 
 if __name__ == "__main__":
-    text = "This is a fabulous wine with a smooth and fruity finish."
-    model_id = os.environ.get(
-        "EMBEDDING_MODEL_CHECKPOINT",
-        "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
-    )
-    output_dir = "onnx"
-    onnx_path = convert_to_onnx(model_id, output_dir)
-    # First, optimize the ONNX model
-    optimize_onnx_model(onnx_path)
-    # Next, quantize the optimized ONNX model
-    quantize_optimized_onnx_model(onnx_path)
-    embedding_pipeline = get_embedding_pipeline(
-        onnx_path, model_filename="model_optimized_quantized.onnx"
-    )
-    main(embedding_pipeline, text)
+
+    # Example sentences we want sentence embeddings for
+    source_sentence = "I'm very happy"
+    sentences = ["I am so glad", "I'm so sad", "My dog is missing", "The universe is so vast!"]
+
+    model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+    # Load the tokenizer and vanilla model from the Hugging Face model repository
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    vanilla_model = AutoModel.from_pretrained(model_id)
+
+    main()
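Below is a minimal usage sketch (not part of the patch) of how a downstream consumer might load the quantized model that the script writes to `onnx/` and reproduce the mean pooling plus normalization that this fix introduces. The `embed` helper is hypothetical; it mirrors `SentenceEmbeddingPipeline` above and assumes the patched script has already been run, so that the tokenizer files and `model_optimized_quantized.onnx` exist in the local `onnx/` directory.

```python
from pathlib import Path

import torch
import torch.nn.functional as F
from optimum.onnxruntime import ORTModelForCustomTasks
from transformers import AutoTokenizer

# Assumes the patched script has been run: the tokenizer files and the
# quantized ONNX model both live in the local "onnx" directory
onnx_path = Path("onnx")
tokenizer = AutoTokenizer.from_pretrained(onnx_path)
model = ORTModelForCustomTasks.from_pretrained(
    onnx_path, file_name="model_optimized_quantized.onnx"
)


def embed(text: str) -> torch.Tensor:
    # Hypothetical helper mirroring SentenceEmbeddingPipeline: tokenize, run
    # the ONNX model, mean-pool over the attention mask, then L2-normalize
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    token_embeddings = outputs[0]  # first element holds all token embeddings
    mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    pooled = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
    return F.normalize(pooled, p=2, dim=1)


# Since the embeddings are L2-normalized, a dot product equals cosine similarity
query, doc = embed("I'm very happy"), embed("I am so glad")
print(f"Cosine similarity: {float(query @ doc.T):.4f}")
```

Skipping the mean pooling step (the pre-fix behavior) is what made the quantized model's similarities drift from the un-optimized model's, so any consumer of the raw ONNX model needs to apply the same pooling and normalization.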