From ce16bcda09e85f8a15ac14cbf7ba17029fde6e6f Mon Sep 17 00:00:00 2001
From: prrao87
Date: Mon, 24 Apr 2023 19:21:36 +0000
Subject: [PATCH] Fix bug: Include mean pooling for quantized models

* Using a transformers pipeline with mean pooling prior to optimization
  allows us to generate embeddings of similar quality to the original model
* The model is still the same size, but the similarities it predicts are now
  much closer to those of the un-optimized model
---
 dbs/qdrant/onnx_model/onnx_optimizer.py | 143 ++++++++++++++++--------
 1 file changed, 95 insertions(+), 48 deletions(-)

diff --git a/dbs/qdrant/onnx_model/onnx_optimizer.py b/dbs/qdrant/onnx_model/onnx_optimizer.py
index 27eb777..761533f 100644
--- a/dbs/qdrant/onnx_model/onnx_optimizer.py
+++ b/dbs/qdrant/onnx_model/onnx_optimizer.py
@@ -1,33 +1,67 @@
-import os
+"""
+This script is a modified version of the method shown in this blog post:
+https://www.philschmid.de/optimize-sentence-transformers
+
+It uses the ONNX Runtime to dynamically optimize and quantize a sentence transformers model for better CPU performance.
+
+Using the quantized version of `sentence-transformers/multi-qa-MiniLM-L6-cos-v1` allows us to:
+  * Generate sentence embeddings of similar quality to the original model, but with a roughly 1.8x speedup in vectorization time
+  * Reduce the model size from 86 MB to around 63 MB, a roughly 26% reduction in file size
+"""
 from pathlib import Path
 
+import torch
+import torch.nn.functional as F
 from optimum.onnxruntime import ORTModelForCustomTasks, ORTOptimizer, ORTQuantizer
 from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig
-from optimum.pipelines import pipeline
-from transformers import AutoTokenizer
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoModel, AutoTokenizer, Pipeline
 
 
-def convert_to_onnx(model_id: str, output_dir: str) -> Path:
-    """
-    Download Hugging Face model checkpoint and tokenizer, and then
-    convert to ONNX format and save to disk
-    """
-    model = ORTModelForCustomTasks.from_pretrained(model_id, export=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    # Save to local directory
-    Path(output_dir).mkdir(parents=True, exist_ok=True)
-    onnx_path = Path(output_dir)
-    model.save_pretrained(onnx_path)
-    tokenizer.save_pretrained(onnx_path)
-    return onnx_path
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[
+        0
+    ]  # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+        input_mask_expanded.sum(1), min=1e-9
+    )
+
+
+class SentenceEmbeddingPipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        # We don't have any hyperparameters to sanitize
+        preprocess_kwargs = {}
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, inputs):
+        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
+        return encoded_inputs
 
+    def _forward(self, model_inputs):
+        outputs = self.model(**model_inputs)
+        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}
 
-def optimize_onnx_model(onnx_path: Path) -> None:
+    def postprocess(self, model_outputs):
+        # Perform mean pooling
+        sentence_embeddings = mean_pooling(
+            model_outputs["outputs"], model_outputs["attention_mask"]
+        )
+        # Normalize embeddings
+        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+        return sentence_embeddings
+
+
+def optimize_model(model_id: str, onnx_path: Path) -> None:
     """
     Optimize ONNX model for CPU performance
     """
+    model = ORTModelForCustomTasks.from_pretrained(model_id, export=True)
     # Create ORTOptimizer and define optimization configuration
-    optimizer = ORTOptimizer.from_pretrained(onnx_path)
+    optimizer = ORTOptimizer.from_pretrained(model)
+    # Save models to local disk
+    model.save_pretrained(onnx_path)
+    tokenizer.save_pretrained(onnx_path)
     # Set optimization_level = 99 -> enable all optimizations
     optimization_config = OptimizationConfig(optimization_level=99)
     # Apply the optimization configuration to the model
@@ -37,7 +71,7 @@ def optimize_onnx_model(onnx_path: Path) -> None:
     )
 
 
-def quantize_optimized_onnx_model(onnx_path: Path) -> None:
+def quantize_optimized_model(onnx_path: Path) -> None:
     """
     Quantize an already optimized ONNX model for even better CPU performance
     """
@@ -51,39 +85,52 @@ def quantize_optimized_onnx_model(onnx_path: Path) -> None:
     )
 
 
-def get_embedding_pipeline(onnx_path, model_filename: str) -> pipeline:
-    """
-    Create a sentence embedding pipeline using the optimized ONNX model
-    """
-    # Reload tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(onnx_path)
-    optimized_model = ORTModelForCustomTasks.from_pretrained(onnx_path, file_name=model_filename)
-    embedding_pipeline = pipeline("feature-extraction", model=optimized_model, tokenizer=tokenizer)
-    return embedding_pipeline
+def generate_similarities(source_sentence: str, sentences: list[str], pipeline: Pipeline) -> None:
+    source_sentence_embedding = pipeline(source_sentence).tolist()[0]
+    for sentence in sentences:
+        sentence_embedding = pipeline(sentence).tolist()[0]
+        similarity = cosine_similarity([source_sentence_embedding], [sentence_embedding])[0]
+        print(f"Similarity between '{source_sentence}' and '{sentence}': {similarity}")
 
 
-def main(embedding_pipeline: pipeline, text: str) -> None:
+
+def main() -> None:
     """
-    Generate sentence embeddings for the given text using optimized ONNX model
+    Generate optimized and quantized ONNX models from a vanilla sentence transformer model
     """
-    embedding = embedding_pipeline(text)[0][0]
-    print(embedding[:10])
-    print(f"Generated embedding of length {len(embedding)} from '{model_id}'")
+    # Init vanilla sentence transformer pipeline
+    print("---\nLoading vanilla sentence transformer model\n---")
+    vanilla_pipeline = SentenceEmbeddingPipeline(model=vanilla_model, tokenizer=tokenizer)
+    # Print out pairwise similarities
+    generate_similarities(source_sentence, sentences, vanilla_pipeline)
+
+    # Save model to ONNX
+    Path("onnx").mkdir(exist_ok=True)
+    onnx_path = Path("onnx")
+
+    # First, dynamically optimize an existing sentence transformer model
+    optimize_model(model_id, onnx_path)
+    # Next, dynamically quantize the optimized model
+    quantize_optimized_model(onnx_path)
+
+    # Init quantized ONNX pipeline
+    print("---\nLoading quantized ONNX model\n---")
+    model_filename = "model_optimized_quantized.onnx"
+    quantized_model = ORTModelForCustomTasks.from_pretrained(onnx_path, file_name=model_filename)
+    quantized_pipeline = SentenceEmbeddingPipeline(model=quantized_model, tokenizer=tokenizer)
+    # Print out pairwise similarities
+    generate_similarities(source_sentence, sentences, quantized_pipeline)
 
 
 if __name__ == "__main__":
-    text = "This is a fabulous wine with a smooth and fruity finish."
-    model_id = os.environ.get(
-        "EMBEDDING_MODEL_CHECKPOINT",
-        "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
-    )
-    output_dir = "onnx"
-    onnx_path = convert_to_onnx(model_id, output_dir)
-    # First, optimize the ONNX model
-    optimize_onnx_model(onnx_path)
-    # Next, quantize the optimized ONNX model
-    quantize_optimized_onnx_model(onnx_path)
-    embedding_pipeline = get_embedding_pipeline(
-        onnx_path, model_filename="model_optimized_quantized.onnx"
-    )
-    main(embedding_pipeline, text)
+
+    # Example sentences we want sentence embeddings for
+    source_sentence = "I'm very happy"
+    sentences = ["I am so glad", "I'm so sad", "My dog is missing", "The universe is so vast!"]
+
+    model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+    # Load the tokenizer and vanilla model from the Hugging Face model repository
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    vanilla_model = AutoModel.from_pretrained(model_id)
+
+    main()
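Below is a minimal usage sketch (not part of the patch) of how a downstream consumer might load the quantized model that the script writes to `onnx/` and reproduce the mean pooling plus normalization that this fix introduces. The `embed` helper is hypothetical; it mirrors `SentenceEmbeddingPipeline` above and assumes the patched script has already been run, so that the tokenizer files and `model_optimized_quantized.onnx` exist in the local `onnx/` directory.

```python
from pathlib import Path

import torch
import torch.nn.functional as F
from optimum.onnxruntime import ORTModelForCustomTasks
from transformers import AutoTokenizer

# Assumes the patched script has been run: the tokenizer files and the
# quantized ONNX model both live in the local "onnx" directory
onnx_path = Path("onnx")
tokenizer = AutoTokenizer.from_pretrained(onnx_path)
model = ORTModelForCustomTasks.from_pretrained(
    onnx_path, file_name="model_optimized_quantized.onnx"
)


def embed(text: str) -> torch.Tensor:
    # Hypothetical helper mirroring SentenceEmbeddingPipeline: tokenize, run
    # the ONNX model, mean-pool over the attention mask, then L2-normalize
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    token_embeddings = outputs[0]  # first element holds all token embeddings
    mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    pooled = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
    return F.normalize(pooled, p=2, dim=1)


# Since the embeddings are L2-normalized, a dot product equals cosine similarity
query, doc = embed("I'm very happy"), embed("I am so glad")
print(f"Cosine similarity: {float(query @ doc.T):.4f}")
```

Skipping the mean pooling step (the pre-fix behavior) is what made the quantized model's similarities drift from the un-optimized model's, so any consumer of the raw ONNX model needs to apply the same pooling and normalization.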