In [1]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3

In [2]:
!pip install "optimum[onnxruntime]==1.5.0" transformers evaluate mkl-include mkl --upgrade


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optimum[onnxruntime]==1.5.0
  Downloading optimum-1.5.0-py3-none-any.whl (187 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mkl-include
  Downloading mkl_include-2023.0.0-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Collecting mkl
  Downloading mkl-2023.0.0-py2.py3-none-manylinux1_x86_64.whl (254.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.4/254.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting coloredlogs
  Downloading coloredlogs-15.0.1-p

In [45]:
from sentence_transformers import SentenceTransformer
import time

# Single benchmark sentence shared by the vanilla and the optimized runs.
sentence = 'Royal Bank of Canada is a Canadian multinational financial services company and the largest bank in Canada by market capitalization. The bank serves over 17 million clients and has more than 89,000 employees worldwide'

# Baseline: the stock SentenceTransformer checkpoint.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# time.perf_counter() is monotonic and high-resolution, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
# Readings are scaled to milliseconds.
# NOTE: this times one single call, so any first-call overhead is included.
v_st = time.perf_counter() * 1000
vanilla_emb = model.encode(sentence)
v_et = time.perf_counter() * 1000
print('time taken for vanilla : {}'.format(v_et - v_st))


time taken for vanilla : 107.754638671875


In [2]:
from transformers import Pipeline
import torch.nn.functional as F
import torch

# copied from the model card
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Mask tensor; it is expanded along a new trailing axis
            to the size of the token embeddings and cast to float.

    Returns:
        Sum of the masked token embeddings over dimension 1, divided by the
        mask sum over dimension 1 (clamped to at least 1e-9 so an all-zero
        mask does not divide by zero).
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total


class SentenceEmbeddingPipeline(Pipeline):
    """Pipeline producing a mean-pooled, L2-normalized sentence embedding from text."""

    def _sanitize_parameters(self, **kwargs):
        """Return three empty parameter dicts; there are no hyperparameters to sanitize.

        Any ``kwargs`` passed in are ignored.
        """
        return {}, {}, {}

    def preprocess(self, inputs):
        """Tokenize ``inputs`` with padding and truncation, returning PyTorch tensors."""
        return self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')

    def _forward(self, model_inputs):
        """Run the model and pass the attention mask along, since pooling needs it."""
        return {
            "outputs": self.model(**model_inputs),
            "attention_mask": model_inputs["attention_mask"],
        }

    def postprocess(self, model_outputs):
        """Mean-pool the model outputs, then L2-normalize along dimension 1."""
        pooled = mean_pooling(model_outputs["outputs"], model_outputs["attention_mask"])
        return F.normalize(pooled, p=2, dim=1)


In [3]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from pathlib import Path

# Tokenizer of the checkpoint being benchmarked.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L12-v2")

# Relative directory the ONNX model is later loaded from.
# NOTE(review): no visible cell creates the files in this directory - confirm
# the export/optimization step is run elsewhere before this notebook.
onnx_path = Path("onnx")




In [46]:
# Load the optimized, quantized ONNX model from onnx_path.
# NOTE: this rebinds `model`, which the vanilla cell assigned to the SentenceTransformer.
model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_optimized_quantized.onnx")

# Wrap the ONNX model and tokenizer in the custom embedding pipeline.
optimized_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

# time.perf_counter() is monotonic and high-resolution, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
# Readings are scaled to milliseconds; only the pipeline call itself is timed.
o_st = time.perf_counter() * 1000
optimized_emb_pred = optimized_emb(sentence)
o_et = time.perf_counter() * 1000

print('time taken for optimized quantized model: {}'.format(o_et - o_st))

time taken for optimized quantized model: 28.317138671875


In [47]:
# Display the embedding returned by the optimized pipeline for `sentence`.
optimized_emb_pred

tensor([[ 4.4777e-02,  1.1825e-03,  6.0481e-03, -1.9312e-02,  1.5138e-02,
          2.6437e-02,  5.5512e-02,  6.1203e-02,  3.9094e-02,  3.6505e-02,
         -1.1903e-01, -4.3564e-03, -2.8887e-02, -5.8724e-02, -5.9878e-02,
         -7.5853e-02,  2.1850e-02, -1.5865e-02, -8.2803e-03,  8.5816e-02,
         -4.1636e-02, -3.0772e-02, -4.2817e-03, -2.1819e-02,  3.4203e-02,
         -3.7289e-02, -6.9369e-02,  2.1012e-02,  1.1867e-03, -2.0010e-02,
          2.0115e-02,  2.4797e-02,  1.5249e-02,  6.2365e-02, -1.3636e-02,
          9.1422e-02, -7.4480e-02, -1.1528e-02,  1.8219e-03,  2.6597e-02,
         -1.6901e-02,  5.8106e-03, -7.0798e-03, -3.7733e-02,  6.6445e-02,
          2.9531e-02, -1.1449e-01,  3.1857e-02, -6.7084e-02,  7.9999e-02,
          5.5763e-02, -7.1333e-02,  4.5483e-02, -8.2652e-02,  2.4699e-02,
         -1.7799e-02, -4.5248e-02, -5.9049e-02,  6.9176e-02,  1.4136e-02,
          1.1544e-01,  4.0786e-02,  1.0315e-02,  3.1071e-02,  8.1621e-02,
          3.9502e-02,  5.0360e-02,  8.

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

In [49]:
# Cosine similarity between the vanilla embedding (wrapped in a list to form a
# single-row input) and the optimized pipeline's embedding.
cosine_similarity(X=[vanilla_emb], Y=optimized_emb_pred)

array([[0.97197986]], dtype=float32)