In [1]:
# Use the %pip magic so the install targets the running kernel's environment, and pin
# the version that the recorded run resolved so a fresh kernel reproduces the same stack.
%pip install -U sentence-transformers==2.2.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3

In [2]:
# %pip (not !pip) installs into the kernel's own environment. Versions are pinned to the
# ones pip resolved in the recorded run so that Restart & Run All rebuilds the same stack.
%pip install --upgrade "optimum[onnxruntime]==1.5.0" transformers==4.26.1 evaluate==0.4.0 mkl-include==2023.0.0 mkl==2023.0.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optimum[onnxruntime]==1.5.0
  Downloading optimum-1.5.0-py3-none-any.whl (187 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mkl-include
  Downloading mkl_include-2023.0.0-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting mkl
  Downloading mkl-2023.0.0-py2.py3-none-manylinux1_x86_64.whl (254.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.4/254.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting coloredlogs
  Downloading coloredlogs-15

In [14]:
from sentence_transformers import SentenceTransformer
import time

# Benchmark input; the optimized-pipeline cell further down encodes this same sentence.
sentence = 'Royal Bank of Canada is a Canadian multinational financial services company and the largest bank in Canada by market capitalization. The bank serves over 17 million clients and has more than 89,000 employees worldwide'

# Baseline: the stock SentenceTransformer checkpoint.
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() is wall-clock and can jump if the system clock is adjusted. Values are
# scaled to milliseconds, so only the difference v_et - v_st is meaningful.
# NOTE(review): this times one single call, so any first-call overhead is included.
v_st = time.perf_counter() * 1000
vanilla_emb = model.encode(sentence)
v_et = time.perf_counter() * 1000
print('time taken for vanilla : {}'.format(v_et - v_st))


time taken for vanilla : 132.3349609375


In [12]:
from transformers import Pipeline
import torch.nn.functional as F
import torch

# copied from the model card
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dim 1, weighting each token by its mask value.

    Args:
        model_output: Indexable model result whose element ``[0]`` holds the token
            embeddings (assumed layout: batch x tokens x hidden -- confirm with the model).
        attention_mask: Tensor with one fewer trailing dimension than the token
            embeddings; it is broadcast over the last dimension before use.

    Returns:
        Tensor of mask-weighted sums over dim 1 divided by the per-feature mask total.
        The divisor is clamped to at least 1e-9, so a fully masked row yields zeros
        rather than a division by zero.
    """
    hidden_states = model_output[0]
    # Stretch the mask across the feature axis so it lines up element-wise.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total


class SentenceEmbeddingPipeline(Pipeline):
    """Pipeline that maps text to a mean-pooled, L2-normalized sentence embedding.

    Stages: tokenize -> run the model -> mean-pool the token embeddings using the
    attention mask -> normalize along dim 1.
    """

    def _sanitize_parameters(self, **kwargs):
        """Return three empty kwarg dicts; this pipeline has no hyperparameters.

        Any keyword arguments passed in are ignored.
        """
        return {}, {}, {}

    def preprocess(self, inputs):
        """Tokenize ``inputs`` into PyTorch tensors with padding and truncation enabled."""
        return self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')

    def _forward(self, model_inputs):
        """Run the model and pass the attention mask along for the pooling step."""
        raw_outputs = self.model(**model_inputs)
        mask = model_inputs["attention_mask"]
        return {"outputs": raw_outputs, "attention_mask": mask}

    def postprocess(self, model_outputs):
        """Pool the token embeddings, then scale each row to unit L2 norm."""
        pooled = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        return F.normalize(pooled, p=2, dim=1)


In [5]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from pathlib import Path

# Relative directory that the optimized ONNX model is loaded from in a later cell.
onnx_path = Path("onnx")

# Tokenizer for the same checkpoint id that the SentenceTransformer baseline cell loads.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-distilroberta-v1')




Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [15]:
# load optimized model
# NOTE(review): none of the cells shown here writes model_optimized_quantized.onnx, so it
# must already exist under onnx_path before this cell runs -- confirm for a fresh kernel.
# This also rebinds `model`, which the baseline cell used for the SentenceTransformer.
model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_optimized_quantized.onnx")

# create optimized pipeline
optimized_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() is wall-clock and can jump if the system clock is adjusted. Values are
# scaled to milliseconds, so only the difference o_et - o_st is meaningful.
# NOTE(review): this times one single call, so any first-call overhead is included.
o_st = time.perf_counter() * 1000
optimized_emb_pred = optimized_emb(sentence)
o_et = time.perf_counter() * 1000

print('time taken for optimized quantized model: {}'.format(o_et - o_st))

time taken for optimized quantized model: 52.700439453125


In [16]:
# Display the embedding the optimized pipeline returned for `sentence`.
optimized_emb_pred

tensor([[-3.7310e-02,  5.5734e-02,  9.1432e-03,  2.4154e-02, -2.0351e-02,
         -3.1886e-02,  3.0823e-03,  5.3715e-02, -2.6739e-02, -1.1252e-02,
         -2.5986e-02,  9.7549e-02, -1.7449e-02, -4.3857e-02, -7.5206e-02,
          2.2248e-02, -5.5003e-02,  1.7782e-02,  2.8467e-02, -2.8150e-02,
          1.8533e-02, -1.6071e-02, -1.8746e-02, -7.2122e-03, -2.7609e-02,
          6.9813e-03,  3.3660e-02,  1.0700e-02, -5.6438e-02,  7.8988e-02,
          1.2906e-02, -5.4309e-03,  1.4703e-02, -6.7639e-02,  3.8255e-02,
         -5.6964e-02, -5.1040e-03,  1.2940e-02,  5.4253e-02,  3.3922e-03,
          2.0413e-02, -7.8722e-03,  2.0498e-02, -1.5117e-02,  2.3007e-02,
          2.7469e-02, -1.7764e-02,  6.6413e-03, -6.3284e-03, -3.6325e-02,
         -2.9307e-02, -8.9882e-03,  9.2503e-03, -1.0362e-03,  3.9264e-02,
          6.7627e-04, -6.5784e-02,  6.5745e-03, -2.2015e-02,  9.6801e-03,
          4.4603e-02, -9.3227e-03, -2.0392e-02,  6.3533e-02, -1.3131e-02,
          1.7636e-02,  4.7145e-02, -3.

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Cosine similarity between the baseline and the optimized embedding of the same sentence.
# vanilla_emb is wrapped in a list, presumably because it is 1-D while cosine_similarity
# expects 2-D inputs -- confirm against the shape returned by model.encode.
cosine_similarity([vanilla_emb],optimized_emb_pred)

array([[0.98724216]], dtype=float32)