## Load original data and test performance

In [1]:
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import gzip
import csv
import random
import numpy as np
import torch

# Sentence-embedding model whose output dimensionality will be reduced
model = SentenceTransformer("BAAI/bge-large-en-v1.5")

# Target size of the embeddings after the reduction
new_dimension = 128

# AllNLI supplies the sentences on which the PCA is computed
nli_dataset_path = "datasets/AllNLI.tsv.gz"

# The STS benchmark shows how much performance we lose through the dimensionality reduction
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"

# Fetch whichever dataset archive is not available locally yet
for dataset_path, dataset_url in (
    (nli_dataset_path, "https://sbert.net/datasets/AllNLI.tsv.gz"),
    (sts_dataset_path, "https://sbert.net/datasets/stsbenchmark.tsv.gz"),
):
    if not os.path.exists(dataset_path):
        util.http_get(dataset_url, dataset_path)

# Collect the "test" split of STSbenchmark as sentence pairs with a gold score
print("Read STSbenchmark test dataset")
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
    eval_examples = [
        InputExample(
            texts=[row["sentence1"], row["sentence2"]],
            label=float(row["score"]) / 5.0,  # normalize score to range 0 ... 1
        )
        for row in csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
        if row["split"] == "test"
    ]

# Baseline: score the unmodified model on the STS benchmark test pairs
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples, name="sts-benchmark-test")

print("Original model performance:")
stsb_evaluator(model)

Read STSbenchmark test dataset
Original model performance:


0.8751906370474595

In [2]:
# Embedding width of the unmodified model (1024 in the recorded output); the same call
# later supplies `in_features` for the PCA projection layer.
model.get_sentence_embedding_dimension()

1024

## Optimized and Quantized model performance

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
from optimum.onnxruntime import (
    ORTModelForFeatureExtraction
)

# Tokenizer and ONNX Runtime model are both loaded from the same checkpoint identifier
quantized_checkpoint = 'BAAI/bge-large-en-v1.5_optimized_quantized'

tokenizer = AutoTokenizer.from_pretrained(quantized_checkpoint, model_max_length=512)
optimized_quantized_model = ORTModelForFeatureExtraction.from_pretrained(quantized_checkpoint)

You are using a model of type bert to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
The ONNX file model_optimized_quantized.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [4]:
# Score the optimized + quantized ONNX model on the same STS benchmark evaluator.
# NOTE(review): `tokenizer=` and `optimized_model=` do not look like parameters of the
# stock sentence-transformers evaluator — presumably a locally patched build; confirm.
print("Optimized Quantized model performance:")
stsb_evaluator(
    optimized_quantized_model,
    tokenizer=tokenizer,
    optimized_model=True,
)

Optimized Quantized model performance:


0.8779827351465398

In [11]:
# Scores copied by hand from the recorded outputs of the two evaluations above
original_score = 0.8751906370474595
quantized_score = 0.8779827351465398

# Difference scaled by 100; a negative value means the quantized model scored higher
score_drop = (original_score - quantized_score) * 100
print('Loss in accuracy: {}'.format(score_drop))

Loss in accuracy: -0.279209809908032


## Dimensionality Reduction on original model

In [5]:
######## Reduce the embedding dimensions ########

# Fixed seed so the PCA training sample and the PCA solver give the same result on every run
pca_seed = 42

# Make the cell safe to re-run: if a previous run already attached the projection layer,
# remove it first so the training embeddings and `in_features` below come from the base
# model again instead of from the already-reduced one.
if hasattr(model, "dense"):
    del model.dense

# Read sentences from NLI dataset (a set removes duplicates)
nli_sentences = set()
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        nli_sentences.add(row["sentence1"])
        nli_sentences.add(row["sentence2"])

# Iteration order of a set of strings can differ between interpreter runs, so sort first
# and then shuffle with a dedicated seeded generator; this leaves the global `random`
# state untouched and makes the selected sample reproducible.
nli_sentences = sorted(nli_sentences)
random.Random(pca_seed).shuffle(nli_sentences)

# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for 20k random sentences from the AllNLI dataset
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute PCA on the train embeddings matrix.
# random_state pins the solver in case a randomized SVD is selected for this matrix size.
pca = PCA(n_components=new_dimension, random_state=pca_seed)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model, so that it will produce directly embeddings with the new size.
# The layer has no bias and an identity activation, i.e. it is a plain linear projection
# whose weight matrix is the PCA component matrix.
dense = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
# Force float32 so the weight dtype does not depend on what dtype PCA returned
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp, dtype=torch.float32))
model.add_module("dense", dense)

# Evaluate the model with the reduced embedding size
print("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)


Model with 128 dimensions:


0.8518175600685338

In [7]:
# Scores copied by hand from recorded outputs earlier in the notebook.
# NOTE(review): the baseline used here (0.8779...) is the optimized/quantized model's
# score, while the reduced model was built from the original model (0.8751...) —
# confirm which comparison is intended.
baseline_score = 0.8779827351465398
reduced_score = 0.8518175600685338

# Difference scaled by 100
score_drop = (baseline_score - reduced_score) * 100
print('Loss in Accuracy:{}'.format(score_drop))

Loss in Accuracy:2.6165175078005976


In [9]:
# Inspect the class of the original model (a SentenceTransformer per the recorded output)
type(model)

sentence_transformers.SentenceTransformer.SentenceTransformer

In [10]:
# Inspect the class of the quantized model (an ORTModelForFeatureExtraction per the
# recorded output), i.e. a different type than `model`
type(optimized_quantized_model)

optimum.onnxruntime.modeling_ort.ORTModelForFeatureExtraction

## Save and load the model to test for reproducibility

In [None]:
# Write the model, including the added "dense" projection layer, to a local directory
model.save('bge-large-en-v1.5_128')

In [None]:
# Load the directory written by the save cell into a separate model object
model_128 = SentenceTransformer('bge-large-en-v1.5_128')

In [None]:
# Encode one sample sentence with the reloaded model
emb_128 = model_128.encode('encode this text')

In [None]:
# Display the embedding produced for the sample sentence
emb_128

In [None]:
# Length of the sample embedding; presumably meant to confirm it equals `new_dimension`
# (128) after the save/load round trip — this cell has no recorded output, so unverified
len(emb_128)

## Dimensionality Reduction on optimized quantized model

In [8]:
# NOTE(review): this call fails. The recorded output is
#   AttributeError: 'ORTModelForFeatureExtraction' object has no attribute 'add_module'
# `add_module` worked on the SentenceTransformer `model`, but the ONNX Runtime model
# object does not provide it. The PCA projection would presumably have to be applied to
# this model's output embeddings in a separate step — TODO decide on an approach.
optimized_quantized_model.add_module("dense", dense)

AttributeError: 'ORTModelForFeatureExtraction' object has no attribute 'add_module'

In [None]:
# NOTE(review): this cell has no recorded output. Because the preceding `add_module`
# call raised, no projection layer was attached, so this would score the un-reduced
# quantized model despite the printed label. The call also omits the `tokenizer=` and
# `optimized_model=True` arguments used in the earlier quantized evaluation — confirm
# whether they are required here.
print("Optimized Quantized Model with {} dimensions:".format(new_dimension))
stsb_evaluator(optimized_quantized_model)