In [None]:
import shutil
from pathlib import Path

from optimum.onnxruntime import AutoOptimizationConfig, ORTModelForFeatureExtraction, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.pipelines import pipeline
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

In [None]:
# Reference model: the unmodified sentence-transformers checkpoint. It is used
# further down (`hf_model.encode`) as the baseline for the speed comparison.
hf_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Export settings.
# save_dir: local directory that receives the optimized ONNX model and tokenizer.
# model_id: Hugging Face Hub checkpoint the ONNX export is derived from.
save_dir = "fast-all-MiniLM-L6-v2"
model_id = "sentence-transformers/all-MiniLM-L6-v2"

In [None]:
# Load the tokenizer and export the Hub checkpoint to the ONNX format.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)

# Start from an empty output directory so artifacts from a previous run never
# leak into the new export. Sub-directories are removed with shutil.rmtree
# because Path.unlink() only works on files and symlinks.
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
for entry in save_dir.iterdir():
    if entry.is_dir() and not entry.is_symlink():
        shutil.rmtree(entry)
    else:
        entry.unlink()

# Apply the O3 optimization preset and write the optimized model into save_dir.
optimization_config = AutoOptimizationConfig.O3()
optimizer = ORTOptimizer.from_pretrained(model)
optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config, use_external_data_format=True)

# Save the tokenizer next to the model *before* reloading from save_dir, so the
# directory is complete at load time.
# NOTE(review): presumably Optimum picks up preprocessors found in the directory
# when loading a local model -- confirm against the installed version.
tokenizer.save_pretrained(save_dir)

# Reload the optimized model from disk; optimize() has already written it to
# save_dir, so no separate model.save_pretrained(save_dir) call is needed.
model = ORTModelForFeatureExtraction.from_pretrained(save_dir)

# Optional: publish the optimized model to the Hub.
# model.push_to_hub("new_path_for_directory", repository_id="my-onnx-repo", use_auth_token=True)

In [None]:
# Benchmark input: one short sentence repeated 50 times to form a long text.
# Alternative (ONNX pipeline built straight from the Hub id, without the local optimization step):
# onnx_embed = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2", accelerator="ort")
question = "".join(["What's my name?"] * 50)

In [None]:
# Feature-extraction pipeline backed by the optimized ONNX model.
# The tokenizer is passed explicitly: `model` is an already-instantiated object,
# so relying on the pipeline to infer a matching tokenizer is fragile.
onnx_quant_embed = pipeline("feature-extraction", model=model, tokenizer=tokenizer, accelerator="ort")

# Ours

In [None]:
# %%timeit
# ^ Uncomment the cell magic above (it has to stay on the first line of the
#   cell) to benchmark the optimized ONNX pipeline on the benchmark text.
pred = onnx_quant_embed(question)

In [None]:
# Throughput of the ONNX pipeline in characters per second.
# NOTE(review): 16.5 is presumably the per-call latency in milliseconds taken
# from a %%timeit run of the cell above -- confirm and update after re-timing.
onnx_latency_ms = 16.5
chars = len(question)
# `**` is exponentiation. The previous `10^3` was a bitwise XOR, which made the
# expression evaluate `(chars * 10) ^ 3` and under-report the speed.
print(f"Speed: {(chars * 10**3) / onnx_latency_ms} char/sec")

# Original

In [None]:
# %%timeit
# ^ Uncomment the cell magic above (it has to stay on the first line of the
#   cell) to benchmark the reference SentenceTransformer model on the same text.
embeddings = hf_model.encode(question)

In [None]:
# Throughput of the reference SentenceTransformer model in characters per second.
# NOTE(review): 26.3 is presumably the per-call latency in milliseconds taken
# from a %%timeit run of the cell above -- confirm and update after re-timing.
baseline_latency_ms = 26.3
chars = len(question)
# `**` is exponentiation. The previous `10^3` was a bitwise XOR, which made the
# expression evaluate `(chars * 10) ^ 3` and under-report the speed.
print(f"Speed: {(chars * 10**3) / baseline_latency_ms} char/sec")

In [None]:
import os
import tarfile


def compress(directory_path):
    """Pack a directory into a gzip-compressed tarball in the current working directory.

    Args:
        directory_path: Directory to archive, given as a ``str`` or a
            ``pathlib.Path``.

    Returns:
        The archive's file name, ``"<directory name>.tar.gz"``. The archive is
        written relative to the current working directory, and its single
        top-level entry is named after the directory.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist. The check runs
            before the archive is opened so no empty tarball is left behind.
    """
    # Normalizing to Path lets callers pass plain strings (with or without a
    # trailing slash) as well as Path objects.
    directory = Path(directory_path)
    if not directory.exists():
        raise FileNotFoundError(f"No such file or directory: {directory}")

    output_filename = directory.name + ".tar.gz"
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(directory, arcname=directory.name)
    return output_filename


# Archive the exported model directory; the returned file name is what gets uploaded.
compressed_file_name = compress(directory_path=save_dir)

In [None]:
from google.cloud import storage


def upload(bucket_name, source_file_path, project="main"):
    """Upload a local file to a Google Cloud Storage bucket.

    The object is stored under the file's base name, i.e. any directory
    components of ``source_file_path`` are dropped.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_path: Path of the local file to upload.
        project: Google Cloud project the storage client is created for.
            Defaults to ``"main"``, the value that used to be hard-coded.
    """
    storage_client = storage.Client(project=project)
    blob_name = os.path.basename(source_file_path)
    blob = storage_client.bucket(bucket_name).blob(blob_name)

    blob.upload_from_filename(source_file_path)

    print(f"File {source_file_path} uploaded to {bucket_name}.")


# Push the compressed model archive to the `qdrant-fastembed` bucket.
upload(bucket_name="qdrant-fastembed", source_file_path=compressed_file_name)

In [None]:
# Remove the exported model directory and the compressed archive.
# Done in pure Python rather than `!rm` so the cell works on any platform and
# with paths containing spaces, and so it deletes exactly the archive that
# compress() reported instead of a name rebuilt from save_dir.
export_dir = Path(save_dir)
if export_dir.exists():
    shutil.rmtree(export_dir)
    print(f"removed directory {export_dir}")

archive_path = Path(compressed_file_name)
if archive_path.exists():
    archive_path.unlink()
    print(f"removed {archive_path}")