In [None]:
!pip install -q transformers datasets evaluate onnx onnxruntime

In [None]:
!python -m pip install -q huggingface_hub

In [None]:
from huggingface_hub import notebook_login, create_repo, HfApi
import time
# Interactive Hugging Face Hub login; the export loop loads the checkpoints
# with use_auth_token=True, so a stored token is needed before running it.
notebook_login()

In [None]:
# Hugging Face Hub repository IDs ("<org>/<name>") of the fine-tuned embedding
# checkpoints. The export loop passes each ID to `from_pretrained` and uses the
# part after the "/" as the ONNX file name.
models = ["InterIIT/Adult_contemporary_music-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Paper-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Catalan_language-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Web_browser-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Pub-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Materialism-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Warsaw_Pact-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Heresy-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Human_Development_Index-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Cardinal_Catholicism-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Canadian_Armed_Forces-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Wayback_Machine-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/2008_Sichuan_earthquake-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/IPod-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/global-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/2-no-train-gen-squad-validation-tuned-all-mpnet-base-v2",
"InterIIT/1-no-train-gen-squad-validation-tuned-all-mpnet-base-v2",
"InterIIT/0-no-train-gen-squad-validation-tuned-all-mpnet-base-v2",
"InterIIT/Unk_DevRev-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Federal_Bureau_of_Investigation-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Biodiversity-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Great_Plains-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Hard_rock-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/United_States_dollar-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Grape-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Immunology-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/The_Times-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Southampton-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Dialect-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Nanjing-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Mary_mother_of_Jesus-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Everton_FC-no-train-gen-tuned-all-mpnet-base-v2",
"InterIIT/Imamah_Shia_doctrine-no-train-gen-tuned-all-mpnet-base-v2"]

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; `drive_path`
# points below this mount point, so this must run before the export loop.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import onnxruntime as rt
from typing import Mapping, OrderedDict
from pathlib import Path
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
import torch.nn.functional as F
import gc
import numpy as np
import os
from onnxruntime.transformers import optimizer
from tqdm import tqdm

In [None]:
# Directory (under the /content/drive mount point) where the export loop writes
# the "<model_name>.onnx" and "<model_name>_optimized.onnx" files.
drive_path = "/content/drive/MyDrive/Inter_IIT/Models/embedding_models_onnx"

# ONNX Exporting

In [None]:
# Export each selected checkpoint to ONNX, run ONNX Runtime's offline graph
# optimizer on the result, and keep only the optimized file in `drive_path`.

# Make sure the output directory exists before anything is written to it.
os.makedirs(drive_path, exist_ok=True)

# NOTE(review): models[30:] selects exactly the entries the previous
# models[14:][16:] did; presumably a leftover from resuming an interrupted
# run -- confirm, and use `models` to export every checkpoint.
for model_id in tqdm(models[30:]):
  model_name = model_id.split("/")[-1]
  onnx_path = f"{drive_path}/{model_name}.onnx"
  optimized_path = f"{drive_path}/{model_name}_optimized.onnx"

  # The tokenizer is not used by the export itself; it is left in scope because
  # the sample-inference cells further down call `tokenizer(...)`.
  tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
  model = AutoModel.from_pretrained(model_id, use_auth_token=True)

  # Dummy int32 (input_ids, attention_mask) pair of shape (1, 384), used only
  # to trace the graph; the inference cells feed int32 arrays to match.
  dummy_inputs = (
      torch.randint(0, 1000, (1, 384), dtype=torch.int),
      torch.ones(1, 384, dtype=torch.int),
  )
  with torch.no_grad():
      torch.onnx.export(
          model,
          dummy_inputs,
          onnx_path,
          opset_version=12,
          input_names=['input_ids', 'attention_mask'],
          output_names=['last_hidden_state'],
          # The output's sequence axis follows the inputs' sequence axis, so it
          # is declared dynamic as well (not only the batch axis).
          dynamic_axes={
              'input_ids': {0: 'batch', 1: 'sequence'},
              'attention_mask': {0: 'batch', 1: 'sequence'},
              'last_hidden_state': {0: 'batch', 1: 'sequence'},
          },
          do_constant_folding=True,
          export_params=True
      )

  optimized_model = optimizer.optimize_model(onnx_path, opt_level=99, use_gpu=False, only_onnxruntime=True)
  optimized_model.save_model_to_file(optimized_path)
  # Only the optimized graph is kept on Drive.
  os.remove(onnx_path)

  # Release the PyTorch model and the optimizer's copy before the next checkpoint.
  del model, optimized_model
  gc.collect()

# Sample ONNX Inference of Embedding Model

In [None]:
# Build a CPU-only ONNX Runtime session with every graph optimization enabled.
sess_options = rt.SessionOptions()
sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
# No provider_options are passed: 'device_type' (e.g. "GPU_FP16") is an OpenVINO
# execution-provider setting and has no meaning for CPUExecutionProvider.
# NOTE(review): this relative path does not match the files written by the
# export loop (f"{drive_path}/{model_name}_optimized.onnx") -- confirm which
# model file is meant to be loaded here.
ort_session = rt.InferenceSession("sbert_model_optimized.onnx",
                                  sess_options, providers=['CPUExecutionProvider'])

In [None]:
def normalization(x, eps=1e-12):
    """L2-normalize each row of a 2-D array.

    Args:
        x: Array whose rows (axis 1) are the vectors to normalize.
        eps: Lower bound applied to each row norm so that an all-zero row
            yields zeros instead of NaN from a 0/0 division.

    Returns:
        Array of the same shape as ``x`` with every non-zero row scaled to
        unit Euclidean length.
    """
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / np.maximum(norms, eps)
    
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, skipping masked tokens.

    Args:
        model_output: Sequence whose first element holds the token embeddings,
            indexed as (batch, sequence, hidden).
        attention_mask: Array indexed as (batch, sequence); positions with 0
            are excluded from the average.

    Returns:
        Array indexed as (batch, hidden) with the masked mean of each sequence.
    """
    token_embeddings = model_output[0]
    # A trailing singleton axis lets the mask broadcast across the hidden
    # dimension without materializing a repeated copy of it.
    mask = np.expand_dims(attention_mask, -1).astype(np.float32)
    summed = np.sum(token_embeddings * mask, axis=1)
    # Clamp the token count so a fully masked row does not divide by zero.
    counts = np.maximum(mask.sum(axis=1), 1e-9)
    return summed / counts

In [None]:
sentences = [] # sample examples

In [None]:
# Measure per-sentence ONNX Runtime inference latency on up to the first 100
# sample sentences and print the mean in seconds.
if not sentences:
  raise ValueError("`sentences` is empty; add sample texts before benchmarking.")

latency = []
for sentence in sentences[:100]:
  encoded_input = tokenizer(sentence, padding=True, truncation=True, max_length=384, return_tensors='np')
  # The graph was exported with int32 inputs, so cast before the timed call
  # to keep the conversion cost out of the measurement.
  ort_inputs = {
      'input_ids': np.array(encoded_input['input_ids'], dtype=np.int32),
      'attention_mask': np.array(encoded_input['attention_mask'], dtype=np.int32),
  }
  # perf_counter is a monotonic, high-resolution clock meant for timing intervals.
  st = time.perf_counter()
  outputs = ort_session.run(None, ort_inputs)
  latency.append(time.perf_counter() - st)
print(np.mean(latency))

In [None]:
# Pool the last benchmark output into one vector per sentence, scale each
# vector to unit length, and show the result.
sentence_embeddings = normalization(
    mean_pooling(outputs, encoded_input['attention_mask'])
)
print("Sentence embeddings:", sentence_embeddings, sep="\n")