In [1]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from converter.converter import sentence_transformers_onnx

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the SentenceTransformer stored under ./results/domain_adaptation_model
model = SentenceTransformer.load("./results/domain_adaptation_model")

In [3]:
# Convert the loaded model to ONNX on the CPU, then display the returned wrapper.
# NOTE(review): presumably the converter writes `<output_path>.onnx` (a later cell
# loads `.../1/model.onnx`) -- confirm against converter.converter.
cpu_device = torch.device("cpu")
onnx_model = sentence_transformers_onnx(
    model,
    output_path="triton/model_repository/domain_adapter/1/model",
    config_path="results/domain_adaptation_model",
    device=cpu_device,
)
onnx_model

  "token_embeddings": torch.Tensor(hidden_state[0]),
  "attention_mask": torch.Tensor(attention_mask),
  if sentence_embedding.shape[0] == 1:


SentenceTransformerModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, e

In [4]:
def _onnx_embed(tokens):
    """Run one tokenized batch through ``onnx_model`` and return the result as a NumPy array.

    ``tokens`` is the mapping returned by ``model.tokenize``; its ``input_ids``,
    ``attention_mask`` and ``token_type_ids`` entries are passed positionally in
    that order.
    """
    output = onnx_model(tokens["input_ids"], tokens["attention_mask"], tokens["token_type_ids"])
    return output.detach().numpy()


# Embed two texts with the converted model (no gradient tracking) and take their dot product
with torch.no_grad():
    tokens_1 = model.tokenize(["Composable Lightweight Processors"])
    embedding_1 = _onnx_embed(tokens_1)

    tokens_2 = model.tokenize(["ocean"])
    embedding_2 = _onnx_embed(tokens_2)

# NOTE(review): the original note states this matches the PyTorch-based model, i.e. the
# conversion seems accurate; the PyTorch reference value is not computed in this cell.
np.dot(embedding_1, embedding_2)

-0.13528061

In [5]:
import numpy as np
from torchvision import transforms
from PIL import Image
import tritonclient.http as httpclient
from tritonclient.utils import triton_to_np_dtype

# Tokenize a single sentence and convert each tokenizer output to a NumPy array
sentence = ["Composable Lightweight Processors"]
inputs = model.tokenize(sentence)

input_ids, token_type_ids, attention_mask = (
    inputs[key].numpy() for key in ("input_ids", "token_type_ids", "attention_mask")
)
input_ids

array([[  101,  4012,  6873, 19150, 12038, 18017,   102]])

In [6]:
import onnx

# Load the exported ONNX file under its own name. Rebinding `model` here would
# replace the SentenceTransformer that earlier cells call `model.tokenize(...)` on,
# so re-running those cells after this one would fail.
onnx_proto = onnx.load(r"triton/model_repository/domain_adapter/1/model.onnx")

graph = onnx_proto.graph
# Print each declared graph input (the loop variable is the whole entry, not just its name)
for graph_input in graph.input:
    print(graph_input)
# Print each declared graph output; its `name` is what the inference request must ask for
for graph_output in graph.output:
    print(graph_output)

name: "input_ids"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "batch_size"
      }
      dim {
        dim_param: "max_seq_len"
      }
    }
  }
}

name: "attention_mask"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "batch_size"
      }
      dim {
        dim_param: "max_seq_len"
      }
    }
  }
}

name: "token_type_ids"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "batch_size"
      }
      dim {
        dim_param: "max_seq_len"
      }
    }
  }
}

name: "1770"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "Gather1770_dim_0"
      }
    }
  }
}



In [7]:
# Connect to the Triton HTTP endpoint
client = httpclient.InferenceServerClient(url="localhost:8000")


def _int64_infer_input(name, array):
    """Build an ``InferInput`` called ``name`` with ``array``'s shape, filled with ``array`` cast to int64."""
    triton_input = httpclient.InferInput(name, array.shape, datatype="INT64")
    triton_input.set_data_from_numpy(array.astype(np.int64))
    return triton_input


# One INT64 input per tokenizer array
input_ids_triton = _int64_infer_input("input_ids", input_ids)
token_type_ids_triton = _int64_infer_input("token_type_ids", token_type_ids)
attention_mask_triton = _int64_infer_input("attention_mask", attention_mask)

# Request the output named "1770"
output = httpclient.InferRequestedOutput("1770")

# Send the request to the "domain_adapter" model and display the result handle
results = client.infer(
    model_name="domain_adapter",
    inputs=[input_ids_triton, token_type_ids_triton, attention_mask_triton],
    outputs=[output],
)
results

<tritonclient.http._infer_result.InferResult at 0x72add6f85420>

In [8]:
# Extract the "1770" output from the inference result as a NumPy array and display it
inference_output = results.as_numpy("1770")
inference_output

array([ 1.52750462e-02,  2.90051084e-02,  8.03524330e-02,  2.03186739e-02,
       -8.10070485e-02, -4.72998135e-02, -1.10489026e-01, -4.12289761e-02,
       -1.34759089e-02, -4.28274386e-02,  9.59829148e-03, -4.44858335e-02,
       -1.08630229e-02, -2.58579087e-02,  6.40803948e-02, -1.47565454e-01,
        4.42228206e-02,  4.39664647e-02,  6.85295537e-02,  5.24679013e-02,
       -3.27280760e-02, -8.71130005e-02, -3.72967981e-02,  3.93275321e-02,
        3.54713574e-02,  3.22213285e-02,  2.48689000e-02, -4.15771417e-02,
        1.17586717e-01,  2.10608263e-02, -1.59455277e-02, -3.34884822e-02,
       -4.31342572e-02, -1.00450004e-02,  5.43379895e-02,  2.15828903e-02,
        3.05772736e-03, -2.57921182e-02, -4.77049947e-02, -1.08501181e-01,
        1.15717724e-02,  3.36790979e-02,  2.48274282e-02,  6.98414892e-02,
        5.24340160e-02,  6.22562170e-02,  4.54524904e-02,  2.85202339e-02,
       -4.66927923e-02, -3.11973747e-02,  1.38132018e-03,  9.28488374e-02,
        5.40151000e-02, -