# Deploying Models in Triton

The goal of the notebook is to deploy models to Triton and show how to perform high-performance inference for production purposes.

We will be doing following steps as part of the notebook:

- Convert GNN pytorch model to ONNX format
- Deploying models to Triton Inference Server and starting the server
- Submitting the inference requests



## Import Modules

In [1]:
# Import packages
import pandas as pd
import cudf
import cupy as cp

import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
import os
import xgboost as xgb


In [2]:
#Definition of the trained Graph
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, n_hops, dropout_prob=0.25):
        super(GraphSAGE, self).__init__()

        # list of conv layers
        self.convs = nn.ModuleList()
        # add first conv layer to the list
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        # add the remaining conv layers to the list
        for _ in range(n_hops - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        
        # output layer
        self.fc = nn.Linear(hidden_channels, out_channels)        

    def forward(self, x, edge_index, return_hidden=False):
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=0.5, training=self.training)
            
        if return_hidden:
            return x
        else:
            return self.fc(x)

# Convert GNN PyTorch model to ONNX

We will be converting the PyTorch model that was created in the training notebook to an ONNX model for deploying to the Triton Inference Server.

In [3]:
dataset_base_path = '../data/TabFormer'
model_root_dir = os.path.join(dataset_base_path, 'models')
gnn_model_path = os.path.join(model_root_dir, 'node_embedder.pth')

In [4]:
#Load the pre-trained GraphSAGE model for generating node embeddings
gnn_model = GraphSAGE(in_channels=74, hidden_channels=32, out_channels=2, n_hops=2)  # Adjust based on your setup
gnn_model=torch.load(gnn_model_path, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# gnn_model = torch.load(gnn_model_path, map_location=torch.device('cuda'))
gnn_model.eval()  # Set the model to evaluation mode


  gnn_model=torch.load(gnn_model_path, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


GraphSAGE(
  (convs): ModuleList(
    (0): SAGEConv(74, 32, aggr=mean)
    (1): SAGEConv(32, 32, aggr=mean)
  )
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

In [5]:
# Evaluate the model on transformed data
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dummy_x = torch.rand(4, 74, dtype=torch.float32) 
dummy_x = dummy_x.to(device)

dummy_edge_index = torch.tensor([[0, 1, 2, 3],  # Example edges
                               [1, 2, 3, 2]], dtype=torch.int64).to(device)

In [6]:
onnx_file_path = 'model_repository/model/1/gnn_model.onnx'

torch.onnx.export(
    gnn_model,  # The model to export
    (dummy_x, dummy_edge_index, True),  # Input data (node features, edge indices, hidden=True)
    onnx_file_path,  # Path where the ONNX model will be saved
    export_params=True,  # Export model parameters (weights)
    opset_version=11,  # ONNX opset version (adjust as needed, 11 is generally fine)
    input_names=['l_x_', 'l_edge_index_'], 
    output_names=['output'],  
    dynamic_axes={  
        'l_x_': {0: 'batch_size'},  # Allow the first dimension of 'l_x_' to be dynamic (batch size)
        'l_edge_index_': {0: 'num_edges',  1: 'num_edges'},  # Allow the first and second dimension of 'l_edge_index_' to be dynamic (number of edges)
    },
    verbose=True  
)

Exported graph: graph(%l_x_ : Float(*, 74, strides=[74, 1], requires_grad=0, device=cuda:0),
      %l_edge_index_ : Long(*, *, strides=[4, 1], requires_grad=0, device=cuda:0),
      %convs.0.lin_l.weight : Float(32, 74, strides=[74, 1], requires_grad=1, device=cuda:0),
      %convs.0.lin_l.bias : Float(32, strides=[1], requires_grad=1, device=cuda:0),
      %convs.1.lin_l.weight : Float(32, 32, strides=[32, 1], requires_grad=1, device=cuda:0),
      %convs.1.lin_l.bias : Float(32, strides=[1], requires_grad=1, device=cuda:0),
      %onnx::MatMul_113 : Float(74, 32, strides=[1, 74], requires_grad=0, device=cuda:0),
      %onnx::MatMul_116 : Float(32, 32, strides=[1, 32], requires_grad=0, device=cuda:0)):
  %/convs.0/Constant_output_0 : Long(device=cpu) = onnx::Constant[value={0}, onnx_name="/convs.0/Constant"](), scope: __main__.GraphSAGE::/torch_geometric.nn.conv.sage_conv.SAGEConv::convs.0
  %/convs.0/Constant_1_output_0 : Long(device=cpu) = onnx::Constant[value={1}, onnx_name="/convs

  if return_hidden:


## Deploying and Starting the Triton Server

Before we deploy models to Triton Inference server, make sure to create a model_repository folder in the following structure:

```
gnn_model/ 
    ├── 1/ 
        │ └── gnn_model.onnx # ONNX model file for GNN 
    └── config.pbtxt # Triton configuration file for GNN model
xgboost/ 
    ├── 1/ 
        │ └── xgboost.json # Model file for XGBoost (in JSON format) └
    ── config.pbtxt # Triton configuration file for XGBoost model


```

Explore more about Trition on GitHub repo [here](https://github.com/triton-inference-server)

In [22]:
import os

In [23]:
# Create the model repository directory. The name of this directory is arbitrary.
REPO_PATH = os.path.abspath('model_repository')
os.makedirs(REPO_PATH, exist_ok=True)

In [24]:
TRITON_IMAGE = 'nvcr.io/nvidia/tritonserver:24.10-py3'

In [25]:
!docker pull {TRITON_IMAGE}


24.10-py3: Pulling from nvidia/tritonserver
Digest: sha256:48f5247728bbcf290bea0dbdf9ccc9d803f33fc472307167ce612ae9bab9b870
Status: Image is up to date for nvcr.io/nvidia/tritonserver:24.10-py3
nvcr.io/nvidia/tritonserver:24.10-py3


In [37]:
!docker stop tritonserver
!docker rm tritonserver

tritonserver
tritonserver


In [38]:
!docker run --gpus all -d -p 8000:8000 -p 8001:8001 -p 8002:8002 -v {REPO_PATH}:/models --name tritonserver {TRITON_IMAGE} tritonserver --model-repository=/models


61e77d2b258ec5b53342fcce049e82b1d6cc869d0de4ec79b60d8d4ca1a39209


In [39]:
import time
import tritonclient.grpc as triton_grpc
import tritonclient.http as httpclient

from tritonclient import utils as triton_utils
HOST = 'localhost'
PORT = 8000
TIMEOUT = 60

In [40]:
client_grpc = triton_grpc.InferenceServerClient(url=f'{HOST}:{8001}')
client_http = httpclient.InferenceServerClient(url=f'{HOST}:{PORT}')


In [41]:
# Wait for server to come online
server_start = time.time()
while True:
    try:
        if client_grpc.is_server_ready() or time.time() - server_start > TIMEOUT:
            break
    except triton_utils.InferenceServerException:
        pass
    time.sleep(1)



In [42]:
!docker logs tritonserver



== Triton Inference Server ==

NVIDIA Release 24.10 (build 117849258)
Triton Server Version 2.51.0

Copyright (c) 2018-2024, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.

Various files include modifications (c) NVIDIA CORPORATION & AFFILIATES.  All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

  Using driver version 555.58.02 which has support for CUDA 12.5.  This container
  was built with CUDA 12.6 and will be run in Minor Version Compatibility mode.
  CUDA Forward Compatibility is preferred over Minor Version Compatibility for use
  with this container but was unavailable:
  [[System has unsupported display driver / cuda driver combination (CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) cuInit()=803]]
  See https://docs.nvidia.com/deploy/cuda-compatibility

## Loading and Preprocessing Test Data

We are loading the test dataset created in the data preprocessing notebook, applying transformations using the preprocessor.pkl file, and passing the test dataset to the GNN ONNX model deployed in Triton for node embeddings.

In [43]:
#Read untransformed data
dataset_base_path = '../data/TabFormer'
pd.set_option('future.no_silent_downcasting', True)    
untransformed_df = pd.read_csv('../data/TabFormer/xgb/untransformed_test.csv')
untransformed_df.head(5)

Unnamed: 0,Amount,Errors,Card,Chip,City,Zip,MCC,Merchant,Fraud
0,488.0,XX,6149,Chip Transaction,Rome,0.0,3684,-7807051024009846392,1
1,14.22,XX,6149,Chip Transaction,Rome,0.0,5311,9057735476014445185,1
2,13.79,XX,6149,Chip Transaction,Rome,0.0,4214,6098563624419731578,1
3,12.27,XX,6149,Chip Transaction,Rome,0.0,5812,7069584154815291371,1
4,38.63,XX,6149,Chip Transaction,Rome,0.0,5921,3017176960763408508,1


In [44]:
with open(os.path.join(dataset_base_path, 'preprocessor.pkl'),'rb') as f:
    loaded_transformer = pickle.load(f)
    transformed_data = loaded_transformer.transform(untransformed_df.loc[:, untransformed_df.columns[:-1]])

In [45]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Convert data to torch tensors
X = torch.tensor(transformed_data).to(torch.float32).to(device)
y = torch.tensor(untransformed_df[untransformed_df.columns[-1]].values ).to(torch.long).to(device)
edge_index = torch.tensor([[], []], dtype=torch.int64)


## Submitting inference requests to models deployed in Triton

- Create inputs for the ONNX model.
- Send an inference request to the ONNX model.
- Send the output of the ONNX model as an input to the XGBoost model for classification.

In [46]:
#converting data to numpy format for inference
X_numpy = X.detach().cpu().numpy()  # Converting to NumPy if it's a PyTorch tensor
edge_index_numpy = edge_index.detach().cpu().numpy() 

inputs_features = httpclient.InferInput("l_x_", X_numpy.shape, datatype="FP32")
inputs_features.set_data_from_numpy(X_numpy)

inputs_edges = httpclient.InferInput("l_edge_index_", edge_index_numpy.shape, datatype="INT64")
inputs_edges.set_data_from_numpy(edge_index_numpy)

outputs = httpclient.InferRequestedOutput("output")

In [47]:
# Querying the server
results = client_http.infer(model_name="model", inputs=[inputs_features, inputs_edges], outputs=[outputs])
node_embeddings = results.as_numpy('output')
# print(node_embeddings)



In [48]:
inputs_xgboost = httpclient.InferInput("input__0", node_embeddings.shape, datatype="FP32")
inputs_xgboost.set_data_from_numpy(node_embeddings)

outputs_xgboost = httpclient.InferRequestedOutput("output__0")

In [50]:
import numpy
results = client_http.infer(model_name="xgboost", inputs=[inputs_xgboost], outputs=[outputs_xgboost])
predictions = results.as_numpy('output__0')

print(predictions)
print(y)


[0.3322781  0.499941   0.47168913 ... 0.01088735 0.87886477 0.01088735]
tensor([1, 1, 1,  ..., 0, 0, 0], device='cuda:0')


In [52]:
from cuml.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import cupy as cp
from torch.utils.dlpack import to_dlpack

labels = y
pred_labels = (predictions > 0.5).astype(int)

# Move labels to CPU for evaluation
labels_cpu = labels.cpu().numpy()

# Compute evaluation metrics
accuracy = accuracy_score(labels_cpu, pred_labels)
precision = precision_score(labels_cpu, pred_labels, zero_division=0)
recall = recall_score(labels_cpu, pred_labels, zero_division=0)
f1 = f1_score(labels_cpu, pred_labels, zero_division=0)

print(f"Performance of XGBoost model trained on node embeddings")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

conf_mat = confusion_matrix(labels.cpu().numpy(), pred_labels)
print('Confusion Matrix:', conf_mat)

Performance of XGBoost model trained on node embeddings
Accuracy: 0.8766
Precision: 0.2643
Recall: 0.2928
F1 Score: 0.2778
Confusion Matrix: [[21956  1701]
 [ 1476   611]]
