In [1]:
import tensorrt as trt
import os
import torch
from torch.nn import functional as F

import pycuda.autoinit
import pycuda.driver as cuda
import numpy as np

from transformers import BertTokenizer, BertForMaskedLM

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [3]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)

[02/17/2024-21:02:33] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars


In [4]:
runtime = trt.Runtime(TRT_LOGGER)
with open('./engine.trt', 'rb') as f:
    engine_bytes = f.read()
    engine = runtime.deserialize_cuda_engine(engine_bytes)

In [5]:
# Create execution context as shown below
bert_context = engine.create_execution_context()


[02/17/2024-21:02:39] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars


In [18]:
'''
inputs
text = "The capital of France, " + tokenizer.mask_token + ", contains the Eiffel Tower."
    # input_ids are the indices corresponding to each token in the sentence.
    # attention_mask indicates whether a token should be attended to or not.
    # token_type_ids identifies which sequence a token belongs to when there is more than one sequence.
'''
# input_ids = numpy array ( size: batch X seq_len) ex: (1 X 30 )
# token_type_ids = numpy array ( size: batch X seq_len) ex: (1 X 30 )
# attention_mask = numpy array ( size: batch X seq_len) ex: (1 X 30 )
BERT_PATH = 'bert-base-uncased'
npz_file = BERT_PATH + '/case_data.npz'
data = np.load(npz_file)
input_ids = data['input_ids']
token_type_ids = data['token_type_ids']
position_ids = data['position_ids']
attention_mask = data['attention_mask']
print(data['input_ids'])

tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

'''
outputs
'''
bert_output = torch.zeros((1, 16),device=device).cpu().detach().numpy()

[[  101  1996  3007  1997  2605  1010   103  1010  3397  1996  1041 13355
   2884  3578  1012   102]]


In [12]:
# Allocate memory for the inputs and outputs in GPU
batch_size = 1

'''
memory allocation for inputs
'''
d_input_ids = cuda.mem_alloc(batch_size * input_ids.nbytes)
d_token_type_ids = cuda.mem_alloc(batch_size * token_type_ids.nbytes)
d_attention_mask = cuda.mem_alloc(batch_size * attention_mask.nbytes)

'''
memory allocation for outputs
'''
d_output = cuda.mem_alloc(batch_size * bert_output.nbytes)



In [13]:
# Create bindings array
bindings = [int(d_input_ids), int(d_token_type_ids), int(d_attention_mask), int(d_output)]

# Create stream and transfer inputs to GPU (can be sync or async ). ‘async ’ shown here.
stream = cuda.Stream()# Transfer input data from python buffers to device(GPU)
cuda.memcpy_htod_async(d_input_ids, input_ids, stream)
cuda.memcpy_htod_async(d_token_type_ids, token_type_ids, stream)
cuda.memcpy_htod_async(d_attention_mask, attention_mask, stream)

# Execute using the engine
bert_context.execute_async(batch_size, bindings, stream.handle, None)


[02/17/2024-21:19:35] [TRT] [W] The enqueue() method has been deprecated when used with engines built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. Please use enqueueV2() instead.
[02/17/2024-21:19:35] [TRT] [W] Also, the batchSize argument passed into this function has no effect on changing the input shapes. Please use setBindingDimensions() function to change input shapes instead.


  bert_context.execute_async(batch_size, bindings, stream.handle, None)


True

In [14]:
# Transfer output back from GPU to python buffer variable
cuda.memcpy_dtoh_async(bert_output, d_output, stream)
stream.synchronize()

# Now the bert_output variable in which we stored zeros will have the prediction.


In [32]:
# Run softmax and get the most probable class
pred = torch.tensor(bert_output)
pred_output_softmax = F.softmax(pred, dim=-1)
mask_index = np.where(input_ids[0] == tokenizer.mask_token_id)[0][0]
mask_word = pred_output_softmax[0, mask_index, :]
top_10 = torch.topk(mask_word, 10, dim=1)[1][0]
print("model test topk10 output:")
for token in top_10:
    word = tokenizer.decode([token])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)
_, predicted = torch.max(pred_output_softmax, 1)


IndexError: too many indices for tensor of dimension 2

In [34]:
pred_output_softmax

tensor([[0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625,
         0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625]])

In [35]:
mask_index

6