In [77]:
import pandas as pd
from datasets import load_dataset
import sys

data = load_dataset("Sansh2003/subtask-b-examples-test")
data = data['test']
data = pd.DataFrame(data)

In [2]:
data.head()

Unnamed: 0,text,label,model,id
0,Decision-making is one of life's most difficul...,4,bloomz,0
1,"When you think of voting for president, do you...",0,human,1
2,Driverless Cars - Limitations & Potential Draw...,4,bloomz,2
3,There are multiple benefits to attending high ...,5,dolly,3
4,"The Electoral College, also known as the Presi...",5,dolly,4


In [3]:
MODEL_PATH = "Sansh2003/roberta-large-merged-subtaskB"
id2label = {0: 'human', 1: 'chatGPT', 2: 'cohere', 3: 'davinci', 4: 'bloomz', 5: 'dolly'}
label2id = {'human': 0, 'chatGPT': 1,'cohere': 2, 'davinci': 3, 'bloomz': 4, 'dolly': 5}

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels = len(id2label), id2label=id2label, label2id=label2id)

model.eval()
model.to(device)



RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [55]:
output_dir = os.path.join(".", "onnx_models")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)   
export_model_path = os.path.join(output_dir, 'roberta-large-subtaskB-onnx.onnx')

import torch
device = torch.device("cpu")

def preprocess_single_text(text, tokenizer):
    inputs = tokenizer(text, truncation=True, max_length=512, padding=True, return_tensors="pt")
    return inputs

# Get the first example data to run the model and export it to ONNX
x = preprocess_single_text(data.iloc[0]['text'], tokenizer)
inputs = {
    'input_ids':      x['input_ids'].to(device),
    'attention_mask': x['attention_mask'].to(device),
}

# Set model to inference mode, which is required before exporting the model because some operators behave differently in 
# inference and training mode.
model.eval()
model.to(device)

if not os.path.exists(export_model_path):
    with torch.no_grad():
        symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
        torch.onnx.export(model,                                            # model being run
                          args=tuple(inputs.values()),                      # model input (or a tuple for multiple inputs)
                          f=export_model_path,                              # where to save the model (can be a file or file-like object)
                          opset_version=11,                                 # the ONNX version to export the model to
                          do_constant_folding=True,                         # whether to execute constant folding for optimization
                          input_names=['input_ids',                         # the model's input names
                                       'input_mask'],
                          output_names=['class'],                           # the model's output names
                          dynamic_axes={'input_ids': symbolic_names,        # variable length axes
                                        'input_mask' : symbolic_names,
                                        'start' : symbolic_names})
        print("Model exported at ", export_model_path)



Model exported at  ./onnx_models/roberta-large-subtaskB-onnx.onnx


In [56]:
import time

# Measure the latency. It is not accurate using Jupyter Notebook, it is recommended to use standalone python script.
latency = []

x = preprocess_single_text(data.iloc[i]['text'], tokenizer)
total_samples = 100
with torch.no_grad():
    for i in range(total_samples):
        x = preprocess_single_text(data.iloc[i]['text'], tokenizer)
        inputs = {
            'input_ids':      x['input_ids'].to(device),
            'attention_mask': x['attention_mask'].to(device),
        }
        start = time.time()
        outputs = model(**inputs)
        latency.append(time.time() - start)
print("PyTorch {} Inference time = {} ms".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))

PyTorch cpu Inference time = 654.46 ms


In [68]:
import onnxruntime
import numpy

sess_options = onnxruntime.SessionOptions()
sess_options.optimized_model_filepath = os.path.join(output_dir, "optimized_model_cpu.onnx")
sess_options.intra_op_num_threads = 4  # Number of threads for intra-op parallelism
sess_options.inter_op_num_threads = 1  # Number of threads for inter-op parallelism

# Specify providers when you use onnxruntime-gpu for CPU inference.
session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])

latency = []
for i in range(total_samples):
    x = preprocess_single_text(data.iloc[i]['text'], tokenizer)
    inputs = {
        'input_ids':      x['input_ids'].to(device).numpy(),
        'input_mask': x['attention_mask'].to(device).numpy(),
    }
    start = time.time()
    ort_outputs = session.run(None, inputs)
    latency.append(time.time() - start)
print("OnnxRuntime cpu Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))

2025-01-16 20:42:49.499093083 [W:onnxruntime:, inference_session.cc:1732 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.


OnnxRuntime cpu Inference time = 673.88 ms


In [71]:
print("***** Verifying correctness *****")
for i in range(2):
    print('PyTorch and ONNX Runtime output {} are close:'.format(i), numpy.allclose(ort_outputs[0][0][i], outputs[0][0][i].cpu(), rtol=1e-05, atol=1e-04))

***** Verifying correctness *****
PyTorch and ONNX Runtime output 0 are close: True
PyTorch and ONNX Runtime output 1 are close: True


In [105]:
logits = ort_outputs[0]

probs = np.exp(logits.squeeze()) / np.sum(np.exp(logits.squeeze()), axis=-1)  # Apply softmax to get probabilities
prob_dict = {}

for i, k in enumerate(label2id.keys()):
    prob_dict[k] = probs[i]

prob_dict = {k: float(v) for k, v in sorted(prob_dict.items(), key=lambda item: item[1], reverse=True)}

print(prob_dict)

{'davinci': 0.99892258644104, 'chatGPT': 0.0010614646598696709, 'dolly': 7.130848644010257e-06, 'human': 4.238926976540824e-06, 'cohere': 3.654642341643921e-06, 'bloomz': 9.734809509609477e-07}


### GPU:

In [75]:
output_dir = os.path.join(".", "onnx_models")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)   
export_model_path = os.path.join(output_dir, 'roberta-large-subtaskB-onnx-gpu.onnx')

import torch
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
print(device)

# Get the first example data to run the model and export it to ONNX
x = preprocess_single_text(data.iloc[0]['text'], tokenizer)
inputs = {
    'input_ids':      x['input_ids'].to(device),
    'input_mask': x['attention_mask'].to(device),
}


# Set model to inference mode, which is required before exporting the model because some operators behave differently in 
# inference and training mode.
model.eval()
model.to(device)

if not os.path.exists(export_model_path):
    with torch.no_grad():
        symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
        torch.onnx.export(model,                                            # model being run
                          args=tuple(inputs.values()),                      # model input (or a tuple for multiple inputs)
                          f=export_model_path,                              # where to save the model (can be a file or file-like object)
                          opset_version=11,                      # the ONNX version to export the model to
                          do_constant_folding=True,                         # whether to execute constant folding for optimization
                          input_names=['input_ids',                         # the model's input names
                                       'input_mask'],
                          output_names=['class'],                           # the model's output names
                          dynamic_axes={'input_ids': symbolic_names,        # variable length axes
                                        'input_mask' : symbolic_names,
                                        'class' : symbolic_names,
                                        })
        print("Model exported at ", export_model_path)

cuda
Model exported at  ./onnx_models/roberta-large-subtaskB-onnx-gpu.onnx


In [81]:
export_model_path

'./onnx_models/roberta-large-subtaskB-onnx-gpu.onnx'

In [82]:
optimized_fp16_model_path

'./onnx_models/roberta-large-subtaskB-onnx_gpu_fp16.onnx'

In [84]:
optimized_fp16_model_path = './onnx_models/roberta-large-subtaskB-onnx_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')
!python -m onnxruntime.transformers.optimizer --input './onnx_models/roberta-large-subtaskB-onnx-gpu.onnx' --output './onnx_models/roberta-large-subtaskB-onnx_gpu_fp16.onnx' --float16 --use_gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2025-01-16 21:42:44.838328055 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 31463, index: 0, mask: {4, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2025-01-16 21:42:44.838333611 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 31467, index: 4, mask: {6, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2025-01-16 21:42:44.838338207 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 31466, index: 3, mask: {10, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2025-01-16 21:42:44.838335196 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 31464, index: 1, mask: {8, }, error code: 22 error msg: Invalid argument. Specify the number o