In [None]:
from transformers import T5ForConditionalGeneration

# Load two independent copies of the checkpoint, one per device.
# `nn.Module.cuda()` moves the module's parameters in place and returns the
# same module object, so `t5_cuda = t5_cpu.cuda()` would leave both names
# pointing at a single CUDA-resident model and break every CPU measurement.
t5_cpu = T5ForConditionalGeneration.from_pretrained("cointegrated/rut5-base-multitask", resume_download=True).eval()
t5_cuda = T5ForConditionalGeneration.from_pretrained("cointegrated/rut5-base-multitask", resume_download=True).eval().cuda()

In [None]:
import torch
import torch.utils.benchmark as benchmark
import os

# Single thread budget reused by every CPU backend configured in this notebook.
num_threads = 16

# Publish the budget through the OpenMP and MKL environment variables ...
for _thread_env_var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS"):
    os.environ[_thread_env_var] = f"{num_threads}"

# ... and apply it to PyTorch's intra-op thread pool directly.
torch.set_num_threads(num_threads)

# Random token ids used for smoke tests: a batch of one sequence, 32 tokens
# long, with ids drawn from [0, 30000). The CUDA tensor is sampled
# independently on the CPU and then moved to the GPU.
_smoke_shape = (1, 32)
input_cpu = torch.randint(0, 30000, _smoke_shape, dtype=torch.int64)
input_cuda = torch.randint(0, 30000, _smoke_shape, dtype=torch.int64).cuda()

# Run each PyTorch model once (sampling with 4 beams, up to 20 new tokens)
# to confirm it generates on its device.
_smoke_generate_kwargs = {"do_sample": True, "num_beams": 4, "max_new_tokens": 20}
t5_cpu.generate(input_cpu, **_smoke_generate_kwargs)
t5_cuda.generate(input_cuda, **_smoke_generate_kwargs)

In [None]:
from optimum.onnxruntime import ORTModelForSeq2SeqLM
import onnxruntime as ort

# ONNX Runtime session settings for the CPU model: same thread budget as the
# PyTorch baseline, sequential operator execution, all graph optimizations on.
sess_options = ort.SessionOptions()

sess_options.intra_op_num_threads = num_threads
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# `export=True` converts the Transformers checkpoint to ONNX at load time.
t5_ort_cpu = ORTModelForSeq2SeqLM.from_pretrained("cointegrated/rut5-base-multitask",
                                                  export=True,
                                                  provider="CPUExecutionProvider",
                                                  session_options=sess_options)

# Same export path as the CPU model: `export=True` replaces the deprecated
# `from_transformers=True` spelling. No session options are passed here, so
# the CUDA session runs with ONNX Runtime's defaults.
t5_ort_cuda = ORTModelForSeq2SeqLM.from_pretrained("cointegrated/rut5-base-multitask",
                                                   export=True,
                                                   provider="CUDAExecutionProvider")

# Smoke-test both ONNX Runtime models on the device-matching input.
t5_ort_cpu.generate(input_cpu, do_sample=True, num_beams=4, max_new_tokens=20)
t5_ort_cuda.generate(input_cuda, do_sample=True, num_beams=4, max_new_tokens=20)

In [None]:
from eet.transformers.modeling_t5 import EETT5ForConditionalGeneration

# Build the EET version of the checkpoint.
# NOTE(review): the second positional argument (1) is presumably the maximum
# batch size EET allocates for — confirm against the EET documentation.
t5_eet = EETT5ForConditionalGeneration.from_pretrained(
    "cointegrated/rut5-base-multitask",
    1,
)

# Smoke test on the CUDA input; only a single new token is requested here.
t5_eet.generate(
    input_cuda,
    do_sample=True,
    num_beams=4,
    max_new_tokens=1,
)

In [None]:
import lightseq.inference as lsi

# Load the LightSeq T5 model from a local HDF5 file, which must already exist
# in the working directory.
# NOTE(review): the second argument (1) is presumably the maximum batch
# size — confirm against the LightSeq inference API.
t5_ls = lsi.T5(
    "lightseq_t5_base.hdf5",
    1,
)

In [None]:
# LightSeq is driven through `infer` with a NumPy array rather than through
# `generate` with a torch tensor, so convert the CPU token ids first.
_lightseq_smoke_input = input_cpu.numpy()
t5_ls.infer(_lightseq_smoke_input)

In [None]:
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# ONNX Runtime model running on the TensorRT execution provider.
# `export=True` converts the Transformers checkpoint to ONNX at load time; it
# replaces the deprecated `from_transformers=True` spelling and matches the
# other ONNX Runtime cells in this notebook.
t5_ort_trt = ORTModelForSeq2SeqLM.from_pretrained("cointegrated/rut5-base-multitask",
                                                  export=True,
                                                  provider="TensorrtExecutionProvider")

# Smoke test on the CUDA input.
t5_ort_trt.generate(input_cuda, do_sample=True, num_beams=4, max_new_tokens=20)

In [None]:
from optimum.onnxruntime import ORTModelForSeq2SeqLM
import onnxruntime as ort

# Session settings for ONNX Runtime with the OpenVINO execution provider.
# ONNX Runtime's own graph optimizations are switched off; see the OpenVINO EP
# documentation for the rationale:
# https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#other-configuration-settings
options = ort.SessionOptions()
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
options.intra_op_num_threads = num_threads

# The thread budget is also handed to the provider itself via provider_options.
t5_ort_openvino = ORTModelForSeq2SeqLM.from_pretrained(
    "cointegrated/rut5-base-multitask",
    export=True,
    provider="OpenVINOExecutionProvider",
    session_options=options,
    provider_options={"num_of_threads": num_threads},
)

# Smoke test on the CPU input.
t5_ort_openvino.generate(
    input_cpu,
    do_sample=True,
    num_beams=4,
    max_new_tokens=20,
)

In [None]:
from optimum.intel.openvino import OVModelForSeq2SeqLM

# Optimum-Intel OpenVINO model: exported from the Transformers checkpoint,
# with the decoder cache enabled, compiled at load time, and limited to the
# notebook's shared thread budget.
t5_openvino = OVModelForSeq2SeqLM.from_pretrained(
    "cointegrated/rut5-base-multitask",
    export=True,
    use_cache=True,
    compile=True,
    ov_config={"INFERENCE_NUM_THREADS": num_threads},
)

# Smoke test on the CPU input.
# NOTE(review): this call uses num_beams=2 while the other smoke tests and the
# benchmarks use num_beams=4 — confirm whether that is intentional.
t5_openvino.generate(
    input_cpu,
    do_sample=True,
    num_beams=2,
    max_new_tokens=20,
)

In [None]:
# For CPU benchmark.

from tqdm.auto import tqdm
from timeit import default_timer
import os

# Number of timed generate() calls per (backend, sequence length) pair.
num_runs = 100

# Mean wall-clock seconds per generate() call, keyed by
# (backend description, sequence length). Filled in as the benchmark runs so
# the numbers can be tabulated or plotted afterwards instead of only printed.
results = {}
for seq_len in [8, 32, 64]:
    print(f"Sequence length: {seq_len}")
    # One random batch per sequence length, shared by every CPU backend.
    input_cpu = torch.randint(high=30000, size=(1, seq_len), dtype=torch.int64)
    for model, description in zip([t5_cpu, t5_ort_cpu, t5_ort_openvino, t5_openvino],
                                  ["PyTorch (CPU)", "ORT (CPU)", "ORT (OpenVINO)", "Optimum OpenVINO"]):
        # Untimed warmup call so one-off setup cost is not measured.
        model.generate(input_cpu, do_sample=True, num_beams=4, max_new_tokens=1)  # warmup
        start_time = default_timer()
        for _ in range(num_runs):
            model.generate(input_cpu, do_sample=True, num_beams=4, max_new_tokens=1)
        mean_seconds = (default_timer() - start_time) / num_runs
        results[(description, seq_len)] = mean_seconds
        print(f"{description}\t{mean_seconds}")

In [None]:
# For GPU benchmark.

from tqdm.auto import tqdm
from timeit import default_timer
import os

# Number of timed calls per (backend, sequence length) pair.
num_runs = 100

# Mean wall-clock seconds per call, keyed by (backend description, sequence
# length). Filled in as the benchmark runs so the numbers can be tabulated or
# plotted afterwards instead of only printed.
results = {}
for seq_len in [8, 32, 64]:
    print(f"Sequence length: {seq_len}")
    input_cpu = torch.randint(high=30000, size=(1, seq_len), dtype=torch.int64)
    # Use the same token ids on both devices so LightSeq (fed from the CPU
    # tensor) is timed on exactly the input the other backends receive.
    input_cuda = input_cpu.cuda()
    # LightSeq takes a NumPy array; convert once, outside the timed loop.
    lightseq_input = input_cpu.numpy()
    # ORT (CUDA) (`t5_ort_cuda`) is currently left out of this comparison;
    # add `(t5_ort_cuda, "ORT (CUDA)")` to the list below to include it.
    for model, description in [(t5_cuda, "PyTorch (CUDA)"),
                               (t5_eet, "EET"),
                               (t5_ls, "LightSeq")]:
        # Untimed warmup call so one-off setup cost is not measured.
        if description == "LightSeq":
            model.infer(lightseq_input)
        else:
            model.generate(input_cuda, do_sample=True, num_beams=4, max_new_tokens=1)  # warmup
        # Drain pending GPU work before starting the clock ...
        torch.cuda.synchronize()
        start_time = default_timer()
        for _ in range(num_runs):
            if description == "LightSeq":
                model.infer(lightseq_input)
            else:
                model.generate(input_cuda, do_sample=True, num_beams=4, max_new_tokens=1)
        # ... and again before stopping it, so queued kernels are counted.
        torch.cuda.synchronize()
        mean_seconds = (default_timer() - start_time) / num_runs
        results[(description, seq_len)] = mean_seconds
        print(f"{description}\t{mean_seconds}")

In [None]:
# Name the model explicitly instead of relying on `model`, a loop variable
# leaked from whichever benchmark cell ran last. After the GPU benchmark it is
# the LightSeq object, which that benchmark drives through `infer`, not
# `generate`.
# NOTE(review): `t5_cuda` is assumed to be the intended model for this
# CUDA-input check — confirm.
t5_cuda.generate(input_cuda, do_sample=True, num_beams=4, max_new_tokens=1)