<a href="https://colab.research.google.com/github/remzicam/xs_blenderbot_onnx/blob/main/inference_speed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/remzicam/xs_blenderbot_onnx
%cd xs_blenderbot_onnx
!pip install -r requirements.txt

In [2]:
from time import perf_counter

In [3]:
# Passed as max_length to both pipelines so they generate under the same cap.
max_answer_length = 100
# Prompt fed to each pipeline when measuring latency.
utterance = "Hello, how are you?"

In [4]:
# Start the timer before the import below so the load time reported at the
# end of this cell also covers importing transformers and loading the model.
hf_load_start=perf_counter()
from transformers import (BatchEncoding,
                          BlenderbotSmallForConditionalGeneration,
                          BlenderbotSmallTokenizer)

class HFTextGenerationPipeline:
    """Text generation pipeline for the BlenderbotSmall 90M checkpoint.

    The tokenizer and the model are class attributes: they are loaded once,
    when this class statement executes, and are shared by every instance.

    Calling an instance with a string tokenizes it, runs ``model.generate``
    and returns the decoded reply as a string.
    """

    # load tokenizer and the model
    model_name = "facebook/blenderbot_small-90M"
    tokenizer = BlenderbotSmallTokenizer.from_pretrained(model_name)
    model = BlenderbotSmallForConditionalGeneration.from_pretrained(model_name)

    def __init__(self, **kwargs):
        """Specify text generation parameters.
        For example: max_length=100 which generates text shorter than
        100 tokens. Visit:
        https://huggingface.co/docs/transformers/main_classes/text_generation
        for more parameters
        Args:
            **kwargs: keyword arguments forwarded to ``model.generate``
        """
        # The options are stored directly as instance attributes; __call__
        # forwards *every* instance attribute to ``generate``, so do not set
        # unrelated attributes on an instance.
        vars(self).update(kwargs)

    def preprocess(self, text: str) -> "BatchEncoding":
        """Tokenizes input text.
        Args:
            text (str): user specified text
        Returns:
            BatchEncoding: tokenizer output holding PyTorch tensors
                (requested with ``return_tensors="pt"``)
        """
        return self.tokenizer(text, return_tensors="pt")

    def postprocess(self, outputs) -> str:
        """Converts generated token ids into text.
        Only the first sequence of ``outputs`` is decoded, and special
        tokens are dropped.
        Args:
            outputs: value returned by ``model.generate``
        Returns:
            str: generated text
        """
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def __call__(self, text: str) -> str:
        """Generates text from input text.
        Args:
            text (str): user specified text
        Returns:
            str: generated text
        """
        encoded = self.preprocess(text)
        # Tokenizer fields and the stored generation options are both
        # unpacked as keyword arguments of ``generate``.
        generated = self.model.generate(**encoded, **vars(self))
        return self.postprocess(generated)

# Build the pipeline, passing the shared answer-length cap as max_length.
hf_pipe= HFTextGenerationPipeline(max_length=max_answer_length)
# Seconds elapsed since hf_load_start; as the cell's last expression it is
# displayed as the cell output.
perf_counter()-hf_load_start

Downloading:   0%|          | 0.00/964k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/345k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/205 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350M [00:00<?, ?B/s]

27.402704199

In [5]:
# Start the timer before the import so the measured load time includes
# importing blender_model as well as constructing the pipeline.
onnx_load_start=perf_counter()
from blender_model import TextGenerationPipeline
# Same max_length as the Hugging Face pipeline, so both run under one cap.
onnx_pipe = TextGenerationPipeline(max_length=max_answer_length)
# Seconds elapsed since onnx_load_start; displayed as the cell output.
perf_counter()-onnx_load_start


Downloading:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/61.4M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.6M [00:00<?, ?B/s]

16.96621161799999

In [6]:
def latency_calculator(pipe, text):
    """Time a single ``pipe(text)`` call.

    Args:
        pipe: callable invoked once with ``text``; its result is discarded
        text: argument handed to ``pipe``
    Returns:
        float: seconds elapsed according to ``perf_counter``
    """
    began = perf_counter()
    pipe(text)
    finished = perf_counter()
    return finished - began

In [7]:
# Seconds for one generation with the Hugging Face pipeline on `utterance`.
latency_calculator(hf_pipe, utterance)

3.4276989420000064

In [8]:
# Seconds for one generation with the blender_model pipeline (presumably
# ONNX-backed, going by its name) on the same `utterance`.
latency_calculator(onnx_pipe, utterance)

1.1976213750000113